   1/*
   2 * Copyright (c) 2006, Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * You should have received a copy of the GNU General Public License along with
  14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  15 * Place - Suite 330, Boston, MA 02111-1307 USA.
  16 *
  17 * Copyright (C) 2006-2008 Intel Corporation
  18 * Author: Ashok Raj <ashok.raj@intel.com>
  19 * Author: Shaohua Li <shaohua.li@intel.com>
  20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
  21 * Author: Fenghua Yu <fenghua.yu@intel.com>
  22 */
  23
 
 
  24#include <linux/init.h>
  25#include <linux/bitmap.h>
  26#include <linux/debugfs.h>
 
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
 
  34#include <linux/mempool.h>
 
 
  35#include <linux/timer.h>
 
  36#include <linux/iova.h>
  37#include <linux/iommu.h>
  38#include <linux/intel-iommu.h>
  39#include <linux/syscore_ops.h>
  40#include <linux/tboot.h>
  41#include <linux/dmi.h>
  42#include <linux/pci-ats.h>
  43#include <asm/cacheflush.h>
  44#include <asm/iommu.h>
  45
 
 
  46#define ROOT_SIZE		VTD_PAGE_SIZE
  47#define CONTEXT_SIZE		VTD_PAGE_SIZE
  48
  49#define IS_BRIDGE_HOST_DEVICE(pdev) \
  50			    ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
  51#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
 
  52#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  53#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  54
  55#define IOAPIC_RANGE_START	(0xfee00000)
  56#define IOAPIC_RANGE_END	(0xfeefffff)
  57#define IOVA_START_ADDR		(0x1000)
  58
  59#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
  60
  61#define MAX_AGAW_WIDTH 64
 
  62
  63#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  64#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  65
  66/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  67   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  68#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  69				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  70#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  71
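/*
 * Worked example (illustrative, not part of the driver logic): with the
 * default gaw of 48 bits and 4KiB VT-d pages (VTD_PAGE_SHIFT of 12),
 * __DOMAIN_MAX_PFN(48) is (1 << 36) - 1.  On a 64-bit kernel that fits in
 * an unsigned long, so DOMAIN_MAX_PFN(48) is 2^36 - 1; on a 32-bit kernel
 * it is clamped to 0xffffffff by the min_t() above.
 */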
  72#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  73#define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
  74#define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
  75
  76/* page table handling */
  77#define LEVEL_STRIDE		(9)
  78#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  79
  80static inline int agaw_to_level(int agaw)
  81{
  82	return agaw + 2;
  83}
  84
  85static inline int agaw_to_width(int agaw)
  86{
  87	return 30 + agaw * LEVEL_STRIDE;
  88}
  89
  90static inline int width_to_agaw(int width)
  91{
  92	return (width - 30) / LEVEL_STRIDE;
  93}
  94
  95static inline unsigned int level_to_offset_bits(int level)
  96{
  97	return (level - 1) * LEVEL_STRIDE;
  98}
  99
 100static inline int pfn_level_offset(unsigned long pfn, int level)
 101{
 102	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 103}
 104
 105static inline unsigned long level_mask(int level)
 106{
 107	return -1UL << level_to_offset_bits(level);
 108}
 109
 110static inline unsigned long level_size(int level)
 111{
 112	return 1UL << level_to_offset_bits(level);
 113}
 114
 115static inline unsigned long align_to_level(unsigned long pfn, int level)
 116{
 117	return (pfn + level_size(level) - 1) & level_mask(level);
 118}
 119
 120static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 121{
 122	return  1 << ((lvl - 1) * LEVEL_STRIDE);
 123}
 124
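/*
 * Worked example (illustrative, values are hypothetical): with the default
 * guest address width of 48 bits, width_to_agaw(48) == (48 - 30) / 9 == 2
 * and agaw_to_level(2) == 4, so the domain uses a 4-level page table.  The
 * per-level table indices for a DMA pfn come from pfn_level_offset():
 *
 *	pfn_level_offset(0x12345, 1) == 0x12345 & 0x1ff         == 0x145
 *	pfn_level_offset(0x12345, 2) == (0x12345 >> 9)  & 0x1ff == 0x091
 *	pfn_level_offset(0x12345, 3) == (0x12345 >> 18) & 0x1ff == 0x000
 */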
 125/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 126   are never going to work. */
 127static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 128{
 129	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 130}
 131
 132static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 133{
 134	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 135}
 136static inline unsigned long page_to_dma_pfn(struct page *pg)
 137{
 138	return mm_to_dma_pfn(page_to_pfn(pg));
 139}
 140static inline unsigned long virt_to_dma_pfn(void *p)
 141{
 142	return page_to_dma_pfn(virt_to_page(p));
 143}
 144
 145/* global iommu list, set NULL for ignored DMAR units */
 146static struct intel_iommu **g_iommus;
 147
 148static void __init check_tylersburg_isoch(void);
 149static int rwbf_quirk;
 150
 151/*
 152 * set to 1 to panic the kernel if VT-d can't be successfully enabled
 153 * (used when the kernel is launched with TXT)
 154 */
 155static int force_on = 0;
 
 156
 157/*
 158 * 0: Present
 159 * 1-11: Reserved
 160 * 12-63: Context Ptr (12 - (haw-1))
 161 * 64-127: Reserved
 162 */
 163struct root_entry {
 164	u64	val;
 165	u64	rsvd1;
 166};
 167#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 168static inline bool root_present(struct root_entry *root)
 169{
 170	return (root->val & 1);
 171}
 172static inline void set_root_present(struct root_entry *root)
 173{
 174	root->val |= 1;
 175}
 176static inline void set_root_value(struct root_entry *root, unsigned long value)
 177{
 178	root->val |= value & VTD_PAGE_MASK;
 
 
 
 179}
 180
 181static inline struct context_entry *
 182get_context_addr_from_root(struct root_entry *root)
 
 
 
 183{
 184	return (struct context_entry *)
 185		(root_present(root)?phys_to_virt(
 186		root->val & VTD_PAGE_MASK) :
 187		NULL);
 188}
 189
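/*
 * Illustrative example: per the layout comment above, bit 0 of a root entry
 * is the present bit and bits 12-63 hold the context-table pointer.  For a
 * (hypothetical) context table at physical address 0x12340000:
 *
 *	set_root_value(root, 0x12340000);
 *	set_root_present(root);
 *
 * leaves root->val == 0x12340001, and get_context_addr_from_root() then
 * returns phys_to_virt(0x12340000).
 */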
 
 
 190/*
 191 * low 64 bits:
 192 * 0: present
 193 * 1: fault processing disable
 194 * 2-3: translation type
 195 * 12-63: address space root
 196 * high 64 bits:
 197 * 0-2: address width
 198 * 3-6: aval
 199 * 8-23: domain id
 200 */
 201struct context_entry {
 202	u64 lo;
 203	u64 hi;
 204};
 205
 206static inline bool context_present(struct context_entry *context)
 207{
 208	return (context->lo & 1);
 209}
 210static inline void context_set_present(struct context_entry *context)
 211{
 212	context->lo |= 1;
 213}
 214
 215static inline void context_set_fault_enable(struct context_entry *context)
 216{
 217	context->lo &= (((u64)-1) << 2) | 1;
 218}
 219
 220static inline void context_set_translation_type(struct context_entry *context,
 221						unsigned long value)
 222{
 223	context->lo &= (((u64)-1) << 4) | 3;
 224	context->lo |= (value & 3) << 2;
 225}
 226
 227static inline void context_set_address_root(struct context_entry *context,
 228					    unsigned long value)
 229{
 
 230	context->lo |= value & VTD_PAGE_MASK;
 231}
 232
 233static inline void context_set_address_width(struct context_entry *context,
 234					     unsigned long value)
 235{
 236	context->hi |= value & 7;
 237}
 238
 239static inline void context_set_domain_id(struct context_entry *context,
 240					 unsigned long value)
 241{
 242	context->hi |= (value & ((1 << 16) - 1)) << 8;
 243}
 244
 245static inline void context_clear_entry(struct context_entry *context)
 246{
 247	context->lo = 0;
 248	context->hi = 0;
 249}
 250
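/*
 * Illustrative use of the context-entry helpers above (this mirrors what
 * domain_context_mapping_one() does later in this file):
 *
 *	context_set_domain_id(context, id);
 *	context_set_address_root(context, virt_to_phys(pgd));
 *	context_set_address_width(context, iommu->agaw);
 *	context_set_translation_type(context, translation);
 *	context_set_fault_enable(context);
 *	context_set_present(context);
 *
 * With a (hypothetical) id of 42 and agaw of 2, the high word ends up as
 * (42 << 8) | 2.
 */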
 251/*
 252 * 0: readable
 253 * 1: writable
 254 * 2-6: reserved
 255 * 7: super page
 256 * 8-10: available
 257 * 11: snoop behavior
 258 * 12-63: Host physical address
 259 */
 260struct dma_pte {
 261	u64 val;
 262};
 263
 264static inline void dma_clear_pte(struct dma_pte *pte)
 265{
 266	pte->val = 0;
 267}
 268
 269static inline void dma_set_pte_readable(struct dma_pte *pte)
 270{
 271	pte->val |= DMA_PTE_READ;
 272}
 273
 274static inline void dma_set_pte_writable(struct dma_pte *pte)
 275{
 276	pte->val |= DMA_PTE_WRITE;
 277}
 278
 279static inline void dma_set_pte_snp(struct dma_pte *pte)
 280{
 281	pte->val |= DMA_PTE_SNP;
 282}
 283
 284static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
 285{
 286	pte->val = (pte->val & ~3) | (prot & 3);
 287}
 288
 289static inline u64 dma_pte_addr(struct dma_pte *pte)
 290{
 291#ifdef CONFIG_64BIT
 292	return pte->val & VTD_PAGE_MASK;
 293#else
 294	/* Must have a full atomic 64-bit read */
 295	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 296#endif
 297}
 298
 299static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
 300{
 301	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
 302}
 303
 304static inline bool dma_pte_present(struct dma_pte *pte)
 305{
 306	return (pte->val & 3) != 0;
 307}
 308
 309static inline bool dma_pte_superpage(struct dma_pte *pte)
 310{
 311	return (pte->val & (1 << 7));
 312}
 313
 314static inline int first_pte_in_page(struct dma_pte *pte)
 315{
 316	return !((unsigned long)pte & ~VTD_PAGE_MASK);
 317}
 318
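/*
 * Illustrative example: the low two bits of a PTE are the read/write
 * permission bits (dma_pte_present() tests pte->val & 3) and bits 12-63
 * hold the host page frame.  Mapping a (hypothetical) host pfn of 0x1000
 * read/write:
 *
 *	dma_clear_pte(pte);
 *	dma_set_pte_pfn(pte, 0x1000);
 *	dma_set_pte_readable(pte);
 *	dma_set_pte_writable(pte);
 *
 * after which dma_pte_addr(pte) returns 0x1000000 and dma_pte_present(pte)
 * is true.
 */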
 319/*
 320 * This domain is a static identity mapping domain.
 321 *	1. This domain creates a static 1:1 mapping to all usable memory.
 322 * 	2. It maps to each iommu if successful.
 323 *	3. Each iommu maps to this domain if successful.
 324 */
 325static struct dmar_domain *si_domain;
 326static int hw_pass_through = 1;
 327
 328/* devices under the same p2p bridge are owned in one domain */
 329#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
 330
 331/* domain represents a virtual machine; more than one device
 332 * across iommus may be owned by one domain, e.g. a kvm guest.
 333 */
 334#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
 335
 336/* si_domain contains multiple devices */
 337#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
 338
 339struct dmar_domain {
 340	int	id;			/* domain id */
 341	int	nid;			/* node id */
 342	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/
 343
 344	struct list_head devices; 	/* all devices' list */
 345	struct iova_domain iovad;	/* iova's that belong to this domain */
 346
 347	struct dma_pte	*pgd;		/* virtual address */
 348	int		gaw;		/* max guest address width */
 349
 350	/* adjusted guest address width, 0 is level 2 30-bit */
 351	int		agaw;
 352
 353	int		flags;		/* flags to find out type of domain */
 354
 355	int		iommu_coherency;/* indicate coherency of iommu access */
 356	int		iommu_snooping; /* indicate snooping control feature*/
 357	int		iommu_count;	/* reference count of iommu */
 358	int		iommu_superpage;/* Level of superpages supported:
 359					   0 == 4KiB (no superpages), 1 == 2MiB,
 360					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 361	spinlock_t	iommu_lock;	/* protect iommu set in domain */
 362	u64		max_addr;	/* maximum mapped address */
 
 
 
 363};
 364
 365/* PCI domain-device relationship */
 366struct device_domain_info {
 367	struct list_head link;	/* link to domain siblings */
 368	struct list_head global; /* link to global list */
 369	int segment;		/* PCI domain */
 370	u8 bus;			/* PCI bus number */
 371	u8 devfn;		/* PCI devfn number */
 372	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
 373	struct intel_iommu *iommu; /* IOMMU used by this device */
 374	struct dmar_domain *domain; /* pointer to domain */
 375};
 376
 377static void flush_unmaps_timeout(unsigned long data);
 378
 379DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
 380
 381#define HIGH_WATER_MARK 250
 382struct deferred_flush_tables {
 383	int next;
 384	struct iova *iova[HIGH_WATER_MARK];
 385	struct dmar_domain *domain[HIGH_WATER_MARK];
 
 386};
 387
 388static struct deferred_flush_tables *deferred_flush;
 389
 390/* bitmap for indexing intel_iommus */
 391static int g_num_of_iommus;
 392
 393static DEFINE_SPINLOCK(async_umap_flush_lock);
 394static LIST_HEAD(unmaps_to_do);
 395
 396static int timer_on;
 397static long list_size;
 398
 399static void domain_remove_dev_info(struct dmar_domain *domain);
 400
 401#ifdef CONFIG_DMAR_DEFAULT_ON
 402int dmar_disabled = 0;
 403#else
 404int dmar_disabled = 1;
 405#endif /*CONFIG_DMAR_DEFAULT_ON*/
 
 
 
 406
 407static int dmar_map_gfx = 1;
 408static int dmar_forcedac;
 409static int intel_iommu_strict;
 410static int intel_iommu_superpage = 1;
 411
 412int intel_iommu_gfx_mapped;
 413EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 414
 415#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 416static DEFINE_SPINLOCK(device_domain_lock);
 417static LIST_HEAD(device_domain_list);
 418
 419static struct iommu_ops intel_iommu_ops;
 420
 421static int __init intel_iommu_setup(char *str)
 422{
 423	if (!str)
 424		return -EINVAL;
 425	while (*str) {
 426		if (!strncmp(str, "on", 2)) {
 427			dmar_disabled = 0;
 428			printk(KERN_INFO "Intel-IOMMU: enabled\n");
 429		} else if (!strncmp(str, "off", 3)) {
 430			dmar_disabled = 1;
 431			printk(KERN_INFO "Intel-IOMMU: disabled\n");
 432		} else if (!strncmp(str, "igfx_off", 8)) {
 433			dmar_map_gfx = 0;
 434			printk(KERN_INFO
 435				"Intel-IOMMU: disable GFX device mapping\n");
 436		} else if (!strncmp(str, "forcedac", 8)) {
 437			printk(KERN_INFO
 438				"Intel-IOMMU: Forcing DAC for PCI devices\n");
 439			dmar_forcedac = 1;
 440		} else if (!strncmp(str, "strict", 6)) {
 441			printk(KERN_INFO
 442				"Intel-IOMMU: disable batched IOTLB flush\n");
 443			intel_iommu_strict = 1;
 444		} else if (!strncmp(str, "sp_off", 6)) {
 445			printk(KERN_INFO
 446				"Intel-IOMMU: disable supported super page\n");
 447			intel_iommu_superpage = 0;
 448		}
 449
 450		str += strcspn(str, ",");
 451		while (*str == ',')
 452			str++;
 453	}
 454	return 0;
 455}
 456__setup("intel_iommu=", intel_iommu_setup);
 457
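/*
 * Example (for illustration): the parser above accepts a comma-separated
 * option string on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * super page support.
 */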
 458static struct kmem_cache *iommu_domain_cache;
 459static struct kmem_cache *iommu_devinfo_cache;
 460static struct kmem_cache *iommu_iova_cache;
 461
 462static inline void *alloc_pgtable_page(int node)
 463{
 464	struct page *page;
 465	void *vaddr = NULL;
 466
 467	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 468	if (page)
 469		vaddr = page_address(page);
 470	return vaddr;
 471}
 472
 473static inline void free_pgtable_page(void *vaddr)
 474{
 475	free_page((unsigned long)vaddr);
 476}
 477
 478static inline void *alloc_domain_mem(void)
 479{
 480	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 481}
 482
 483static void free_domain_mem(void *vaddr)
 484{
 485	kmem_cache_free(iommu_domain_cache, vaddr);
 486}
 487
 488static inline void * alloc_devinfo_mem(void)
 489{
 490	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 491}
 492
 493static inline void free_devinfo_mem(void *vaddr)
 494{
 495	kmem_cache_free(iommu_devinfo_cache, vaddr);
 496}
 497
 498struct iova *alloc_iova_mem(void)
 499{
 500	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
 501}
 502
 503void free_iova_mem(struct iova *iova)
 504{
 505	kmem_cache_free(iommu_iova_cache, iova);
 506}
 507
 508
 509static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 510{
 511	unsigned long sagaw;
 512	int agaw = -1;
 513
 514	sagaw = cap_sagaw(iommu->cap);
 515	for (agaw = width_to_agaw(max_gaw);
 516	     agaw >= 0; agaw--) {
 517		if (test_bit(agaw, &sagaw))
 518			break;
 519	}
 520
 521	return agaw;
 522}
 523
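/*
 * Illustrative example: with max_gaw == 48, width_to_agaw(48) == 2, so the
 * loop above looks for the highest supported agaw <= 2 in the SAGAW bitmap.
 * If bit 2 is set the result is 2 (a 4-level table); if only bit 1 is set
 * the result falls back to 1 (3-level, 39-bit); if no bit <= 2 is set, -1
 * is returned.
 */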
 524/*
 525 * Calculate max SAGAW for each iommu.
 526 */
 527int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 528{
 529	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 530}
 531
 532/*
 533 * Calculate agaw for each iommu.
 534 * "SAGAW" may be different across iommus; use a default agaw and fall
 535 * back to a smaller supported agaw for iommus that can't do the default.
 536 */
 537int iommu_calculate_agaw(struct intel_iommu *iommu)
 538{
 539	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 540}
 541
 542/* This function only returns a single iommu in a domain */
 543static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 544{
 545	int iommu_id;
 546
 547	/* si_domain and vm domain should not get here. */
 548	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
 549	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
 
 550
 551	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
 552	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 553		return NULL;
 554
 555	return g_iommus[iommu_id];
 556}
 557
 558static void domain_update_iommu_coherency(struct dmar_domain *domain)
 559{
 
 
 
 560	int i;
 561
 562	domain->iommu_coherency = 1;
 563
 564	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 
 565		if (!ecap_coherent(g_iommus[i]->ecap)) {
 566			domain->iommu_coherency = 0;
 567			break;
 568		}
 569	}
 570}
 571
 572static void domain_update_iommu_snooping(struct dmar_domain *domain)
 573{
 574	int i;
 575
 576	domain->iommu_snooping = 1;
 577
 578	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
 579		if (!ecap_sc_support(g_iommus[i]->ecap)) {
 580			domain->iommu_snooping = 0;
 581			break;
 
 
 
 582		}
 583	}
 
 
 
 584}
 585
 586static void domain_update_iommu_superpage(struct dmar_domain *domain)
 587{
 588	struct dmar_drhd_unit *drhd;
 589	struct intel_iommu *iommu = NULL;
 590	int mask = 0xf;
 591
 592	if (!intel_iommu_superpage) {
 593		domain->iommu_superpage = 0;
 594		return;
 595	}
 596
 597	/* set iommu_superpage to the smallest common denominator */
 
 598	for_each_active_iommu(iommu, drhd) {
 599		mask &= cap_super_page_val(iommu->cap);
 600		if (!mask) {
 601			break;
 
 602		}
 603	}
 604	domain->iommu_superpage = fls(mask);
 
 
 605}
 606
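/*
 * Illustrative example: each active iommu ANDs its cap_super_page_val()
 * bits into 'mask'.  If one unit reports 0x3 (2MiB and 1GiB) and another
 * reports 0x1 (2MiB only), mask ends up 0x1 and fls(0x1) == 1, so the
 * domain only uses 2MiB superpages.  If any unit reports 0, mask becomes 0
 * and fls(0) == 0 disables superpages entirely.
 */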
 607/* Some capabilities may be different across iommus */
 608static void domain_update_iommu_cap(struct dmar_domain *domain)
 609{
 610	domain_update_iommu_coherency(domain);
 611	domain_update_iommu_snooping(domain);
 612	domain_update_iommu_superpage(domain);
 613}
 614
 615static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
 616{
 617	struct dmar_drhd_unit *drhd = NULL;
 618	int i;
 619
 620	for_each_drhd_unit(drhd) {
 621		if (drhd->ignored)
 622			continue;
 623		if (segment != drhd->segment)
 624			continue;
 625
 626		for (i = 0; i < drhd->devices_cnt; i++) {
 627			if (drhd->devices[i] &&
 628			    drhd->devices[i]->bus->number == bus &&
 629			    drhd->devices[i]->devfn == devfn)
 630				return drhd->iommu;
 631			if (drhd->devices[i] &&
 632			    drhd->devices[i]->subordinate &&
 633			    drhd->devices[i]->subordinate->number <= bus &&
 634			    drhd->devices[i]->subordinate->subordinate >= bus)
 635				return drhd->iommu;
 636		}
 637
 638		if (drhd->include_all)
 639			return drhd->iommu;
 640	}
 
 
 
 641
 642	return NULL;
 643}
 644
 645static void domain_flush_cache(struct dmar_domain *domain,
 646			       void *addr, int size)
 647{
 648	if (!domain->iommu_coherency)
 649		clflush_cache_range(addr, size);
 650}
 651
 652/* Gets context entry for a given bus and devfn */
 653static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
 654		u8 bus, u8 devfn)
 655{
 656	struct root_entry *root;
 657	struct context_entry *context;
 658	unsigned long phy_addr;
 659	unsigned long flags;
 660
 661	spin_lock_irqsave(&iommu->lock, flags);
 662	root = &iommu->root_entry[bus];
 663	context = get_context_addr_from_root(root);
 664	if (!context) {
 665		context = (struct context_entry *)
 666				alloc_pgtable_page(iommu->node);
 667		if (!context) {
 668			spin_unlock_irqrestore(&iommu->lock, flags);
 669			return NULL;
 670		}
 671		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 672		phy_addr = virt_to_phys((void *)context);
 673		set_root_value(root, phy_addr);
 674		set_root_present(root);
 675		__iommu_flush_cache(iommu, root, sizeof(*root));
 676	}
 677	spin_unlock_irqrestore(&iommu->lock, flags);
 678	return &context[devfn];
 679}
 680
 681static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 682{
 683	struct root_entry *root;
 684	struct context_entry *context;
 685	int ret;
 686	unsigned long flags;
 687
 688	spin_lock_irqsave(&iommu->lock, flags);
 689	root = &iommu->root_entry[bus];
 690	context = get_context_addr_from_root(root);
 691	if (!context) {
 692		ret = 0;
 693		goto out;
 694	}
 695	ret = context_present(&context[devfn]);
 696out:
 697	spin_unlock_irqrestore(&iommu->lock, flags);
 698	return ret;
 699}
 700
 701static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
 702{
 703	struct root_entry *root;
 704	struct context_entry *context;
 705	unsigned long flags;
 706
 707	spin_lock_irqsave(&iommu->lock, flags);
 708	root = &iommu->root_entry[bus];
 709	context = get_context_addr_from_root(root);
 710	if (context) {
 711		context_clear_entry(&context[devfn]);
 712		__iommu_flush_cache(iommu, &context[devfn], \
 713			sizeof(*context));
 714	}
 715	spin_unlock_irqrestore(&iommu->lock, flags);
 716}
 717
 718static void free_context_table(struct intel_iommu *iommu)
 719{
 720	struct root_entry *root;
 721	int i;
 722	unsigned long flags;
 723	struct context_entry *context;
 724
 725	spin_lock_irqsave(&iommu->lock, flags);
 726	if (!iommu->root_entry) {
 727		goto out;
 728	}
 729	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 730		root = &iommu->root_entry[i];
 731		context = get_context_addr_from_root(root);
 732		if (context)
 733			free_pgtable_page(context);
 734	}
 735	free_pgtable_page(iommu->root_entry);
 736	iommu->root_entry = NULL;
 737out:
 738	spin_unlock_irqrestore(&iommu->lock, flags);
 739}
 740
 741static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 742				      unsigned long pfn, int target_level)
 743{
 744	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 745	struct dma_pte *parent, *pte = NULL;
 746	int level = agaw_to_level(domain->agaw);
 747	int offset;
 748
 749	BUG_ON(!domain->pgd);
 750	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
 751	parent = domain->pgd;
 752
 753	while (level > 0) {
 754		void *tmp_page;
 755
 756		offset = pfn_level_offset(pfn, level);
 757		pte = &parent[offset];
 758		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
 759			break;
 760		if (level == target_level)
 761			break;
 762
 763		if (!dma_pte_present(pte)) {
 764			uint64_t pteval;
 765
 766			tmp_page = alloc_pgtable_page(domain->nid);
 767
 768			if (!tmp_page)
 769				return NULL;
 770
 771			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
 772			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
 773			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
 774				/* Someone else set it while we were thinking; use theirs. */
 775				free_pgtable_page(tmp_page);
 776			} else {
 777				dma_pte_addr(pte);
 778				domain_flush_cache(domain, pte, sizeof(*pte));
 779			}
 780		}
 
 
 
 781		parent = phys_to_virt(dma_pte_addr(pte));
 782		level--;
 783	}
 784
 
 
 
 785	return pte;
 786}
 787
 788
 789/* return address's pte at specific level */
 790static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
 791					 unsigned long pfn,
 792					 int level, int *large_page)
 793{
 794	struct dma_pte *parent, *pte = NULL;
 795	int total = agaw_to_level(domain->agaw);
 796	int offset;
 797
 798	parent = domain->pgd;
 799	while (level <= total) {
 800		offset = pfn_level_offset(pfn, total);
 801		pte = &parent[offset];
 802		if (level == total)
 803			return pte;
 804
 805		if (!dma_pte_present(pte)) {
 806			*large_page = total;
 807			break;
 808		}
 809
 810		if (pte->val & DMA_PTE_LARGE_PAGE) {
 811			*large_page = total;
 812			return pte;
 813		}
 814
 815		parent = phys_to_virt(dma_pte_addr(pte));
 816		total--;
 817	}
 818	return NULL;
 819}
 820
 821/* clear last level pte; a TLB flush should follow */
 822static int dma_pte_clear_range(struct dmar_domain *domain,
 823				unsigned long start_pfn,
 824				unsigned long last_pfn)
 825{
 826	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 827	unsigned int large_page = 1;
 828	struct dma_pte *first_pte, *pte;
 829	int order;
 830
 831	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 832	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 833	BUG_ON(start_pfn > last_pfn);
 834
 835	/* we don't need lock here; nobody else touches the iova range */
 836	do {
 837		large_page = 1;
 838		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
 839		if (!pte) {
 840			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
 841			continue;
 842		}
 843		do {
 844			dma_clear_pte(pte);
 845			start_pfn += lvl_to_nr_pages(large_page);
 846			pte++;
 847		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
 848
 849		domain_flush_cache(domain, first_pte,
 850				   (void *)pte - (void *)first_pte);
 851
 852	} while (start_pfn && start_pfn <= last_pfn);
 853
 854	order = (large_page - 1) * 9;
 855	return order;
 856}
 857
 858/* free page table pages. last level pte should already be cleared */
 
 
 
 859static void dma_pte_free_pagetable(struct dmar_domain *domain,
 860				   unsigned long start_pfn,
 861				   unsigned long last_pfn)
 
 862{
 863	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 864	struct dma_pte *first_pte, *pte;
 865	int total = agaw_to_level(domain->agaw);
 866	int level;
 867	unsigned long tmp;
 868	int large_page = 2;
 869
 870	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
 871	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
 872	BUG_ON(start_pfn > last_pfn);
 873
 
 
 874	/* We don't need lock here; nobody else touches the iova range */
 875	level = 2;
 876	while (level <= total) {
 877		tmp = align_to_level(start_pfn, level);
 878
 879		/* If we can't even clear one PTE at this level, we're done */
 880		if (tmp + level_size(level) - 1 > last_pfn)
 881			return;
 882
 883		do {
 884			large_page = level;
 885			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
 886			if (large_page > level)
 887				level = large_page + 1;
 888			if (!pte) {
 889				tmp = align_to_level(tmp + 1, level + 1);
 890				continue;
 891			}
 892			do {
 893				if (dma_pte_present(pte)) {
 894					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
 895					dma_clear_pte(pte);
 896				}
 897				pte++;
 898				tmp += level_size(level);
 899			} while (!first_pte_in_page(pte) &&
 900				 tmp + level_size(level) - 1 <= last_pfn);
 901
 902			domain_flush_cache(domain, first_pte,
 903					   (void *)pte - (void *)first_pte);
 904			
 905		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
 906		level++;
 907	}
 908	/* free pgd */
 909	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
 910		free_pgtable_page(domain->pgd);
 
 
 
 911		domain->pgd = NULL;
 912	}
 913}
 914
 915/* iommu handling */
 916static int iommu_alloc_root_entry(struct intel_iommu *iommu)
 917{
 918	struct root_entry *root;
 919	unsigned long flags;
 920
 921	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
 922	if (!root)
 
 
 923		return -ENOMEM;
 
 924
 925	__iommu_flush_cache(iommu, root, ROOT_SIZE);
 926
 927	spin_lock_irqsave(&iommu->lock, flags);
 928	iommu->root_entry = root;
 929	spin_unlock_irqrestore(&iommu->lock, flags);
 930
 931	return 0;
 932}
 933
 934static void iommu_set_root_entry(struct intel_iommu *iommu)
 935{
 936	void *addr;
 937	u32 sts;
 938	unsigned long flag;
 939
 940	addr = iommu->root_entry;
 
 
 941
 942	spin_lock_irqsave(&iommu->register_lock, flag);
 943	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
 944
 945	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
 946
 947	/* Make sure hardware completes it */
 948	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 949		      readl, (sts & DMA_GSTS_RTPS), sts);
 950
 951	spin_unlock_irqrestore(&iommu->register_lock, flag);
 952}
 953
 954static void iommu_flush_write_buffer(struct intel_iommu *iommu)
 955{
 956	u32 val;
 957	unsigned long flag;
 958
 959	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
 960		return;
 961
 962	spin_lock_irqsave(&iommu->register_lock, flag);
 963	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
 964
 965	/* Make sure hardware completes it */
 966	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 967		      readl, (!(val & DMA_GSTS_WBFS)), val);
 968
 969	spin_unlock_irqrestore(&iommu->register_lock, flag);
 970}
 971
 972/* return value determines whether we need a write buffer flush */
 973static void __iommu_flush_context(struct intel_iommu *iommu,
 974				  u16 did, u16 source_id, u8 function_mask,
 975				  u64 type)
 976{
 977	u64 val = 0;
 978	unsigned long flag;
 979
 980	switch (type) {
 981	case DMA_CCMD_GLOBAL_INVL:
 982		val = DMA_CCMD_GLOBAL_INVL;
 983		break;
 984	case DMA_CCMD_DOMAIN_INVL:
 985		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
 986		break;
 987	case DMA_CCMD_DEVICE_INVL:
 988		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
 989			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
 990		break;
 991	default:
 992		BUG();
 993	}
 994	val |= DMA_CCMD_ICC;
 995
 996	spin_lock_irqsave(&iommu->register_lock, flag);
 997	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
 998
 999	/* Make sure hardware completes it */
1000	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1001		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1002
1003	spin_unlock_irqrestore(&iommu->register_lock, flag);
1004}
1005
1006/* return value determines whether we need a write buffer flush */
1007static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1008				u64 addr, unsigned int size_order, u64 type)
1009{
1010	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1011	u64 val = 0, val_iva = 0;
1012	unsigned long flag;
1013
1014	switch (type) {
1015	case DMA_TLB_GLOBAL_FLUSH:
1016		/* global flush doesn't need to set IVA_REG */
1017		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1018		break;
1019	case DMA_TLB_DSI_FLUSH:
1020		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1021		break;
1022	case DMA_TLB_PSI_FLUSH:
1023		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1024		/* Note: always flush non-leaf currently */
1025		val_iva = size_order | addr;
1026		break;
1027	default:
1028		BUG();
1029	}
1030	/* Note: set drain read/write */
1031#if 0
1032	/*
1033	 * This is probably meant to be extra safe.  It looks like we can
1034	 * ignore it without any impact.
1035	 */
1036	if (cap_read_drain(iommu->cap))
1037		val |= DMA_TLB_READ_DRAIN;
1038#endif
1039	if (cap_write_drain(iommu->cap))
1040		val |= DMA_TLB_WRITE_DRAIN;
1041
1042	spin_lock_irqsave(&iommu->register_lock, flag);
1043	/* Note: Only uses first TLB reg currently */
1044	if (val_iva)
1045		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1046	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1047
1048	/* Make sure hardware completes it */
1049	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1050		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1051
1052	spin_unlock_irqrestore(&iommu->register_lock, flag);
1053
1054	/* check IOTLB invalidation granularity */
1055	if (DMA_TLB_IAIG(val) == 0)
1056		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1057	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1058		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1059			(unsigned long long)DMA_TLB_IIRG(type),
1060			(unsigned long long)DMA_TLB_IAIG(val));
1061}
1062
1063static struct device_domain_info *iommu_support_dev_iotlb(
1064	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
 
1065{
1066	int found = 0;
1067	unsigned long flags;
1068	struct device_domain_info *info;
1069	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1070
1071	if (!ecap_dev_iotlb_support(iommu->ecap))
1072		return NULL;
1073
1074	if (!iommu->qi)
1075		return NULL;
1076
1077	spin_lock_irqsave(&device_domain_lock, flags);
1078	list_for_each_entry(info, &domain->devices, link)
1079		if (info->bus == bus && info->devfn == devfn) {
1080			found = 1;
 
 
1081			break;
1082		}
1083	spin_unlock_irqrestore(&device_domain_lock, flags);
1084
1085	if (!found || !info->dev)
1086		return NULL;
1087
1088	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1089		return NULL;
 
 
1090
1091	if (!dmar_find_matched_atsr_unit(info->dev))
1092		return NULL;
1093
1094	info->iommu = iommu;
 
1095
1096	return info;
1097}
1098
1099static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1100{
1101	if (!info)
 
 
 
 
1102		return;
1103
1104	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1105}
1106
1107static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1108{
1109	if (!info->dev || !pci_ats_enabled(info->dev))
 
 
 
 
1110		return;
1111
1112	pci_disable_ats(info->dev);
1113}
1114
1115static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1116				  u64 addr, unsigned mask)
1117{
1118	u16 sid, qdep;
1119	unsigned long flags;
1120	struct device_domain_info *info;
1121
 
 
 
1122	spin_lock_irqsave(&device_domain_lock, flags);
1123	list_for_each_entry(info, &domain->devices, link) {
1124		if (!info->dev || !pci_ats_enabled(info->dev))
1125			continue;
1126
1127		sid = info->bus << 8 | info->devfn;
1128		qdep = pci_ats_queue_depth(info->dev);
1129		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1130	}
1131	spin_unlock_irqrestore(&device_domain_lock, flags);
1132}
1133
1134static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1135				  unsigned long pfn, unsigned int pages, int map)
 
 
1136{
1137	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1138	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
 
1139
1140	BUG_ON(pages == 0);
1141
 
 
1142	/*
1143	 * Fallback to domain selective flush if no PSI support or the size is
1144	 * too big.
1145	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1146	 * aligned to the size
1147	 */
1148	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1149		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1150						DMA_TLB_DSI_FLUSH);
1151	else
1152		iommu->flush.flush_iotlb(iommu, did, addr, mask,
1153						DMA_TLB_PSI_FLUSH);
1154
1155	/*
1156	 * In caching mode, changing a page from non-present to present requires
1157	 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1158	 */
1159	if (!cap_caching_mode(iommu->cap) || !map)
1160		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1161}
1162
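/*
 * Illustrative example (hypothetical values): flushing 9 pages starting at
 * pfn 0x1000 gives mask = ilog2(__roundup_pow_of_two(9)) = 4, i.e. a
 * 16-page invalidation at addr = 0x1000 << VTD_PAGE_SHIFT.  If the hardware
 * cannot do a PSI of that size, the code above falls back to a
 * domain-selective flush instead.
 */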
1163static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1164{
1165	u32 pmen;
1166	unsigned long flags;
1167
1168	spin_lock_irqsave(&iommu->register_lock, flags);
1169	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1170	pmen &= ~DMA_PMEN_EPM;
1171	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1172
1173	/* wait for the protected region status bit to clear */
1174	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1175		readl, !(pmen & DMA_PMEN_PRS), pmen);
1176
1177	spin_unlock_irqrestore(&iommu->register_lock, flags);
1178}
1179
1180static int iommu_enable_translation(struct intel_iommu *iommu)
1181{
1182	u32 sts;
1183	unsigned long flags;
1184
1185	spin_lock_irqsave(&iommu->register_lock, flags);
1186	iommu->gcmd |= DMA_GCMD_TE;
1187	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1188
1189	/* Make sure hardware completes it */
1190	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1191		      readl, (sts & DMA_GSTS_TES), sts);
1192
1193	spin_unlock_irqrestore(&iommu->register_lock, flags);
1194	return 0;
1195}
1196
1197static int iommu_disable_translation(struct intel_iommu *iommu)
1198{
1199	u32 sts;
1200	unsigned long flag;
1201
1202	spin_lock_irqsave(&iommu->register_lock, flag);
1203	iommu->gcmd &= ~DMA_GCMD_TE;
1204	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1205
1206	/* Make sure hardware completes it */
1207	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1208		      readl, (!(sts & DMA_GSTS_TES)), sts);
1209
1210	spin_unlock_irqrestore(&iommu->register_lock, flag);
1211	return 0;
1212}
1213
1214
1215static int iommu_init_domains(struct intel_iommu *iommu)
1216{
1217	unsigned long ndomains;
1218	unsigned long nlongs;
1219
1220	ndomains = cap_ndoms(iommu->cap);
1221	pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1222			ndomains);
1223	nlongs = BITS_TO_LONGS(ndomains);
1224
1225	spin_lock_init(&iommu->lock);
1226
1227	/* TBD: there might be 64K domains,
1228	 * consider other allocation for future chip
1229	 */
1230	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1231	if (!iommu->domain_ids) {
1232		printk(KERN_ERR "Allocating domain id array failed\n");
 
1233		return -ENOMEM;
1234	}
1235	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1236			GFP_KERNEL);
1237	if (!iommu->domains) {
1238		printk(KERN_ERR "Allocating domain array failed\n");
1239		return -ENOMEM;
1240	}
1241
 
 
1242	/*
1243	 * If caching mode is set, then invalid translations are tagged
1244	 * with domain id 0.  Hence we need to pre-allocate it.
 
 
1245	 */
1246	if (cap_caching_mode(iommu->cap))
1247		set_bit(0, iommu->domain_ids);
1248	return 0;
1249}
1250
1251
1252static void domain_exit(struct dmar_domain *domain);
1253static void vm_domain_exit(struct dmar_domain *domain);
1254
1255void free_dmar_iommu(struct intel_iommu *iommu)
1256{
1257	struct dmar_domain *domain;
1258	int i;
1259	unsigned long flags;
1260
1261	if ((iommu->domains) && (iommu->domain_ids)) {
1262		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1263			domain = iommu->domains[i];
1264			clear_bit(i, iommu->domain_ids);
1265
1266			spin_lock_irqsave(&domain->iommu_lock, flags);
1267			if (--domain->iommu_count == 0) {
1268				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1269					vm_domain_exit(domain);
1270				else
1271					domain_exit(domain);
1272			}
1273			spin_unlock_irqrestore(&domain->iommu_lock, flags);
1274		}
1275	}
 
1276
1277	if (iommu->gcmd & DMA_GCMD_TE)
1278		iommu_disable_translation(iommu);
 
1279
1280	if (iommu->irq) {
1281		irq_set_handler_data(iommu->irq, NULL);
1282		/* This will mask the irq */
1283		free_irq(iommu->irq, iommu);
1284		destroy_irq(iommu->irq);
1285	}
1286
1287	kfree(iommu->domains);
1288	kfree(iommu->domain_ids);
1289
1290	g_iommus[iommu->seq_id] = NULL;
1291
1292	/* if all iommus are freed, free g_iommus */
1293	for (i = 0; i < g_num_of_iommus; i++) {
1294		if (g_iommus[i])
1295			break;
 
 
1296	}
1297
1298	if (i == g_num_of_iommus)
1299		kfree(g_iommus);
1300
1301	/* free context mapping */
1302	free_context_table(iommu);
1303}
1304
1305static struct dmar_domain *alloc_domain(void)
1306{
1307	struct dmar_domain *domain;
1308
1309	domain = alloc_domain_mem();
1310	if (!domain)
1311		return NULL;
1312
 
1313	domain->nid = -1;
1314	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1315	domain->flags = 0;
 
1316
1317	return domain;
1318}
1319
1320static int iommu_attach_domain(struct dmar_domain *domain,
 
1321			       struct intel_iommu *iommu)
1322{
1323	int num;
1324	unsigned long ndomains;
1325	unsigned long flags;
1326
1327	ndomains = cap_ndoms(iommu->cap);
 
1328
1329	spin_lock_irqsave(&iommu->lock, flags);
1330
1331	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332	if (num >= ndomains) {
1333		spin_unlock_irqrestore(&iommu->lock, flags);
1334		printk(KERN_ERR "IOMMU: no free domain ids\n");
1335		return -ENOMEM;
1336	}
1337
1338	domain->id = num;
1339	set_bit(num, iommu->domain_ids);
1340	set_bit(iommu->seq_id, &domain->iommu_bmp);
1341	iommu->domains[num] = domain;
1342	spin_unlock_irqrestore(&iommu->lock, flags);
 
 
 
1343
1344	return 0;
1345}
1346
1347static void iommu_detach_domain(struct dmar_domain *domain,
1348				struct intel_iommu *iommu)
1349{
1350	unsigned long flags;
1351	int num, ndomains;
1352	int found = 0;
1353
1354	spin_lock_irqsave(&iommu->lock, flags);
1355	ndomains = cap_ndoms(iommu->cap);
1356	for_each_set_bit(num, iommu->domain_ids, ndomains) {
1357		if (iommu->domains[num] == domain) {
1358			found = 1;
1359			break;
1360		}
1361	}
1362
1363	if (found) {
 
 
 
1364		clear_bit(num, iommu->domain_ids);
1365		clear_bit(iommu->seq_id, &domain->iommu_bmp);
1366		iommu->domains[num] = NULL;
 
 
1367	}
1368	spin_unlock_irqrestore(&iommu->lock, flags);
 
1369}
1370
1371static struct iova_domain reserved_iova_list;
1372static struct lock_class_key reserved_rbtree_key;
1373
1374static int dmar_init_reserved_ranges(void)
1375{
1376	struct pci_dev *pdev = NULL;
1377	struct iova *iova;
1378	int i;
1379
1380	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1381
1382	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1383		&reserved_rbtree_key);
1384
1385	/* IOAPIC ranges shouldn't be accessed by DMA */
1386	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1387		IOVA_PFN(IOAPIC_RANGE_END));
1388	if (!iova) {
1389		printk(KERN_ERR "Reserve IOAPIC range failed\n");
1390		return -ENODEV;
1391	}
1392
1393	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1394	for_each_pci_dev(pdev) {
1395		struct resource *r;
1396
1397		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1398			r = &pdev->resource[i];
1399			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1400				continue;
1401			iova = reserve_iova(&reserved_iova_list,
1402					    IOVA_PFN(r->start),
1403					    IOVA_PFN(r->end));
1404			if (!iova) {
1405				printk(KERN_ERR "Reserve iova failed\n");
1406				return -ENODEV;
1407			}
1408		}
1409	}
1410	return 0;
1411}
1412
1413static void domain_reserve_special_ranges(struct dmar_domain *domain)
1414{
1415	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1416}
1417
1418static inline int guestwidth_to_adjustwidth(int gaw)
1419{
1420	int agaw;
1421	int r = (gaw - 12) % 9;
1422
1423	if (r == 0)
1424		agaw = gaw;
1425	else
1426		agaw = gaw + 9 - r;
1427	if (agaw > 64)
1428		agaw = 64;
1429	return agaw;
1430}
1431
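/*
 * Illustrative examples: guestwidth_to_adjustwidth(39) == 39 and
 * guestwidth_to_adjustwidth(48) == 48 (already 12 plus a multiple of the
 * 9-bit stride), while guestwidth_to_adjustwidth(40) == 48 (rounded up to
 * the next such width).  Results are capped at 64.
 */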
1432static int domain_init(struct dmar_domain *domain, int guest_width)
 
1433{
1434	struct intel_iommu *iommu;
1435	int adjust_width, agaw;
1436	unsigned long sagaw;
 
1437
1438	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1439	spin_lock_init(&domain->iommu_lock);
1440
1441	domain_reserve_special_ranges(domain);
1442
1443	/* calculate AGAW */
1444	iommu = domain_get_iommu(domain);
1445	if (guest_width > cap_mgaw(iommu->cap))
1446		guest_width = cap_mgaw(iommu->cap);
1447	domain->gaw = guest_width;
1448	adjust_width = guestwidth_to_adjustwidth(guest_width);
1449	agaw = width_to_agaw(adjust_width);
1450	sagaw = cap_sagaw(iommu->cap);
1451	if (!test_bit(agaw, &sagaw)) {
1452		/* hardware doesn't support it, choose a bigger one */
1453		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1454		agaw = find_next_bit(&sagaw, 5, agaw);
1455		if (agaw >= 5)
1456			return -ENODEV;
1457	}
1458	domain->agaw = agaw;
1459	INIT_LIST_HEAD(&domain->devices);
1460
1461	if (ecap_coherent(iommu->ecap))
1462		domain->iommu_coherency = 1;
1463	else
1464		domain->iommu_coherency = 0;
1465
1466	if (ecap_sc_support(iommu->ecap))
1467		domain->iommu_snooping = 1;
1468	else
1469		domain->iommu_snooping = 0;
1470
1471	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1472	domain->iommu_count = 1;
 
 
 
1473	domain->nid = iommu->node;
1474
1475	/* always allocate the top pgd */
1476	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1477	if (!domain->pgd)
1478		return -ENOMEM;
1479	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1480	return 0;
1481}
1482
1483static void domain_exit(struct dmar_domain *domain)
1484{
1485	struct dmar_drhd_unit *drhd;
1486	struct intel_iommu *iommu;
1487
1488	/* Domain 0 is reserved, so don't process it */
1489	if (!domain)
1490		return;
1491
1492	/* Flush any lazy unmaps that may reference this domain */
1493	if (!intel_iommu_strict)
1494		flush_unmaps_timeout(0);
1495
1496	domain_remove_dev_info(domain);
 
 
1497	/* destroy iovas */
1498	put_iova_domain(&domain->iovad);
1499
1500	/* clear ptes */
1501	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1502
1503	/* free page tables */
1504	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1505
1506	for_each_active_iommu(iommu, drhd)
1507		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1508			iommu_detach_domain(domain, iommu);
1509
1510	free_domain_mem(domain);
1511}
1512
1513static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1514				 u8 bus, u8 devfn, int translation)
 
1515{
 
 
 
1516	struct context_entry *context;
1517	unsigned long flags;
1518	struct intel_iommu *iommu;
1519	struct dma_pte *pgd;
1520	unsigned long num;
1521	unsigned long ndomains;
1522	int id;
1523	int agaw;
1524	struct device_domain_info *info = NULL;
 
1525
1526	pr_debug("Set context mapping for %02x:%02x.%d\n",
1527		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1528
1529	BUG_ON(!domain->pgd);
1530	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1531	       translation != CONTEXT_TT_MULTI_LEVEL);
1532
1533	iommu = device_to_iommu(segment, bus, devfn);
1534	if (!iommu)
1535		return -ENODEV;
1536
1537	context = device_to_context_entry(iommu, bus, devfn);
 
1538	if (!context)
1539		return -ENOMEM;
1540	spin_lock_irqsave(&iommu->lock, flags);
1541	if (context_present(context)) {
1542		spin_unlock_irqrestore(&iommu->lock, flags);
1543		return 0;
1544	}
1545
1546	id = domain->id;
1547	pgd = domain->pgd;
 
1548
1549	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1550	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1551		int found = 0;
1552
1553		/* find an available domain id for this device in iommu */
1554		ndomains = cap_ndoms(iommu->cap);
1555		for_each_set_bit(num, iommu->domain_ids, ndomains) {
1556			if (iommu->domains[num] == domain) {
1557				id = num;
1558				found = 1;
1559				break;
1560			}
1561		}
 
1562
1563		if (found == 0) {
1564			num = find_first_zero_bit(iommu->domain_ids, ndomains);
1565			if (num >= ndomains) {
1566				spin_unlock_irqrestore(&iommu->lock, flags);
1567				printk(KERN_ERR "IOMMU: no free domain ids\n");
1568				return -EFAULT;
1569			}
1570
1571			set_bit(num, iommu->domain_ids);
1572			iommu->domains[num] = domain;
1573			id = num;
1574		}
1575
1576		/* Skip top levels of page tables for
1577		 * iommu which has less agaw than default.
1578		 * Unnecessary for PT mode.
1579		 */
1580		if (translation != CONTEXT_TT_PASS_THROUGH) {
1581			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1582				pgd = phys_to_virt(dma_pte_addr(pgd));
1583				if (!dma_pte_present(pgd)) {
1584					spin_unlock_irqrestore(&iommu->lock, flags);
1585					return -ENOMEM;
1586				}
1587			}
1588		}
1589	}
1590
1591	context_set_domain_id(context, id);
1592
1593	if (translation != CONTEXT_TT_PASS_THROUGH) {
1594		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1595		translation = info ? CONTEXT_TT_DEV_IOTLB :
1596				     CONTEXT_TT_MULTI_LEVEL;
1597	}
1598	/*
1599	 * In pass through mode, AW must be programmed to indicate the largest
1600	 * AGAW value supported by hardware. And ASR is ignored by hardware.
1601	 */
1602	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1603		context_set_address_width(context, iommu->msagaw);
1604	else {
1605		context_set_address_root(context, virt_to_phys(pgd));
1606		context_set_address_width(context, iommu->agaw);
1607	}
1608
1609	context_set_translation_type(context, translation);
1610	context_set_fault_enable(context);
1611	context_set_present(context);
1612	domain_flush_cache(domain, context, sizeof(*context));
1613
1614	/*
1615	 * It's a non-present to present mapping. If hardware doesn't cache
1616	 * non-present entries we only need to flush the write-buffer. If it
1617	 * _does_ cache non-present entries, then it does so in the special
1618	 * domain #0, which we have to flush:
1619	 */
1620	if (cap_caching_mode(iommu->cap)) {
1621		iommu->flush.flush_context(iommu, 0,
1622					   (((u16)bus) << 8) | devfn,
1623					   DMA_CCMD_MASK_NOBIT,
1624					   DMA_CCMD_DEVICE_INVL);
1625		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1626	} else {
1627		iommu_flush_write_buffer(iommu);
1628	}
1629	iommu_enable_dev_iotlb(info);
1630	spin_unlock_irqrestore(&iommu->lock, flags);
1631
1632	spin_lock_irqsave(&domain->iommu_lock, flags);
1633	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1634		domain->iommu_count++;
1635		if (domain->iommu_count == 1)
1636			domain->nid = iommu->node;
1637		domain_update_iommu_cap(domain);
1638	}
1639	spin_unlock_irqrestore(&domain->iommu_lock, flags);
1640	return 0;
1641}
1642
1643static int
1644domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1645			int translation)
1646{
1647	int ret;
1648	struct pci_dev *tmp, *parent;
 
1649
1650	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1651					 pdev->bus->number, pdev->devfn,
1652					 translation);
1653	if (ret)
1654		return ret;
1655
1656	/* dependent device mapping */
1657	tmp = pci_find_upstream_pcie_bridge(pdev);
1658	if (!tmp)
1659		return 0;
1660	/* Secondary interface's bus number and devfn 0 */
1661	parent = pdev->bus->self;
1662	while (parent != tmp) {
1663		ret = domain_context_mapping_one(domain,
1664						 pci_domain_nr(parent->bus),
1665						 parent->bus->number,
1666						 parent->devfn, translation);
1667		if (ret)
1668			return ret;
1669		parent = parent->bus->self;
1670	}
1671	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1672		return domain_context_mapping_one(domain,
1673					pci_domain_nr(tmp->subordinate),
1674					tmp->subordinate->number, 0,
1675					translation);
1676	else /* this is a legacy PCI bridge */
1677		return domain_context_mapping_one(domain,
1678						  pci_domain_nr(tmp->bus),
1679						  tmp->bus->number,
1680						  tmp->devfn,
1681						  translation);
1682}
1683
1684static int domain_context_mapped(struct pci_dev *pdev)
1685{
1686	int ret;
1687	struct pci_dev *tmp, *parent;
1688	struct intel_iommu *iommu;
 
1689
1690	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1691				pdev->devfn);
1692	if (!iommu)
1693		return -ENODEV;
1694
1695	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1696	if (!ret)
1697		return ret;
1698	/* dependent device mapping */
1699	tmp = pci_find_upstream_pcie_bridge(pdev);
1700	if (!tmp)
1701		return ret;
1702	/* Secondary interface's bus number and devfn 0 */
1703	parent = pdev->bus->self;
1704	while (parent != tmp) {
1705		ret = device_context_mapped(iommu, parent->bus->number,
1706					    parent->devfn);
1707		if (!ret)
1708			return ret;
1709		parent = parent->bus->self;
1710	}
1711	if (pci_is_pcie(tmp))
1712		return device_context_mapped(iommu, tmp->subordinate->number,
1713					     0);
1714	else
1715		return device_context_mapped(iommu, tmp->bus->number,
1716					     tmp->devfn);
1717}
1718
1719/* Returns the number of VT-d pages, but aligned to the MM page size */
1720static inline unsigned long aligned_nrpages(unsigned long host_addr,
1721					    size_t size)
1722{
1723	host_addr &= ~PAGE_MASK;
1724	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1725}
1726
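/*
 * Illustrative example (assuming 4KiB MM pages, hypothetical values): a
 * buffer at host address 0x1234 of 0x2000 bytes spans
 * aligned_nrpages(0x1234, 0x2000) == 3 VT-d pages, because the unaligned
 * start and end each pull in a partial page.
 */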
1727/* Return largest possible superpage level for a given mapping */
1728static inline int hardware_largepage_caps(struct dmar_domain *domain,
1729					  unsigned long iov_pfn,
1730					  unsigned long phy_pfn,
1731					  unsigned long pages)
1732{
1733	int support, level = 1;
1734	unsigned long pfnmerge;
1735
1736	support = domain->iommu_superpage;
1737
1738	/* To use a large page, the virtual *and* physical addresses
1739	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1740	   of them will mean we have to use smaller pages. So just
1741	   merge them and check both at once. */
1742	pfnmerge = iov_pfn | phy_pfn;
1743
1744	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1745		pages >>= VTD_STRIDE_SHIFT;
1746		if (!pages)
1747			break;
1748		pfnmerge >>= VTD_STRIDE_SHIFT;
1749		level++;
1750		support--;
1751	}
1752	return level;
1753}
1754
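/*
 * Illustrative example: with domain->iommu_superpage == 1 (2MiB pages
 * supported), a request whose iov_pfn and phy_pfn are both 2MiB aligned
 * (low 9 bits clear) and which covers at least 512 small pages returns
 * level 2, so a single 2MiB PTE can be used.  Any misalignment in either
 * pfn, or a shorter run, keeps the level at 1 (4KiB pages).
 */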
1755static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1756			    struct scatterlist *sg, unsigned long phys_pfn,
1757			    unsigned long nr_pages, int prot)
1758{
1759	struct dma_pte *first_pte = NULL, *pte = NULL;
1760	phys_addr_t uninitialized_var(pteval);
1761	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1762	unsigned long sg_res;
1763	unsigned int largepage_lvl = 0;
1764	unsigned long lvl_pages = 0;
1765
1766	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1767
1768	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1769		return -EINVAL;
1770
1771	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1772
1773	if (sg)
1774		sg_res = 0;
1775	else {
1776		sg_res = nr_pages + 1;
1777		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1778	}
1779
1780	while (nr_pages > 0) {
1781		uint64_t tmp;
1782
1783		if (!sg_res) {
 
 
1784			sg_res = aligned_nrpages(sg->offset, sg->length);
1785			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1786			sg->dma_length = sg->length;
1787			pteval = page_to_phys(sg_page(sg)) | prot;
1788			phys_pfn = pteval >> VTD_PAGE_SHIFT;
1789		}
1790
1791		if (!pte) {
1792			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1793
1794			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1795			if (!pte)
1796				return -ENOMEM;
1797			/* It is large page*/
1798			if (largepage_lvl > 1)
 
 
1799				pteval |= DMA_PTE_LARGE_PAGE;
1800			else
1801				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
 
1802
1803		}
1804		/* We don't need lock here, nobody else
1805		 * touches the iova range
1806		 */
1807		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1808		if (tmp) {
1809			static int dumps = 5;
1810			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1811			       iov_pfn, tmp, (unsigned long long)pteval);
1812			if (dumps) {
1813				dumps--;
1814				debug_dma_dump_mappings(NULL);
1815			}
1816			WARN_ON(1);
1817		}
1818
1819		lvl_pages = lvl_to_nr_pages(largepage_lvl);
1820
1821		BUG_ON(nr_pages < lvl_pages);
1822		BUG_ON(sg_res < lvl_pages);
1823
1824		nr_pages -= lvl_pages;
1825		iov_pfn += lvl_pages;
1826		phys_pfn += lvl_pages;
1827		pteval += lvl_pages * VTD_PAGE_SIZE;
1828		sg_res -= lvl_pages;
1829
1830		/* If the next PTE would be the first in a new page, then we
1831		   need to flush the cache on the entries we've just written.
1832		   And then we'll need to recalculate 'pte', so clear it and
1833		   let it get set again in the if (!pte) block above.
1834
1835		   If we're done (!nr_pages) we need to flush the cache too.
1836
1837		   Also if we've been setting superpages, we may need to
1838		   recalculate 'pte' and switch back to smaller pages for the
1839		   end of the mapping, if the trailing size is not enough to
1840		   use another superpage (i.e. sg_res < lvl_pages). */
1841		pte++;
1842		if (!nr_pages || first_pte_in_page(pte) ||
1843		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
1844			domain_flush_cache(domain, first_pte,
1845					   (void *)pte - (void *)first_pte);
1846			pte = NULL;
1847		}
1848
1849		if (!sg_res && nr_pages)
1850			sg = sg_next(sg);
1851	}
1852	return 0;
1853}
1854
1855static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1856				    struct scatterlist *sg, unsigned long nr_pages,
1857				    int prot)
1858{
1859	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1860}
1861
1862static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1863				     unsigned long phys_pfn, unsigned long nr_pages,
1864				     int prot)
1865{
1866	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1867}
1868
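/*
 * Illustrative call (hypothetical values): map 16 contiguous 4KiB pages of
 * host memory starting at pfn 0x1000 at the same bus address, read/write:
 *
 *	domain_pfn_mapping(domain, 0x1000, 0x1000, 16,
 *			   DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * A scatterlist mapping would instead go through domain_sg_mapping(),
 * which takes the physical pages from the sg entries.
 */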
1869static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1870{
1871	if (!iommu)
1872		return;
1873
1874	clear_context_table(iommu, bus, devfn);
1875	iommu->flush.flush_context(iommu, 0, 0, 0,
1876					   DMA_CCMD_GLOBAL_INVL);
1877	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1878}
1879
1880static void domain_remove_dev_info(struct dmar_domain *domain)
1881{
1882	struct device_domain_info *info;
1883	unsigned long flags;
1884	struct intel_iommu *iommu;
1885
1886	spin_lock_irqsave(&device_domain_lock, flags);
1887	while (!list_empty(&domain->devices)) {
1888		info = list_entry(domain->devices.next,
1889			struct device_domain_info, link);
1890		list_del(&info->link);
1891		list_del(&info->global);
1892		if (info->dev)
1893			info->dev->dev.archdata.iommu = NULL;
1894		spin_unlock_irqrestore(&device_domain_lock, flags);
1895
1896		iommu_disable_dev_iotlb(info);
1897		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1898		iommu_detach_dev(iommu, info->bus, info->devfn);
1899		free_devinfo_mem(info);
1900
1901		spin_lock_irqsave(&device_domain_lock, flags);
1902	}
1903	spin_unlock_irqrestore(&device_domain_lock, flags);
1904}
1905
1906/*
1907 * find_domain
1908 * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1909 */
1910static struct dmar_domain *
1911find_domain(struct pci_dev *pdev)
1912{
1913	struct device_domain_info *info;
1914
1915	/* No lock here, assumes no domain exit in normal case */
1916	info = pdev->dev.archdata.iommu;
1917	if (info)
1918		return info->domain;
1919	return NULL;
1920}
1921
1922/* domain is initialized */
1923static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1924{
1925	struct dmar_domain *domain, *found = NULL;
1926	struct intel_iommu *iommu;
1927	struct dmar_drhd_unit *drhd;
1928	struct device_domain_info *info, *tmp;
1929	struct pci_dev *dev_tmp;
1930	unsigned long flags;
1931	int bus = 0, devfn = 0;
1932	int segment;
1933	int ret;
1934
1935	domain = find_domain(pdev);
1936	if (domain)
1937		return domain;
1938
1939	segment = pci_domain_nr(pdev->bus);
 
1940
1941	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1942	if (dev_tmp) {
1943		if (pci_is_pcie(dev_tmp)) {
1944			bus = dev_tmp->subordinate->number;
1945			devfn = 0;
1946		} else {
1947			bus = dev_tmp->bus->number;
1948			devfn = dev_tmp->devfn;
1949		}
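		/*
		 * Devices behind a PCIe-to-PCI bridge end up with the same
		 * source-id on their transactions, so they must all share the
		 * domain keyed by the bridge's (segment, bus, devfn) chosen
		 * above.
		 */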
1950		spin_lock_irqsave(&device_domain_lock, flags);
1951		list_for_each_entry(info, &device_domain_list, global) {
1952			if (info->segment == segment &&
1953			    info->bus == bus && info->devfn == devfn) {
1954				found = info->domain;
1955				break;
1956			}
1957		}
1958		spin_unlock_irqrestore(&device_domain_lock, flags);
1959		/* pcie-pci bridge already has a domain, use it */
1960		if (found) {
1961			domain = found;
1962			goto found_domain;
1963		}
1964	}
1965
1966	domain = alloc_domain();
1967	if (!domain)
1968		goto error;
1969
1970	/* Allocate new domain for the device */
1971	drhd = dmar_find_matched_drhd_unit(pdev);
1972	if (!drhd) {
1973		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1974			pci_name(pdev));
1975		return NULL;
1976	}
1977	iommu = drhd->iommu;
1978
1979	ret = iommu_attach_domain(domain, iommu);
 
 
 
1980	if (ret) {
1981		free_domain_mem(domain);
1982		goto error;
 
1983	}
1984
1985	if (domain_init(domain, gaw)) {
1986		domain_exit(domain);
1987		goto error;
1988	}
1989
1990	/* register pcie-to-pci device */
1991	if (dev_tmp) {
1992		info = alloc_devinfo_mem();
1993		if (!info) {
1994			domain_exit(domain);
1995			goto error;
1996		}
1997		info->segment = segment;
1998		info->bus = bus;
1999		info->devfn = devfn;
2000		info->dev = NULL;
2001		info->domain = domain;
2002		/* This domain is shared by devices under p2p bridge */
2003		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2004
2005		/* pcie-to-pci bridge already has a domain, use it */
2006		found = NULL;
2007		spin_lock_irqsave(&device_domain_lock, flags);
2008		list_for_each_entry(tmp, &device_domain_list, global) {
2009			if (tmp->segment == segment &&
2010			    tmp->bus == bus && tmp->devfn == devfn) {
2011				found = tmp->domain;
2012				break;
2013			}
2014		}
2015		if (found) {
2016			spin_unlock_irqrestore(&device_domain_lock, flags);
2017			free_devinfo_mem(info);
2018			domain_exit(domain);
2019			domain = found;
2020		} else {
2021			list_add(&info->link, &domain->devices);
2022			list_add(&info->global, &device_domain_list);
2023			spin_unlock_irqrestore(&device_domain_lock, flags);
2024		}
2025	}
2026
2027found_domain:
2028	info = alloc_devinfo_mem();
2029	if (!info)
2030		goto error;
2031	info->segment = segment;
2032	info->bus = pdev->bus->number;
2033	info->devfn = pdev->devfn;
2034	info->dev = pdev;
2035	info->domain = domain;
2036	spin_lock_irqsave(&device_domain_lock, flags);
2037	/* somebody else was faster and already set up this device */
2038	found = find_domain(pdev);
2039	if (found != NULL) {
2040		spin_unlock_irqrestore(&device_domain_lock, flags);
2041		if (found != domain) {
2042			domain_exit(domain);
2043			domain = found;
2044		}
2045		free_devinfo_mem(info);
2046		return domain;
2047	}
2048	list_add(&info->link, &domain->devices);
2049	list_add(&info->global, &device_domain_list);
2050	pdev->dev.archdata.iommu = info;
2051	spin_unlock_irqrestore(&device_domain_lock, flags);
 
2052	return domain;
2053error:
2054	/* recheck here; somebody else may have set it in the meantime */
2055	return find_domain(pdev);
2056}
2057
2058static int iommu_identity_mapping;
2059#define IDENTMAP_ALL		1
2060#define IDENTMAP_GFX		2
2061#define IDENTMAP_AZALIA		4
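/*
 * iommu_identity_mapping is a bitmask selecting which devices get a static
 * 1:1 (si_domain) mapping: IDENTMAP_ALL covers every eligible device (set
 * when iommu_pass_through is requested), IDENTMAP_GFX covers graphics
 * devices (CONFIG_DMAR_BROKEN_GFX_WA), and IDENTMAP_AZALIA the integrated
 * Azalia audio device (apparently set by the Tylersburg isochronous quirk).
 */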
2062
2063static int iommu_domain_identity_map(struct dmar_domain *domain,
2064				     unsigned long long start,
2065				     unsigned long long end)
2066{
2067	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2068	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2069
2070	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2071			  dma_to_mm_pfn(last_vpfn))) {
2072		printk(KERN_ERR "IOMMU: reserve iova failed\n");
2073		return -ENOMEM;
2074	}
2075
2076	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2077		 start, end, domain->id);
2078	/*
2079	 * The RMRR range might overlap with the physical memory range;
2080	 * clear it first.
2081	 */
2082	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2083
2084	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2085				  last_vpfn - first_vpfn + 1,
2086				  DMA_PTE_READ|DMA_PTE_WRITE);
2087}
2088
2089static int iommu_prepare_identity_map(struct pci_dev *pdev,
2090				      unsigned long long start,
2091				      unsigned long long end)
 
2092{
2093	struct dmar_domain *domain;
2094	int ret;
2095
2096	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2097	if (!domain)
2098		return -ENOMEM;
2099
2100	/* For _hardware_ passthrough, don't bother. But for software
2101	   passthrough, we do it anyway -- it may indicate a memory
2102	   range which is reserved in E820 and so didn't get set
2103	   up to start with in si_domain */
2104	if (domain == si_domain && hw_pass_through) {
2105		printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2106		       pci_name(pdev), start, end);
2107		return 0;
2108	}
2109
2110	printk(KERN_INFO
2111	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2112	       pci_name(pdev), start, end);
2113	
2114	if (end < start) {
2115		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2116			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2117			dmi_get_system_info(DMI_BIOS_VENDOR),
2118			dmi_get_system_info(DMI_BIOS_VERSION),
2119		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2120		ret = -EIO;
2121		goto error;
2122	}
2123
2124	if (end >> agaw_to_width(domain->agaw)) {
2125		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2126		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2127		     agaw_to_width(domain->agaw),
2128		     dmi_get_system_info(DMI_BIOS_VENDOR),
2129		     dmi_get_system_info(DMI_BIOS_VERSION),
2130		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2131		ret = -EIO;
2132		goto error;
2133	}
2134
2135	ret = iommu_domain_identity_map(domain, start, end);
2136	if (ret)
2137		goto error;
2138
2139	/* context entry init */
2140	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2141	if (ret)
2142		goto error;
 
 
2143
2144	return 0;
2145
2146 error:
2147	domain_exit(domain);
2148	return ret;
2149}
2150
2151static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2152	struct pci_dev *pdev)
2153{
2154	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2155		return 0;
2156	return iommu_prepare_identity_map(pdev, rmrr->base_address,
2157		rmrr->end_address);
2158}
2159
2160#ifdef CONFIG_DMAR_FLOPPY_WA
2161static inline void iommu_prepare_isa(void)
2162{
2163	struct pci_dev *pdev;
2164	int ret;
2165
2166	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2167	if (!pdev)
2168		return;
2169
2170	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2171	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2172
2173	if (ret)
2174		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2175		       "floppy might not work\n");
2176
 
2177}
2178#else
2179static inline void iommu_prepare_isa(void)
2180{
2181	return;
2182}
2183#endif /* !CONFIG_DMAR_FLOPPY_WA */
2184
2185static int md_domain_init(struct dmar_domain *domain, int guest_width);
2186
2187static int __init si_domain_work_fn(unsigned long start_pfn,
2188				    unsigned long end_pfn, void *datax)
2189{
2190	int *ret = datax;
2191
2192	*ret = iommu_domain_identity_map(si_domain,
2193					 (uint64_t)start_pfn << PAGE_SHIFT,
2194					 (uint64_t)end_pfn << PAGE_SHIFT);
2195	return *ret;
2196
2197}
2198
2199static int __init si_domain_init(int hw)
2200{
2201	struct dmar_drhd_unit *drhd;
2202	struct intel_iommu *iommu;
2203	int nid, ret = 0;
2204
2205	si_domain = alloc_domain();
2206	if (!si_domain)
2207		return -EFAULT;
2208
2209	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2210
2211	for_each_active_iommu(iommu, drhd) {
2212		ret = iommu_attach_domain(si_domain, iommu);
2213		if (ret) {
2214			domain_exit(si_domain);
2215			return -EFAULT;
2216		}
2217	}
2218
2219	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2220		domain_exit(si_domain);
2221		return -EFAULT;
2222	}
2223
2224	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2225
2226	if (hw)
2227		return 0;
2228
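	/*
	 * Hardware pass-through needs no page tables, so we returned above.
	 * For software identity mapping, populate 1:1 PTEs covering every
	 * active memory region on every online node.
	 */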
2229	for_each_online_node(nid) {
2230		work_with_active_regions(nid, si_domain_work_fn, &ret);
2231		if (ret)
2232			return ret;
2233	}
2234
2235	return 0;
2236}
2237
2238static void domain_remove_one_dev_info(struct dmar_domain *domain,
2239					  struct pci_dev *pdev);
2240static int identity_mapping(struct pci_dev *pdev)
2241{
2242	struct device_domain_info *info;
2243
2244	if (likely(!iommu_identity_mapping))
2245		return 0;
2246
2247	info = pdev->dev.archdata.iommu;
2248	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2249		return (info->domain == si_domain);
2250
2251	return 0;
2252}
2253
2254static int domain_add_dev_info(struct dmar_domain *domain,
2255			       struct pci_dev *pdev,
2256			       int translation)
2257{
2258	struct device_domain_info *info;
2259	unsigned long flags;
2260	int ret;
2261
2262	info = alloc_devinfo_mem();
2263	if (!info)
2264		return -ENOMEM;
2265
2266	ret = domain_context_mapping(domain, pdev, translation);
2267	if (ret) {
2268		free_devinfo_mem(info);
2269		return ret;
2270	}
2271
2272	info->segment = pci_domain_nr(pdev->bus);
2273	info->bus = pdev->bus->number;
2274	info->devfn = pdev->devfn;
2275	info->dev = pdev;
2276	info->domain = domain;
2277
2278	spin_lock_irqsave(&device_domain_lock, flags);
2279	list_add(&info->link, &domain->devices);
2280	list_add(&info->global, &device_domain_list);
2281	pdev->dev.archdata.iommu = info;
2282	spin_unlock_irqrestore(&device_domain_lock, flags);
2283
2284	return 0;
2285}
2286
2287static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2288{
2289	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2290		return 1;
2291
2292	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2293		return 1;
2294
2295	if (!(iommu_identity_mapping & IDENTMAP_ALL))
2296		return 0;
2297
2298	/*
2299	 * We want to start off with all devices in the 1:1 domain, and
2300	 * take them out later if we find they can't access all of memory.
2301	 *
2302	 * However, we can't do this for PCI devices behind bridges,
2303	 * because all PCI devices behind the same bridge will end up
2304	 * with the same source-id on their transactions.
2305	 *
2306	 * Practically speaking, we can't change things around for these
2307	 * devices at run-time, because we can't be sure there'll be no
2308	 * DMA transactions in flight for any of their siblings.
2309	 * 
2310	 * So PCI devices (unless they're on the root bus) as well as
2311	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2312	 * the 1:1 domain, just in _case_ one of their siblings turns out
2313	 * not to be able to map all of memory.
2314	 */
2315	if (!pci_is_pcie(pdev)) {
2316		if (!pci_is_root_bus(pdev->bus))
2317			return 0;
2318		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2319			return 0;
2320	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2321		return 0;
 
 
2322
2323	/* 
2324	 * At boot time, we don't yet know if devices will be 64-bit capable.
2325	 * Assume that they will -- if they turn out not to be, then we can 
2326	 * take them out of the 1:1 domain later.
2327	 */
2328	if (!startup) {
2329		/*
2330		 * If the device's dma_mask is less than the system's memory
2331		 * size then this is not a candidate for identity mapping.
2332		 */
2333		u64 dma_mask = pdev->dma_mask;
2334
2335		if (pdev->dev.coherent_dma_mask &&
2336		    pdev->dev.coherent_dma_mask < dma_mask)
2337			dma_mask = pdev->dev.coherent_dma_mask;
2338
2339		return dma_mask >= dma_get_required_mask(&pdev->dev);
2340	}
2341
2342	return 1;
2343}
2344
2345static int __init iommu_prepare_static_identity_mapping(int hw)
2346{
2347	struct pci_dev *pdev = NULL;
2348	int ret;
2349
2350	ret = si_domain_init(hw);
2351	if (ret)
2352		return -EFAULT;
2353
2354	for_each_pci_dev(pdev) {
2355		/* Skip Host/PCI Bridge devices */
2356		if (IS_BRIDGE_HOST_DEVICE(pdev))
2357			continue;
2358		if (iommu_should_identity_map(pdev, 1)) {
2359			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2360			       hw ? "hardware" : "software", pci_name(pdev));
2361
2362			ret = domain_add_dev_info(si_domain, pdev,
2363						     hw ? CONTEXT_TT_PASS_THROUGH :
2364						     CONTEXT_TT_MULTI_LEVEL);
2365			if (ret)
2366				return ret;
2367		}
2368	}
2369
2370	return 0;
2371}
2372
2373static int __init init_dmars(void)
2374{
2375	struct dmar_drhd_unit *drhd;
2376	struct dmar_rmrr_unit *rmrr;
2377	struct pci_dev *pdev;
 
2378	struct intel_iommu *iommu;
2379	int i, ret;
2380
2381	/*
2382	 * for each drhd
2383	 *    allocate root
2384	 *    initialize and program root entry to not present
2385	 * endfor
2386	 */
2387	for_each_drhd_unit(drhd) {
2388		g_num_of_iommus++;
2389		/*
2390		 * lock not needed as this is only incremented in the
2391		 * single-threaded kernel __init code path; all other
2392		 * accesses are read only
2393		 */
2394	}
2395
2396	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2397			GFP_KERNEL);
2398	if (!g_iommus) {
2399		printk(KERN_ERR "Allocating global iommu array failed\n");
2400		ret = -ENOMEM;
2401		goto error;
2402	}
2403
2404	deferred_flush = kzalloc(g_num_of_iommus *
2405		sizeof(struct deferred_flush_tables), GFP_KERNEL);
2406	if (!deferred_flush) {
2407		ret = -ENOMEM;
2408		goto error;
2409	}
2410
2411	for_each_drhd_unit(drhd) {
2412		if (drhd->ignored)
2413			continue;
2414
2415		iommu = drhd->iommu;
2416		g_iommus[iommu->seq_id] = iommu;
2417
 
 
2418		ret = iommu_init_domains(iommu);
2419		if (ret)
2420			goto error;
2421
2422		/*
2423		 * TBD:
2424		 * we could share the same root & context tables
2425		 * among all IOMMUs; need to split it later.
2426		 */
2427		ret = iommu_alloc_root_entry(iommu);
2428		if (ret) {
2429			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2430			goto error;
2431		}
 
2432		if (!ecap_pass_through(iommu->ecap))
2433			hw_pass_through = 0;
2434	}
2435
2436	/*
2437	 * Start from a sane iommu hardware state.
2438	 */
2439	for_each_drhd_unit(drhd) {
2440		if (drhd->ignored)
2441			continue;
2442
2443		iommu = drhd->iommu;
2444
2445		/*
2446		 * If the queued invalidation is already initialized by us
2447		 * (for example, while enabling interrupt-remapping) then
2448		 * things are already rolling from a sane state.
2449		 */
2450		if (iommu->qi)
2451			continue;
2452
2453		/*
2454		 * Clear any previous faults.
2455		 */
2456		dmar_fault(-1, iommu);
2457		/*
2458		 * Disable queued invalidation if supported and already enabled
2459		 * before OS handover.
2460		 */
2461		dmar_disable_qi(iommu);
2462	}
2463
2464	for_each_drhd_unit(drhd) {
2465		if (drhd->ignored)
2466			continue;
2467
2468		iommu = drhd->iommu;
2469
2470		if (dmar_enable_qi(iommu)) {
2471			/*
2472			 * Queued invalidation is not enabled; use register-based
2473			 * invalidation
2474			 */
2475			iommu->flush.flush_context = __iommu_flush_context;
2476			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2477			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2478			       "invalidation\n",
2479				iommu->seq_id,
2480			       (unsigned long long)drhd->reg_base_addr);
2481		} else {
2482			iommu->flush.flush_context = qi_flush_context;
2483			iommu->flush.flush_iotlb = qi_flush_iotlb;
2484			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2485			       "invalidation\n",
2486				iommu->seq_id,
2487			       (unsigned long long)drhd->reg_base_addr);
2488		}
2489	}
2490
2491	if (iommu_pass_through)
2492		iommu_identity_mapping |= IDENTMAP_ALL;
2493
2494#ifdef CONFIG_DMAR_BROKEN_GFX_WA
2495	iommu_identity_mapping |= IDENTMAP_GFX;
2496#endif
2497
2498	check_tylersburg_isoch();
2499
2500	/*
2501	 * If pass-through is not set or not enabled, set up context entries
2502	 * for identity mappings for rmrr, gfx and isa, possibly falling back
2503	 * to static identity mapping if iommu_identity_mapping is set.
2504	 */
2505	if (iommu_identity_mapping) {
2506		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2507		if (ret) {
2508			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2509			goto error;
2510		}
2511	}
2512	/*
2513	 * For each rmrr
2514	 *   for each dev attached to rmrr
2515	 *   do
2516	 *     locate drhd for dev, alloc domain for dev
2517	 *     allocate free domain
2518	 *     allocate page table entries for rmrr
2519	 *     if context not allocated for bus
2520	 *           allocate and init context
2521	 *           set present in root table for this bus
2522	 *     init context with domain, translation etc
2523	 *    endfor
2524	 * endfor
2525	 */
2526	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2527	for_each_rmrr_units(rmrr) {
2528		for (i = 0; i < rmrr->devices_cnt; i++) {
2529			pdev = rmrr->devices[i];
2530			/*
2531			 * some BIOSes list non-existent devices in the
2532			 * DMAR table.
2533			 */
2534			if (!pdev)
2535				continue;
2536			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2537			if (ret)
2538				printk(KERN_ERR
2539				       "IOMMU: mapping reserved region failed\n");
2540		}
2541	}
2542
2543	iommu_prepare_isa();
2544
 
 
2545	/*
2546	 * for each drhd
2547	 *   enable fault log
2548	 *   global invalidate context cache
2549	 *   global invalidate iotlb
2550	 *   enable translation
2551	 */
2552	for_each_drhd_unit(drhd) {
2553		if (drhd->ignored) {
2554			/*
2555			 * we always have to disable PMRs or DMA may fail on
2556			 * this device
2557			 */
2558			if (force_on)
2559				iommu_disable_protect_mem_regions(drhd->iommu);
2560			continue;
2561		}
2562		iommu = drhd->iommu;
2563
2564		iommu_flush_write_buffer(iommu);
2565
2566		ret = dmar_set_interrupt(iommu);
2567		if (ret)
2568			goto error;
2569
2570		iommu_set_root_entry(iommu);
2571
2572		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2573		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2574
2575		ret = iommu_enable_translation(iommu);
2576		if (ret)
2577			goto error;
2578
2579		iommu_disable_protect_mem_regions(iommu);
2580	}
2581
2582	return 0;
2583error:
2584	for_each_drhd_unit(drhd) {
2585		if (drhd->ignored)
2586			continue;
2587		iommu = drhd->iommu;
2588		free_iommu(iommu);
2589	}
 
2590	kfree(g_iommus);
 
 
2591	return ret;
2592}
2593
2594/* This takes a number of _MM_ pages, not VTD pages */
2595static struct iova *intel_alloc_iova(struct device *dev,
2596				     struct dmar_domain *domain,
2597				     unsigned long nrpages, uint64_t dma_mask)
2598{
2599	struct pci_dev *pdev = to_pci_dev(dev);
2600	struct iova *iova = NULL;
2601
2602	/* Restrict dma_mask to the width that the iommu can handle */
2603	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
 
 
2604
2605	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2606		/*
2607		 * First try to allocate an io virtual address in
2608		 * DMA_BIT_MASK(32) and if that fails then try allocating
2609		 * from higher range
2610		 */
2611		iova = alloc_iova(&domain->iovad, nrpages,
2612				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2613		if (iova)
2614			return iova;
2615	}
2616	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2617	if (unlikely(!iova)) {
2618		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2619		       nrpages, pci_name(pdev));
2620		return NULL;
 
2621	}
2622
2623	return iova;
2624}
2625
2626static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2627{
2628	struct dmar_domain *domain;
2629	int ret;
 
 
2630
2631	domain = get_domain_for_dev(pdev,
2632			DEFAULT_DOMAIN_ADDRESS_WIDTH);
2633	if (!domain) {
2634		printk(KERN_ERR
2635			"Allocating domain for %s failed", pci_name(pdev));
2636		return NULL;
2637	}
2638
2639	/* make sure context mapping is ok */
2640	if (unlikely(!domain_context_mapped(pdev))) {
2641		ret = domain_context_mapping(domain, pdev,
2642					     CONTEXT_TT_MULTI_LEVEL);
2643		if (ret) {
2644			printk(KERN_ERR
2645				"Domain context map for %s failed",
2646				pci_name(pdev));
2647			return NULL;
2648		}
2649	}
 
2650
2651	return domain;
2652}
 
 
 
2653
2654static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2655{
2656	struct device_domain_info *info;
2657
2658	/* No lock here, assumes no domain exit in normal case */
2659	info = dev->dev.archdata.iommu;
2660	if (likely(info))
2661		return info->domain;
2662
2663	return __get_valid_domain_for_dev(dev);
2664}
2665
2666static int iommu_dummy(struct pci_dev *pdev)
2667{
2668	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2669}
2670
2671/* Check if the pdev needs to go through the non-identity map/unmap process. */
2672static int iommu_no_mapping(struct device *dev)
2673{
2674	struct pci_dev *pdev;
2675	int found;
2676
2677	if (unlikely(dev->bus != &pci_bus_type))
2678		return 1;
2679
2680	pdev = to_pci_dev(dev);
2681	if (iommu_dummy(pdev))
2682		return 1;
2683
2684	if (!iommu_identity_mapping)
2685		return 0;
2686
2687	found = identity_mapping(pdev);
2688	if (found) {
2689		if (iommu_should_identity_map(pdev, 0))
2690			return 1;
2691		else {
2692			/*
2693			 * A 32 bit DMA device is removed from si_domain and falls
2694			 * back to non-identity mapping.
2695			 */
2696			domain_remove_one_dev_info(si_domain, pdev);
2697			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2698			       pci_name(pdev));
2699			return 0;
2700		}
2701	} else {
2702		/*
2703		 * In case a 64 bit DMA device is detached from a vm, the device
2704		 * is put back into si_domain for identity mapping.
2705		 */
2706		if (iommu_should_identity_map(pdev, 0)) {
2707			int ret;
2708			ret = domain_add_dev_info(si_domain, pdev,
2709						  hw_pass_through ?
2710						  CONTEXT_TT_PASS_THROUGH :
2711						  CONTEXT_TT_MULTI_LEVEL);
2712			if (!ret) {
2713				printk(KERN_INFO "64bit %s uses identity mapping\n",
2714				       pci_name(pdev));
2715				return 1;
2716			}
2717		}
2718	}
2719
2720	return 0;
2721}
2722
2723static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2724				     size_t size, int dir, u64 dma_mask)
2725{
2726	struct pci_dev *pdev = to_pci_dev(hwdev);
2727	struct dmar_domain *domain;
2728	phys_addr_t start_paddr;
2729	struct iova *iova;
2730	int prot = 0;
2731	int ret;
2732	struct intel_iommu *iommu;
2733	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2734
2735	BUG_ON(dir == DMA_NONE);
2736
2737	if (iommu_no_mapping(hwdev))
2738		return paddr;
2739
2740	domain = get_valid_domain_for_dev(pdev);
2741	if (!domain)
2742		return 0;
2743
2744	iommu = domain_get_iommu(domain);
2745	size = aligned_nrpages(paddr, size);
2746
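	/*
	 * 'size' is now a count of VTD_PAGE_SIZE pages; intel_alloc_iova()
	 * takes mm pages, hence the dma_to_mm_pfn() conversion below.
	 */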
2747	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2748	if (!iova)
2749		goto error;
2750
2751	/*
2752	 * Check if DMAR supports zero-length reads on write-only
2753	 * mappings.
2754	 */
2755	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2756			!cap_zlr(iommu->cap))
2757		prot |= DMA_PTE_READ;
2758	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2759		prot |= DMA_PTE_WRITE;
2760	/*
2761	 * paddr to (paddr + size) might be a partial page; we should map the
2762	 * whole page.  Note: if two parts of one page are mapped separately,
2763	 * we might have two guest addresses mapping to the same host paddr,
2764	 * but this is not a big problem.
2765	 */
2766	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2767				 mm_to_dma_pfn(paddr_pfn), size, prot);
2768	if (ret)
2769		goto error;
2770
2771	/* it's a non-present to present mapping. Only flush if caching mode */
2772	if (cap_caching_mode(iommu->cap))
2773		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
 
 
2774	else
2775		iommu_flush_write_buffer(iommu);
2776
2777	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2778	start_paddr += paddr & ~PAGE_MASK;
2779	return start_paddr;
2780
2781error:
2782	if (iova)
2783		__free_iova(&domain->iovad, iova);
2784	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2785		pci_name(pdev), size, (unsigned long long)paddr, dir);
2786	return 0;
2787}
2788
2789static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2790				 unsigned long offset, size_t size,
2791				 enum dma_data_direction dir,
2792				 struct dma_attrs *attrs)
2793{
2794	return __intel_map_single(dev, page_to_phys(page) + offset, size,
2795				  dir, to_pci_dev(dev)->dma_mask);
2796}
2797
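/*
 * Deferred unmap handling: freed IOVAs are queued per-IOMMU in
 * deferred_flush[] and released in batches by flush_unmaps().  Outside of
 * caching mode this costs one global IOTLB flush per IOMMU (plus device
 * IOTLB flushes); in caching mode page-selective flushes are cheaper, so
 * they are issued per entry instead.  The batch is drained either when
 * HIGH_WATER_MARK entries have accumulated or when the 10ms unmap_timer
 * fires.
 */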
2798static void flush_unmaps(void)
2799{
2800	int i, j;
2801
2802	timer_on = 0;
2803
2804	/* just flush them all */
2805	for (i = 0; i < g_num_of_iommus; i++) {
2806		struct intel_iommu *iommu = g_iommus[i];
2807		if (!iommu)
2808			continue;
2809
2810		if (!deferred_flush[i].next)
2811			continue;
2812
2813		/* In caching mode, global flushes make emulation expensive */
2814		if (!cap_caching_mode(iommu->cap))
2815			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2816					 DMA_TLB_GLOBAL_FLUSH);
2817		for (j = 0; j < deferred_flush[i].next; j++) {
2818			unsigned long mask;
2819			struct iova *iova = deferred_flush[i].iova[j];
2820			struct dmar_domain *domain = deferred_flush[i].domain[j];
2821
2822			/* On real hardware multiple invalidations are expensive */
2823			if (cap_caching_mode(iommu->cap))
2824				iommu_flush_iotlb_psi(iommu, domain->id,
2825				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2826			else {
2827				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2828				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2829						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2830			}
2831			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2832		}
2833		deferred_flush[i].next = 0;
2834	}
2835
2836	list_size = 0;
2837}
2838
2839static void flush_unmaps_timeout(unsigned long data)
2840{
2841	unsigned long flags;
2842
2843	spin_lock_irqsave(&async_umap_flush_lock, flags);
2844	flush_unmaps();
2845	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2846}
2847
2848static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2849{
2850	unsigned long flags;
2851	int next, iommu_id;
2852	struct intel_iommu *iommu;
2853
2854	spin_lock_irqsave(&async_umap_flush_lock, flags);
2855	if (list_size == HIGH_WATER_MARK)
2856		flush_unmaps();
2857
2858	iommu = domain_get_iommu(dom);
2859	iommu_id = iommu->seq_id;
2860
2861	next = deferred_flush[iommu_id].next;
2862	deferred_flush[iommu_id].domain[next] = dom;
2863	deferred_flush[iommu_id].iova[next] = iova;
2864	deferred_flush[iommu_id].next++;
2865
2866	if (!timer_on) {
2867		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2868		timer_on = 1;
2869	}
2870	list_size++;
2871	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2872}
2873
2874static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2875			     size_t size, enum dma_data_direction dir,
2876			     struct dma_attrs *attrs)
2877{
2878	struct pci_dev *pdev = to_pci_dev(dev);
2879	struct dmar_domain *domain;
2880	unsigned long start_pfn, last_pfn;
2881	struct iova *iova;
 
2882	struct intel_iommu *iommu;
 
2883
2884	if (iommu_no_mapping(dev))
2885		return;
2886
2887	domain = find_domain(pdev);
2888	BUG_ON(!domain);
2889
2890	iommu = domain_get_iommu(domain);
2891
2892	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2893	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2894		      (unsigned long long)dev_addr))
2895		return;
2896
2897	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2898	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
 
2899
2900	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2901		 pci_name(pdev), start_pfn, last_pfn);
2902
2903	/*  clear the whole page */
2904	dma_pte_clear_range(domain, start_pfn, last_pfn);
2905
2906	/* free page tables */
2907	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2908
2909	if (intel_iommu_strict) {
2910		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2911				      last_pfn - start_pfn + 1, 0);
2912		/* free iova */
2913		__free_iova(&domain->iovad, iova);
 
2914	} else {
2915		add_unmap(domain, iova);
 
2916		/*
2917		 * queue up the release of the unmap to save roughly 1/6th of
2918		 * the cpu time used up by the iotlb flush operation...
2919		 */
2920	}
2921}
2922
2923static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2924				  dma_addr_t *dma_handle, gfp_t flags)
 
2925{
2926	void *vaddr;
2927	int order;
2928
2929	size = PAGE_ALIGN(size);
2930	order = get_order(size);
 
 
 
2931
2932	if (!iommu_no_mapping(hwdev))
2933		flags &= ~(GFP_DMA | GFP_DMA32);
2934	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2935		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2936			flags |= GFP_DMA;
2937		else
2938			flags |= GFP_DMA32;
2939	}
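	/*
	 * When the device is remapped by the IOMMU, the backing pages can
	 * come from anywhere, so GFP_DMA/GFP_DMA32 are dropped; only
	 * identity-mapped devices need the allocation itself to respect
	 * their coherent DMA mask.
	 */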
2940
2941	vaddr = (void *)__get_free_pages(flags, order);
2942	if (!vaddr)
2943		return NULL;
2944	memset(vaddr, 0, size);
 
 
2945
2946	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2947					 DMA_BIDIRECTIONAL,
2948					 hwdev->coherent_dma_mask);
2949	if (*dma_handle)
2950		return vaddr;
2951	free_pages((unsigned long)vaddr, order);
2952	return NULL;
2953}
2954
2955static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2956				dma_addr_t dma_handle)
2957{
2958	int order;
2959
2960	size = PAGE_ALIGN(size);
2961	order = get_order(size);
2962
2963	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2964	free_pages((unsigned long)vaddr, order);
2965}
2966
2967static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2968			   int nelems, enum dma_data_direction dir,
2969			   struct dma_attrs *attrs)
2970{
2971	struct pci_dev *pdev = to_pci_dev(hwdev);
2972	struct dmar_domain *domain;
2973	unsigned long start_pfn, last_pfn;
2974	struct iova *iova;
2975	struct intel_iommu *iommu;
2976
2977	if (iommu_no_mapping(hwdev))
2978		return;
2979
2980	domain = find_domain(pdev);
2981	BUG_ON(!domain);
2982
2983	iommu = domain_get_iommu(domain);
2984
2985	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2986	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2987		      (unsigned long long)sglist[0].dma_address))
2988		return;
2989
2990	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2991	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2992
2993	/*  clear the whole page */
2994	dma_pte_clear_range(domain, start_pfn, last_pfn);
2995
2996	/* free page tables */
2997	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2998
2999	if (intel_iommu_strict) {
3000		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3001				      last_pfn - start_pfn + 1, 0);
3002		/* free iova */
3003		__free_iova(&domain->iovad, iova);
3004	} else {
3005		add_unmap(domain, iova);
3006		/*
3007		 * queue up the release of the unmap to save roughly 1/6th of
3008		 * the cpu time used up by the iotlb flush operation...
3009		 */
3010	}
 
 
3011}
3012
3013static int intel_nontranslate_map_sg(struct device *hddev,
3014	struct scatterlist *sglist, int nelems, int dir)
3015{
3016	int i;
3017	struct scatterlist *sg;
3018
3019	for_each_sg(sglist, sg, nelems, i) {
3020		BUG_ON(!sg_page(sg));
3021		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3022		sg->dma_length = sg->length;
3023	}
3024	return nelems;
3025}
3026
3027static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3028			enum dma_data_direction dir, struct dma_attrs *attrs)
3029{
3030	int i;
3031	struct pci_dev *pdev = to_pci_dev(hwdev);
3032	struct dmar_domain *domain;
3033	size_t size = 0;
3034	int prot = 0;
3035	struct iova *iova = NULL;
3036	int ret;
3037	struct scatterlist *sg;
3038	unsigned long start_vpfn;
3039	struct intel_iommu *iommu;
3040
3041	BUG_ON(dir == DMA_NONE);
3042	if (iommu_no_mapping(hwdev))
3043		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3044
3045	domain = get_valid_domain_for_dev(pdev);
3046	if (!domain)
3047		return 0;
3048
3049	iommu = domain_get_iommu(domain);
3050
3051	for_each_sg(sglist, sg, nelems, i)
3052		size += aligned_nrpages(sg->offset, sg->length);
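	/*
	 * A single contiguous IOVA range sized for the whole scatterlist is
	 * allocated below; __domain_mapping() then hands each sg entry its
	 * slice and fills in sg->dma_address/dma_length.
	 */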
3053
3054	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3055				pdev->dma_mask);
3056	if (!iova) {
3057		sglist->dma_length = 0;
3058		return 0;
3059	}
3060
3061	/*
3062	 * Check if DMAR supports zero-length reads on write-only
3063	 * mappings.
3064	 */
3065	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3066			!cap_zlr(iommu->cap))
3067		prot |= DMA_PTE_READ;
3068	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3069		prot |= DMA_PTE_WRITE;
3070
3071	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3072
3073	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3074	if (unlikely(ret)) {
3075		/*  clear the page */
3076		dma_pte_clear_range(domain, start_vpfn,
3077				    start_vpfn + size - 1);
3078		/* free page tables */
3079		dma_pte_free_pagetable(domain, start_vpfn,
3080				       start_vpfn + size - 1);
3081		/* free iova */
3082		__free_iova(&domain->iovad, iova);
3083		return 0;
3084	}
3085
3086	/* it's a non-present to present mapping. Only flush if caching mode */
3087	if (cap_caching_mode(iommu->cap))
3088		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3089	else
3090		iommu_flush_write_buffer(iommu);
3091
3092	return nelems;
3093}
3094
3095static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3096{
3097	return !dma_addr;
3098}
3099
3100struct dma_map_ops intel_dma_ops = {
3101	.alloc_coherent = intel_alloc_coherent,
3102	.free_coherent = intel_free_coherent,
3103	.map_sg = intel_map_sg,
3104	.unmap_sg = intel_unmap_sg,
3105	.map_page = intel_map_page,
3106	.unmap_page = intel_unmap_page,
3107	.mapping_error = intel_mapping_error,
 
 
 
3108};
3109
3110static inline int iommu_domain_cache_init(void)
3111{
3112	int ret = 0;
3113
3114	iommu_domain_cache = kmem_cache_create("iommu_domain",
3115					 sizeof(struct dmar_domain),
3116					 0,
3117					 SLAB_HWCACHE_ALIGN,
3118
3119					 NULL);
3120	if (!iommu_domain_cache) {
3121		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3122		ret = -ENOMEM;
3123	}
3124
3125	return ret;
3126}
3127
3128static inline int iommu_devinfo_cache_init(void)
3129{
3130	int ret = 0;
3131
3132	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3133					 sizeof(struct device_domain_info),
3134					 0,
3135					 SLAB_HWCACHE_ALIGN,
3136					 NULL);
3137	if (!iommu_devinfo_cache) {
3138		printk(KERN_ERR "Couldn't create devinfo cache\n");
3139		ret = -ENOMEM;
3140	}
3141
3142	return ret;
3143}
3144
3145static inline int iommu_iova_cache_init(void)
3146{
3147	int ret = 0;
3148
3149	iommu_iova_cache = kmem_cache_create("iommu_iova",
3150					 sizeof(struct iova),
3151					 0,
3152					 SLAB_HWCACHE_ALIGN,
3153					 NULL);
3154	if (!iommu_iova_cache) {
3155		printk(KERN_ERR "Couldn't create iova cache\n");
3156		ret = -ENOMEM;
3157	}
3158
3159	return ret;
3160}
3161
3162static int __init iommu_init_mempool(void)
3163{
3164	int ret;
3165	ret = iommu_iova_cache_init();
3166	if (ret)
3167		return ret;
3168
3169	ret = iommu_domain_cache_init();
3170	if (ret)
3171		goto domain_error;
3172
3173	ret = iommu_devinfo_cache_init();
3174	if (!ret)
3175		return ret;
3176
3177	kmem_cache_destroy(iommu_domain_cache);
3178domain_error:
3179	kmem_cache_destroy(iommu_iova_cache);
3180
3181	return -ENOMEM;
3182}
3183
3184static void __init iommu_exit_mempool(void)
3185{
3186	kmem_cache_destroy(iommu_devinfo_cache);
3187	kmem_cache_destroy(iommu_domain_cache);
3188	kmem_cache_destroy(iommu_iova_cache);
3189
3190}
3191
3192static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3193{
3194	struct dmar_drhd_unit *drhd;
3195	u32 vtbar;
3196	int rc;
3197
3198	/* We know that this device on this chipset has its own IOMMU.
3199	 * If we find it under a different IOMMU, then the BIOS is lying
3200	 * to us. Hope that the IOMMU for this device is actually
3201	 * disabled, and it needs no translation...
3202	 */
3203	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3204	if (rc) {
3205		/* "can't" happen */
3206		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3207		return;
3208	}
3209	vtbar &= 0xffff0000;
3210
3211	/* we know that this iommu should be at offset 0xa000 from vtbar */
3212	drhd = dmar_find_matched_drhd_unit(pdev);
3213	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3214			    TAINT_FIRMWARE_WORKAROUND,
3215			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3216		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3217}
3218DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3219
3220static void __init init_no_remapping_devices(void)
3221{
3222	struct dmar_drhd_unit *drhd;
 
 
3223
3224	for_each_drhd_unit(drhd) {
3225		if (!drhd->include_all) {
3226			int i;
3227			for (i = 0; i < drhd->devices_cnt; i++)
3228				if (drhd->devices[i] != NULL)
3229					break;
3230			/* ignore DMAR unit if no pci devices exist */
3231			if (i == drhd->devices_cnt)
3232				drhd->ignored = 1;
3233		}
3234	}
3235
3236	for_each_drhd_unit(drhd) {
3237		int i;
3238		if (drhd->ignored || drhd->include_all)
3239			continue;
3240
3241		for (i = 0; i < drhd->devices_cnt; i++)
3242			if (drhd->devices[i] &&
3243			    !IS_GFX_DEVICE(drhd->devices[i]))
3244				break;
3245
3246		if (i < drhd->devices_cnt)
3247			continue;
3248
3249		/* This IOMMU has *only* gfx devices. Either bypass it or
3250		   set the gfx_mapped flag, as appropriate */
3251		if (dmar_map_gfx) {
3252			intel_iommu_gfx_mapped = 1;
3253		} else {
3254			drhd->ignored = 1;
3255			for (i = 0; i < drhd->devices_cnt; i++) {
3256				if (!drhd->devices[i])
3257					continue;
3258				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259			}
3260		}
3261	}
3262}
3263
3264#ifdef CONFIG_SUSPEND
3265static int init_iommu_hw(void)
3266{
3267	struct dmar_drhd_unit *drhd;
3268	struct intel_iommu *iommu = NULL;
3269
3270	for_each_active_iommu(iommu, drhd)
3271		if (iommu->qi)
3272			dmar_reenable_qi(iommu);
3273
3274	for_each_iommu(iommu, drhd) {
3275		if (drhd->ignored) {
3276			/*
3277			 * we always have to disable PMRs or DMA may fail on
3278			 * this device
3279			 */
3280			if (force_on)
3281				iommu_disable_protect_mem_regions(iommu);
3282			continue;
3283		}
3284	
3285		iommu_flush_write_buffer(iommu);
3286
3287		iommu_set_root_entry(iommu);
3288
3289		iommu->flush.flush_context(iommu, 0, 0, 0,
3290					   DMA_CCMD_GLOBAL_INVL);
3291		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3292					 DMA_TLB_GLOBAL_FLUSH);
3293		if (iommu_enable_translation(iommu))
3294			return 1;
3295		iommu_disable_protect_mem_regions(iommu);
3296	}
3297
3298	return 0;
3299}
3300
3301static void iommu_flush_all(void)
3302{
3303	struct dmar_drhd_unit *drhd;
3304	struct intel_iommu *iommu;
3305
3306	for_each_active_iommu(iommu, drhd) {
3307		iommu->flush.flush_context(iommu, 0, 0, 0,
3308					   DMA_CCMD_GLOBAL_INVL);
3309		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3310					 DMA_TLB_GLOBAL_FLUSH);
3311	}
3312}
3313
3314static int iommu_suspend(void)
3315{
3316	struct dmar_drhd_unit *drhd;
3317	struct intel_iommu *iommu = NULL;
3318	unsigned long flag;
3319
3320	for_each_active_iommu(iommu, drhd) {
3321		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3322						 GFP_ATOMIC);
3323		if (!iommu->iommu_state)
3324			goto nomem;
3325	}
3326
3327	iommu_flush_all();
3328
3329	for_each_active_iommu(iommu, drhd) {
3330		iommu_disable_translation(iommu);
3331
3332		spin_lock_irqsave(&iommu->register_lock, flag);
3333
3334		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3335			readl(iommu->reg + DMAR_FECTL_REG);
3336		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3337			readl(iommu->reg + DMAR_FEDATA_REG);
3338		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3339			readl(iommu->reg + DMAR_FEADDR_REG);
3340		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3341			readl(iommu->reg + DMAR_FEUADDR_REG);
3342
3343		spin_unlock_irqrestore(&iommu->register_lock, flag);
3344	}
3345	return 0;
3346
3347nomem:
3348	for_each_active_iommu(iommu, drhd)
3349		kfree(iommu->iommu_state);
3350
3351	return -ENOMEM;
3352}
3353
3354static void iommu_resume(void)
3355{
3356	struct dmar_drhd_unit *drhd;
3357	struct intel_iommu *iommu = NULL;
3358	unsigned long flag;
3359
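	/*
	 * init_iommu_hw() re-programs the root entries and re-enables
	 * translation; only the fault event registers below are restored
	 * from the state saved in iommu_suspend().
	 */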
3360	if (init_iommu_hw()) {
3361		if (force_on)
3362			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3363		else
3364			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3365		return;
3366	}
3367
3368	for_each_active_iommu(iommu, drhd) {
3369
3370		spin_lock_irqsave(&iommu->register_lock, flag);
3371
3372		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3373			iommu->reg + DMAR_FECTL_REG);
3374		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3375			iommu->reg + DMAR_FEDATA_REG);
3376		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3377			iommu->reg + DMAR_FEADDR_REG);
3378		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3379			iommu->reg + DMAR_FEUADDR_REG);
3380
3381		spin_unlock_irqrestore(&iommu->register_lock, flag);
3382	}
3383
3384	for_each_active_iommu(iommu, drhd)
3385		kfree(iommu->iommu_state);
3386}
3387
3388static struct syscore_ops iommu_syscore_ops = {
3389	.resume		= iommu_resume,
3390	.suspend	= iommu_suspend,
3391};
3392
3393static void __init init_iommu_pm_ops(void)
3394{
3395	register_syscore_ops(&iommu_syscore_ops);
3396}
3397
3398#else
3399static inline void init_iommu_pm_ops(void) {}
3400#endif	/* CONFIG_PM */
3401
3402/*
3403 * Here we only respond to a device being unbound from its driver.
3404 *
3405 * A newly added device is not attached to its DMAR domain here yet. That will
3406 * happen when the device is first mapped to an iova.
3407 */
3408static int device_notifier(struct notifier_block *nb,
3409				  unsigned long action, void *data)
3410{
3411	struct device *dev = data;
3412	struct pci_dev *pdev = to_pci_dev(dev);
3413	struct dmar_domain *domain;
3414
3415	if (iommu_no_mapping(dev))
3416		return 0;
3417
3418	domain = find_domain(pdev);
3419	if (!domain)
3420		return 0;
3421
3422	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3423		domain_remove_one_dev_info(domain, pdev);
 
3424
3425		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3426		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3427		    list_empty(&domain->devices))
3428			domain_exit(domain);
3429	}
3430
3431	return 0;
3432}
3433
3434static struct notifier_block device_nb = {
3435	.notifier_call = device_notifier,
3436};
3437
3438int __init intel_iommu_init(void)
3439{
3440	int ret = 0;
 
 
3441
3442	/* VT-d is required for a TXT/tboot launch, so enforce that */
3443	force_on = tboot_force_iommu();
3444
3445	if (dmar_table_init()) {
3446		if (force_on)
3447			panic("tboot: Failed to initialize DMAR table\n");
3448		return 	-ENODEV;
3449	}
3450
3451	if (dmar_dev_scope_init()) {
3452		if (force_on)
3453			panic("tboot: Failed to initialize DMAR device scope\n");
3454		return 	-ENODEV;
3455	}
3456
 
 
3457	/*
3458	 * Check the need for DMA-remapping initialization now.
3459	 * Above initialization will also be used by Interrupt-remapping.
3460	 */
3461	if (no_iommu || dmar_disabled)
3462		return -ENODEV;
3463
3464	if (iommu_init_mempool()) {
3465		if (force_on)
3466			panic("tboot: Failed to initialize iommu memory\n");
3467		return 	-ENODEV;
3468	}
3469
3470	if (dmar_init_reserved_ranges()) {
3471		if (force_on)
3472			panic("tboot: Failed to reserve iommu ranges\n");
3473		return 	-ENODEV;
3474	}
3475
3476	init_no_remapping_devices();
3477
3478	ret = init_dmars();
3479	if (ret) {
3480		if (force_on)
3481			panic("tboot: Failed to initialize DMARs\n");
3482		printk(KERN_ERR "IOMMU: dmar init failed\n");
3483		put_iova_domain(&reserved_iova_list);
3484		iommu_exit_mempool();
3485		return ret;
3486	}
3487	printk(KERN_INFO
3488	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3489
3490	init_timer(&unmap_timer);
3491#ifdef CONFIG_SWIOTLB
3492	swiotlb = 0;
3493#endif
3494	dma_ops = &intel_dma_ops;
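	/*
	 * From here on all PCI DMA API calls go through intel_dma_ops;
	 * swiotlb bounce buffering is disabled above (when configured)
	 * since DMA remapping makes it unnecessary.
	 */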
3495
3496	init_iommu_pm_ops();
3497
3498	register_iommu(&intel_iommu_ops);
3499
 
3500	bus_register_notifier(&pci_bus_type, &device_nb);
3501
3502	return 0;
3503}
3504
3505static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3506					   struct pci_dev *pdev)
3507{
3508	struct pci_dev *tmp, *parent;
3509
3510	if (!iommu || !pdev)
3511		return;
3512
3513	/* dependent device detach */
3514	tmp = pci_find_upstream_pcie_bridge(pdev);
3515	/* Secondary interface's bus number and devfn 0 */
3516	if (tmp) {
3517		parent = pdev->bus->self;
3518		while (parent != tmp) {
3519			iommu_detach_dev(iommu, parent->bus->number,
3520					 parent->devfn);
3521			parent = parent->bus->self;
3522		}
3523		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3524			iommu_detach_dev(iommu,
3525				tmp->subordinate->number, 0);
3526		else /* this is a legacy PCI bridge */
3527			iommu_detach_dev(iommu, tmp->bus->number,
3528					 tmp->devfn);
3529	}
3530}
3531
3532static void domain_remove_one_dev_info(struct dmar_domain *domain,
3533					  struct pci_dev *pdev)
3534{
3535	struct device_domain_info *info;
3536	struct intel_iommu *iommu;
3537	unsigned long flags;
3538	int found = 0;
3539	struct list_head *entry, *tmp;
3540
3541	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3542				pdev->devfn);
3543	if (!iommu)
3544		return;
3545
3546	spin_lock_irqsave(&device_domain_lock, flags);
3547	list_for_each_safe(entry, tmp, &domain->devices) {
3548		info = list_entry(entry, struct device_domain_info, link);
3549		if (info->segment == pci_domain_nr(pdev->bus) &&
3550		    info->bus == pdev->bus->number &&
3551		    info->devfn == pdev->devfn) {
3552			list_del(&info->link);
3553			list_del(&info->global);
3554			if (info->dev)
3555				info->dev->dev.archdata.iommu = NULL;
3556			spin_unlock_irqrestore(&device_domain_lock, flags);
3557
3558			iommu_disable_dev_iotlb(info);
3559			iommu_detach_dev(iommu, info->bus, info->devfn);
3560			iommu_detach_dependent_devices(iommu, pdev);
3561			free_devinfo_mem(info);
3562
3563			spin_lock_irqsave(&device_domain_lock, flags);
3564
3565			if (found)
3566				break;
3567			else
3568				continue;
3569		}
3570
3571		/* if there are no other devices under the same iommu
3572		 * owned by this domain, clear this iommu in iommu_bmp and
3573		 * update the iommu count and coherency
3574		 */
3575		if (iommu == device_to_iommu(info->segment, info->bus,
3576					    info->devfn))
3577			found = 1;
3578	}
3579
3580	spin_unlock_irqrestore(&device_domain_lock, flags);
3581
3582	if (found == 0) {
3583		unsigned long tmp_flags;
3584		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3585		clear_bit(iommu->seq_id, &domain->iommu_bmp);
3586		domain->iommu_count--;
3587		domain_update_iommu_cap(domain);
3588		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3589
3590		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3591		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3592			spin_lock_irqsave(&iommu->lock, tmp_flags);
3593			clear_bit(domain->id, iommu->domain_ids);
3594			iommu->domains[domain->id] = NULL;
3595			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3596		}
3597	}
3598}
3599
3600static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
 
3601{
3602	struct device_domain_info *info;
3603	struct intel_iommu *iommu;
3604	unsigned long flags1, flags2;
3605
3606	spin_lock_irqsave(&device_domain_lock, flags1);
3607	while (!list_empty(&domain->devices)) {
3608		info = list_entry(domain->devices.next,
3609			struct device_domain_info, link);
3610		list_del(&info->link);
3611		list_del(&info->global);
3612		if (info->dev)
3613			info->dev->dev.archdata.iommu = NULL;
3614
3615		spin_unlock_irqrestore(&device_domain_lock, flags1);
3616
3617		iommu_disable_dev_iotlb(info);
3618		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3619		iommu_detach_dev(iommu, info->bus, info->devfn);
3620		iommu_detach_dependent_devices(iommu, info->dev);
3621
3622		/* clear this iommu in iommu_bmp, update iommu count
3623		 * and capabilities
3624		 */
3625		spin_lock_irqsave(&domain->iommu_lock, flags2);
3626		if (test_and_clear_bit(iommu->seq_id,
3627				       &domain->iommu_bmp)) {
3628			domain->iommu_count--;
3629			domain_update_iommu_cap(domain);
3630		}
3631		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3632
3633		free_devinfo_mem(info);
3634		spin_lock_irqsave(&device_domain_lock, flags1);
3635	}
3636	spin_unlock_irqrestore(&device_domain_lock, flags1);
3637}
3638
3639/* domain id for a virtual machine; it won't be set in a context entry */
3640static unsigned long vm_domid;
3641
3642static struct dmar_domain *iommu_alloc_vm_domain(void)
3643{
3644	struct dmar_domain *domain;
3645
3646	domain = alloc_domain_mem();
3647	if (!domain)
3648		return NULL;
3649
3650	domain->id = vm_domid++;
3651	domain->nid = -1;
3652	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3653	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3654
3655	return domain;
 
 
 
3656}
3657
3658static int md_domain_init(struct dmar_domain *domain, int guest_width)
3659{
3660	int adjust_width;
3661
3662	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3663	spin_lock_init(&domain->iommu_lock);
3664
3665	domain_reserve_special_ranges(domain);
3666
3667	/* calculate AGAW */
3668	domain->gaw = guest_width;
3669	adjust_width = guestwidth_to_adjustwidth(guest_width);
3670	domain->agaw = width_to_agaw(adjust_width);
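	/*
	 * e.g. the default 48-bit guest width normally works out to agaw 2,
	 * i.e. a 4-level page table.
	 */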
3671
3672	INIT_LIST_HEAD(&domain->devices);
3673
3674	domain->iommu_count = 0;
3675	domain->iommu_coherency = 0;
3676	domain->iommu_snooping = 0;
3677	domain->iommu_superpage = 0;
3678	domain->max_addr = 0;
3679	domain->nid = -1;
3680
3681	/* always allocate the top pgd */
3682	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3683	if (!domain->pgd)
3684		return -ENOMEM;
3685	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3686	return 0;
3687}
3688
3689static void iommu_free_vm_domain(struct dmar_domain *domain)
3690{
3691	unsigned long flags;
3692	struct dmar_drhd_unit *drhd;
3693	struct intel_iommu *iommu;
3694	unsigned long i;
3695	unsigned long ndomains;
3696
3697	for_each_drhd_unit(drhd) {
3698		if (drhd->ignored)
3699			continue;
3700		iommu = drhd->iommu;
3701
3702		ndomains = cap_ndoms(iommu->cap);
3703		for_each_set_bit(i, iommu->domain_ids, ndomains) {
3704			if (iommu->domains[i] == domain) {
3705				spin_lock_irqsave(&iommu->lock, flags);
3706				clear_bit(i, iommu->domain_ids);
3707				iommu->domains[i] = NULL;
3708				spin_unlock_irqrestore(&iommu->lock, flags);
3709				break;
3710			}
3711		}
3712	}
3713}
3714
3715static void vm_domain_exit(struct dmar_domain *domain)
3716{
3717	/* Domain 0 is reserved, so don't process it */
3718	if (!domain)
3719		return;
3720
3721	vm_domain_remove_all_dev_info(domain);
3722	/* destroy iovas */
3723	put_iova_domain(&domain->iovad);
3724
3725	/* clear ptes */
3726	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3727
3728	/* free page tables */
3729	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3730
3731	iommu_free_vm_domain(domain);
3732	free_domain_mem(domain);
3733}
3734
3735static int intel_iommu_domain_init(struct iommu_domain *domain)
3736{
3737	struct dmar_domain *dmar_domain;
3738
3739	dmar_domain = iommu_alloc_vm_domain();
3740	if (!dmar_domain) {
3741		printk(KERN_ERR
3742			"intel_iommu_domain_init: dmar_domain == NULL\n");
3743		return -ENOMEM;
3744	}
3745	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3746		printk(KERN_ERR
3747			"intel_iommu_domain_init() failed\n");
3748		vm_domain_exit(dmar_domain);
3749		return -ENOMEM;
3750	}
3751	domain_update_iommu_cap(dmar_domain);
3752	domain->priv = dmar_domain;
3753
3754	return 0;
3755}
3756
3757static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3758{
3759	struct dmar_domain *dmar_domain = domain->priv;
3760
3761	domain->priv = NULL;
3762	vm_domain_exit(dmar_domain);
3763}
3764
3765static int intel_iommu_attach_device(struct iommu_domain *domain,
3766				     struct device *dev)
3767{
3768	struct dmar_domain *dmar_domain = domain->priv;
3769	struct pci_dev *pdev = to_pci_dev(dev);
3770	struct intel_iommu *iommu;
3771	int addr_width;
3772
3773	/* normally pdev is not mapped */
3774	if (unlikely(domain_context_mapped(pdev))) {
3775		struct dmar_domain *old_domain;
3776
3777		old_domain = find_domain(pdev);
3778		if (old_domain) {
3779			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3780			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3781				domain_remove_one_dev_info(old_domain, pdev);
3782			else
3783				domain_remove_dev_info(old_domain);
3784		}
3785	}
3786
3787	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3788				pdev->devfn);
3789	if (!iommu)
3790		return -ENODEV;
3791
3792	/* check if this iommu agaw is sufficient for max mapped address */
3793	addr_width = agaw_to_width(iommu->agaw);
3794	if (addr_width > cap_mgaw(iommu->cap))
3795		addr_width = cap_mgaw(iommu->cap);
3796
3797	if (dmar_domain->max_addr > (1LL << addr_width)) {
3798		printk(KERN_ERR "%s: iommu width (%d) is not "
3799		       "sufficient for the mapped address (%llx)\n",
3800		       __func__, addr_width, dmar_domain->max_addr);
3801		return -EFAULT;
3802	}
3803	dmar_domain->gaw = addr_width;
3804
3805	/*
3806	 * Knock out extra levels of page tables if necessary
3807	 */
3808	while (iommu->agaw < dmar_domain->agaw) {
3809		struct dma_pte *pte;
3810
3811		pte = dmar_domain->pgd;
3812		if (dma_pte_present(pte)) {
3813			dmar_domain->pgd = (struct dma_pte *)
3814				phys_to_virt(dma_pte_addr(pte));
3815			free_pgtable_page(pte);
3816		}
3817		dmar_domain->agaw--;
3818	}
3819
3820	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3821}
3822
3823static void intel_iommu_detach_device(struct iommu_domain *domain,
3824				      struct device *dev)
3825{
3826	struct dmar_domain *dmar_domain = domain->priv;
3827	struct pci_dev *pdev = to_pci_dev(dev);
3828
3829	domain_remove_one_dev_info(dmar_domain, pdev);
3830}
3831
3832static int intel_iommu_map(struct iommu_domain *domain,
3833			   unsigned long iova, phys_addr_t hpa,
3834			   int gfp_order, int iommu_prot)
3835{
3836	struct dmar_domain *dmar_domain = domain->priv;
3837	u64 max_addr;
3838	int prot = 0;
3839	size_t size;
3840	int ret;
3841
3842	if (iommu_prot & IOMMU_READ)
3843		prot |= DMA_PTE_READ;
3844	if (iommu_prot & IOMMU_WRITE)
3845		prot |= DMA_PTE_WRITE;
3846	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3847		prot |= DMA_PTE_SNP;
3848
3849	size     = PAGE_SIZE << gfp_order;
3850	max_addr = iova + size;
3851	if (dmar_domain->max_addr < max_addr) {
3852		u64 end;
3853
3854		/* check if minimum agaw is sufficient for mapped address */
3855		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3856		if (end < max_addr) {
3857			printk(KERN_ERR "%s: iommu width (%d) is not "
3858			       "sufficient for the mapped address (%llx)\n",
3859			       __func__, dmar_domain->gaw, max_addr);
3860			return -EFAULT;
3861		}
3862		dmar_domain->max_addr = max_addr;
3863	}
3864	/* Round up size to next multiple of PAGE_SIZE, if it and
3865	   the low bits of hpa would take us onto the next page */
3866	size = aligned_nrpages(hpa, size);
3867	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3868				 hpa >> VTD_PAGE_SHIFT, size, prot);
3869	return ret;
3870}
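/*
 * Worked example (editor's addition, not part of the original file): a call
 * with gfp_order = 9 and 4KiB CPU pages maps size = PAGE_SIZE << 9 = 2MiB
 * starting at 'iova'.  If iova + 2MiB exceeds the domain's current max_addr,
 * the code above first checks that it still fits below
 * __DOMAIN_MAX_ADDR(gaw) + 1 before raising max_addr; with the default
 * 48-bit domain width that limit is 2^48 bytes (the attach path may have
 * lowered gaw to what the IOMMU actually supports).
 */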
3871
3872static int intel_iommu_unmap(struct iommu_domain *domain,
3873			     unsigned long iova, int gfp_order)
3874{
3875	struct dmar_domain *dmar_domain = domain->priv;
3876	size_t size = PAGE_SIZE << gfp_order;
3877	int order;
3878
3879	order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3880			    (iova + size - 1) >> VTD_PAGE_SHIFT);
3881
3882	if (dmar_domain->max_addr == iova + size)
3883		dmar_domain->max_addr = iova;
3884
3885	return order;
3886}
3887
3888static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3889					    unsigned long iova)
3890{
3891	struct dmar_domain *dmar_domain = domain->priv;
3892	struct dma_pte *pte;
 
3893	u64 phys = 0;
3894
3895	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
3896	if (pte)
3897		phys = dma_pte_addr(pte);
3898
3899	return phys;
3900}
3901
3902static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3903				      unsigned long cap)
3904{
3905	struct dmar_domain *dmar_domain = domain->priv;
3906
3907	if (cap == IOMMU_CAP_CACHE_COHERENCY)
3908		return dmar_domain->iommu_snooping;
3909	if (cap == IOMMU_CAP_INTR_REMAP)
3910		return intr_remapping_enabled;
3911
3912	return 0;
3913}
3914
3915static struct iommu_ops intel_iommu_ops = {
3916	.domain_init	= intel_iommu_domain_init,
3917	.domain_destroy = intel_iommu_domain_destroy,
3918	.attach_dev	= intel_iommu_attach_device,
3919	.detach_dev	= intel_iommu_detach_device,
3920	.map		= intel_iommu_map,
3921	.unmap		= intel_iommu_unmap,
3922	.iova_to_phys	= intel_iommu_iova_to_phys,
3923	.domain_has_cap = intel_iommu_domain_has_cap,
3924};
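/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how a caller such as KVM or VFIO reaches the callbacks above through the
 * generic IOMMU API of roughly this kernel generation, which still takes a
 * page order (gfp_order) rather than a byte size.  The prototypes used here
 * are assumptions based on <linux/iommu.h> of this era (already included at
 * the top of this file); check the header of the tree you actually build.
 */
static int example_map_one_page(struct device *dev, unsigned long iova,
				phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc();		/* -> intel_iommu_domain_init() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);	/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	/* gfp_order 0 == a single 4KiB page; -> intel_iommu_map() */
	ret = iommu_map(domain, iova, paddr, 0, IOMMU_READ | IOMMU_WRITE);
	if (!ret) {
		/* -> intel_iommu_iova_to_phys(), then intel_iommu_unmap() */
		WARN_ON(iommu_iova_to_phys(domain, iova) != (paddr & VTD_PAGE_MASK));
		iommu_unmap(domain, iova, 0);
	}

	iommu_detach_device(domain, dev);	/* -> intel_iommu_detach_device() */
out_free:
	iommu_domain_free(domain);		/* -> intel_iommu_domain_destroy() */
	return ret;
}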
3925
3926static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3927{
3928	/*
3929	 * Mobile 4 Series Chipset neglects to set RWBF capability,
3930	 * but needs it:
3931	 */
3932	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3933	rwbf_quirk = 1;
3934
3935	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3936	if (dev->revision == 0x07) {
3937		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3938		dmar_map_gfx = 0;
3939	}
3940}
3941
3942DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3943
3944#define GGC 0x52
3945#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
3946#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
3947#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
3948#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
3949#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
3950#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
3951#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
3952#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
3953
3954static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3955{
3956	unsigned short ggc;
3957
3958	if (pci_read_config_word(dev, GGC, &ggc))
3959		return;
3960
3961	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3962		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3963		dmar_map_gfx = 0;
3964	} else if (dmar_map_gfx) {
3965		/* we have to ensure the gfx device is idle before we flush */
3966		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
3967		intel_iommu_strict = 1;
3968	}
3969}
3970DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3971DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3972DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3973DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3974
3975/* On Tylersburg chipsets, some BIOSes have been known to enable the
3976   ISOCH DMAR unit for the Azalia sound device, but not give it any
3977   TLB entries, which causes it to deadlock. Check for that.  We do
3978   this in a function called from init_dmars(), instead of in a PCI
3979   quirk, because we don't want to print the obnoxious "BIOS broken"
3980   message if VT-d is actually disabled.
3981*/
3982static void __init check_tylersburg_isoch(void)
3983{
3984	struct pci_dev *pdev;
3985	uint32_t vtisochctrl;
3986
3987	/* If there's no Azalia in the system anyway, forget it. */
3988	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3989	if (!pdev)
3990		return;
3991	pci_dev_put(pdev);
3992
3993	/* System Management Registers. Might be hidden, in which case
3994	   we can't do the sanity check. But that's OK, because the
3995	   known-broken BIOSes _don't_ actually hide it, so far. */
3996	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3997	if (!pdev)
3998		return;
3999
4000	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4001		pci_dev_put(pdev);
4002		return;
4003	}
4004
4005	pci_dev_put(pdev);
4006
4007	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4008	if (vtisochctrl & 1)
4009		return;
4010
4011	/* Drop all bits other than the number of TLB entries */
4012	vtisochctrl &= 0x1c;
4013
4014	/* If we have the recommended number of TLB entries (16), fine. */
4015	if (vtisochctrl == 0x10)
4016		return;
4017
4018	/* Zero TLB entries? You get to ride the short bus to school. */
4019	if (!vtisochctrl) {
4020		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4021		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4022		     dmi_get_system_info(DMI_BIOS_VENDOR),
4023		     dmi_get_system_info(DMI_BIOS_VERSION),
4024		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4025		iommu_identity_mapping |= IDENTMAP_AZALIA;
4026		return;
4027	}
4028	
4029	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4030	       vtisochctrl);
4031}
v4.17
   1/*
   2 * Copyright © 2006-2014 Intel Corporation.
   3 *
   4 * This program is free software; you can redistribute it and/or modify it
   5 * under the terms and conditions of the GNU General Public License,
   6 * version 2, as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope it will be useful, but WITHOUT
   9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  10 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  11 * more details.
  12 *
  13 * Authors: David Woodhouse <dwmw2@infradead.org>,
  14 *          Ashok Raj <ashok.raj@intel.com>,
  15 *          Shaohua Li <shaohua.li@intel.com>,
  16 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
  17 *          Fenghua Yu <fenghua.yu@intel.com>
  18 *          Joerg Roedel <jroedel@suse.de>
  19 */
  20
  21#define pr_fmt(fmt)     "DMAR: " fmt
  22
  23#include <linux/init.h>
  24#include <linux/bitmap.h>
  25#include <linux/debugfs.h>
  26#include <linux/export.h>
  27#include <linux/slab.h>
  28#include <linux/irq.h>
  29#include <linux/interrupt.h>
  30#include <linux/spinlock.h>
  31#include <linux/pci.h>
  32#include <linux/dmar.h>
  33#include <linux/dma-mapping.h>
  34#include <linux/dma-direct.h>
  35#include <linux/mempool.h>
  36#include <linux/memory.h>
  37#include <linux/cpu.h>
  38#include <linux/timer.h>
  39#include <linux/io.h>
  40#include <linux/iova.h>
  41#include <linux/iommu.h>
  42#include <linux/intel-iommu.h>
  43#include <linux/syscore_ops.h>
  44#include <linux/tboot.h>
  45#include <linux/dmi.h>
  46#include <linux/pci-ats.h>
  47#include <linux/memblock.h>
  48#include <linux/dma-contiguous.h>
  49#include <linux/dma-direct.h>
  50#include <linux/crash_dump.h>
  51#include <asm/irq_remapping.h>
  52#include <asm/cacheflush.h>
  53#include <asm/iommu.h>
  54
  55#include "irq_remapping.h"
  56
  57#define ROOT_SIZE		VTD_PAGE_SIZE
  58#define CONTEXT_SIZE		VTD_PAGE_SIZE
  59
  60#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  61#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
  62#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  63#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
  64
  65#define IOAPIC_RANGE_START	(0xfee00000)
  66#define IOAPIC_RANGE_END	(0xfeefffff)
  67#define IOVA_START_ADDR		(0x1000)
  68
  69#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
  70
  71#define MAX_AGAW_WIDTH 64
  72#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
  73
  74#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
  75#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
  76
  77/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
  78   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
  79#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
  80				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
  81#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
  82
  83/* IO virtual address start page frame number */
  84#define IOVA_START_PFN		(1)
  85
  86#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
  87
  88/* page table handling */
  89#define LEVEL_STRIDE		(9)
  90#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
  91
  92/*
  93 * This bitmap is used to advertise the page sizes our hardware support
  94 * to the IOMMU core, which will then use this information to split
  95 * physically contiguous memory regions it is mapping into page sizes
  96 * that we support.
  97 *
  98 * Traditionally the IOMMU core just handed us the mappings directly,
  99 * after making sure the size is an order of a 4KiB page and that the
 100 * mapping has natural alignment.
 101 *
 102 * To retain this behavior, we currently advertise that we support
 103 * all page sizes that are an order of 4KiB.
 104 *
 105 * If at some point we'd like to utilize the IOMMU core's new behavior,
 106 * we could change this to advertise the real page sizes we support.
 107 */
 108#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
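/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * in the pgsize bitmap each set bit k advertises support for a page size of
 * 2^k bytes, so ~0xFFFUL sets bit 12 (4KiB), bit 21 (2MiB), bit 30 (1GiB)
 * and every other power of two of at least 4KiB.  A hypothetical helper for
 * checking a candidate size against the bitmap:
 */
static inline bool intel_iommu_pgsize_supported(unsigned long size)
{
	/* power of two and present in the advertised bitmap */
	return size && !(size & (size - 1)) && (size & INTEL_IOMMU_PGSIZES);
}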
 109
 110static inline int agaw_to_level(int agaw)
 111{
 112	return agaw + 2;
 113}
 114
 115static inline int agaw_to_width(int agaw)
 116{
 117	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
 118}
 119
 120static inline int width_to_agaw(int width)
 121{
 122	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
 123}
 124
 125static inline unsigned int level_to_offset_bits(int level)
 126{
 127	return (level - 1) * LEVEL_STRIDE;
 128}
 129
 130static inline int pfn_level_offset(unsigned long pfn, int level)
 131{
 132	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
 133}
 134
 135static inline unsigned long level_mask(int level)
 136{
 137	return -1UL << level_to_offset_bits(level);
 138}
 139
 140static inline unsigned long level_size(int level)
 141{
 142	return 1UL << level_to_offset_bits(level);
 143}
 144
 145static inline unsigned long align_to_level(unsigned long pfn, int level)
 146{
 147	return (pfn + level_size(level) - 1) & level_mask(level);
 148}
 149
 150static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
 151{
 152	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
 153}
 154
 155/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
 156   are never going to work. */
 157static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
 158{
 159	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
 160}
 161
 162static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
 163{
 164	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
 165}
 166static inline unsigned long page_to_dma_pfn(struct page *pg)
 167{
 168	return mm_to_dma_pfn(page_to_pfn(pg));
 169}
 170static inline unsigned long virt_to_dma_pfn(void *p)
 171{
 172	return page_to_dma_pfn(virt_to_page(p));
 173}
 174
 175/* global iommu list, set NULL for ignored DMAR units */
 176static struct intel_iommu **g_iommus;
 177
 178static void __init check_tylersburg_isoch(void);
 179static int rwbf_quirk;
 180
 181/*
 182 * set to 1 to panic kernel if can't successfully enable VT-d
 183 * (used when kernel is launched w/ TXT)
 184 */
 185static int force_on = 0;
 186int intel_iommu_tboot_noforce;
 187
 188/*
 189 * 0: Present
 190 * 1-11: Reserved
 191 * 12-63: Context Ptr (12 - (haw-1))
 192 * 64-127: Reserved
 193 */
 194struct root_entry {
 195	u64	lo;
 196	u64	hi;
 197};
 198#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 199
 200/*
 201 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 202 * if marked present.
 203 */
 204static phys_addr_t root_entry_lctp(struct root_entry *re)
 205{
 206	if (!(re->lo & 1))
 207		return 0;
 208
 209	return re->lo & VTD_PAGE_MASK;
 210}
 211
 212/*
 213 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 214 * if marked present.
 215 */
 216static phys_addr_t root_entry_uctp(struct root_entry *re)
 217{
 218	if (!(re->hi & 1))
 219		return 0;
 220
 221	return re->hi & VTD_PAGE_MASK;
 222}
 223/*
 224 * low 64 bits:
 225 * 0: present
 226 * 1: fault processing disable
 227 * 2-3: translation type
 228 * 12-63: address space root
 229 * high 64 bits:
 230 * 0-2: address width
 231 * 3-6: aval
 232 * 8-23: domain id
 233 */
 234struct context_entry {
 235	u64 lo;
 236	u64 hi;
 237};
 238
 239static inline void context_clear_pasid_enable(struct context_entry *context)
 240{
 241	context->lo &= ~(1ULL << 11);
 242}
 243
 244static inline bool context_pasid_enabled(struct context_entry *context)
 245{
 246	return !!(context->lo & (1ULL << 11));
 247}
 248
 249static inline void context_set_copied(struct context_entry *context)
 250{
 251	context->hi |= (1ull << 3);
 252}
 253
 254static inline bool context_copied(struct context_entry *context)
 255{
 256	return !!(context->hi & (1ULL << 3));
 257}
 258
 259static inline bool __context_present(struct context_entry *context)
 260{
 261	return (context->lo & 1);
 262}
 263
 264static inline bool context_present(struct context_entry *context)
 265{
 266	return context_pasid_enabled(context) ?
 267	     __context_present(context) :
 268	     __context_present(context) && !context_copied(context);
 269}
 270
 271static inline void context_set_present(struct context_entry *context)
 272{
 273	context->lo |= 1;
 274}
 275
 276static inline void context_set_fault_enable(struct context_entry *context)
 277{
 278	context->lo &= (((u64)-1) << 2) | 1;
 279}
 280
 281static inline void context_set_translation_type(struct context_entry *context,
 282						unsigned long value)
 283{
 284	context->lo &= (((u64)-1) << 4) | 3;
 285	context->lo |= (value & 3) << 2;
 286}
 287
 288static inline void context_set_address_root(struct context_entry *context,
 289					    unsigned long value)
 290{
 291	context->lo &= ~VTD_PAGE_MASK;
 292	context->lo |= value & VTD_PAGE_MASK;
 293}
 294
 295static inline void context_set_address_width(struct context_entry *context,
 296					     unsigned long value)
 297{
 298	context->hi |= value & 7;
 299}
 300
 301static inline void context_set_domain_id(struct context_entry *context,
 302					 unsigned long value)
 303{
 304	context->hi |= (value & ((1 << 16) - 1)) << 8;
 305}
 306
 307static inline int context_domain_id(struct context_entry *c)
 308{
 309	return((c->hi >> 8) & 0xffff);
 310}
 311
 312static inline void context_clear_entry(struct context_entry *context)
 313{
 314	context->lo = 0;
 315	context->hi = 0;
 316}
 317
 318/*
 319 * 0: readable
 320 * 1: writable
 321 * 2-6: reserved
 322 * 7: super page
 323 * 8-10: available
 324 * 11: snoop behavior
 325 * 12-63: Host physical address
 326 */
 327struct dma_pte {
 328	u64 val;
 329};
 330
 331static inline void dma_clear_pte(struct dma_pte *pte)
 332{
 333	pte->val = 0;
 334}
 335
 336static inline u64 dma_pte_addr(struct dma_pte *pte)
 337{
 338#ifdef CONFIG_64BIT
 339	return pte->val & VTD_PAGE_MASK;
 340#else
 341	/* Must have a full atomic 64-bit read */
 342	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
 343#endif
 344}
 345
 346static inline bool dma_pte_present(struct dma_pte *pte)
 347{
 348	return (pte->val & 3) != 0;
 349}
 350
 351static inline bool dma_pte_superpage(struct dma_pte *pte)
 352{
 353	return (pte->val & DMA_PTE_LARGE_PAGE);
 354}
 355
 356static inline int first_pte_in_page(struct dma_pte *pte)
 357{
 358	return !((unsigned long)pte & ~VTD_PAGE_MASK);
 359}
 360
 361/*
 362 * This domain is a static identity-mapping domain.
 363 *	1. This domain creates a static 1:1 mapping to all usable memory.
 364 * 	2. It maps to each iommu if successful.
 365 *	3. Each iommu maps to this domain if successful.
 366 */
 367static struct dmar_domain *si_domain;
 368static int hw_pass_through = 1;
 369
 370/*
 371 * Domain represents a virtual machine; more than one device
 372 * across iommus may be owned by one domain, e.g. a KVM guest.
 373 */
 374#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 0)
 375
 376/* si_domain contains multiple devices */
 377#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 1)
 378
 379#define for_each_domain_iommu(idx, domain)			\
 380	for (idx = 0; idx < g_num_of_iommus; idx++)		\
 381		if (domain->iommu_refcnt[idx])
 382
 383struct dmar_domain {
 
 384	int	nid;			/* node id */
 
 385
 386	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
 387					/* Refcount of devices per iommu */
 388
 389
 390	u16		iommu_did[DMAR_UNITS_SUPPORTED];
 391					/* Domain ids per IOMMU. Use u16 since
 392					 * domain ids are 16 bit wide according
 393					 * to VT-d spec, section 9.3 */
 394
 395	bool has_iotlb_device;
 396	struct list_head devices;	/* all devices' list */
 397	struct iova_domain iovad;	/* iova's that belong to this domain */
 398
 399	struct dma_pte	*pgd;		/* virtual address */
 400	int		gaw;		/* max guest address width */
 401
 402	/* adjusted guest address width, 0 is level 2 30-bit */
 403	int		agaw;
 404
 405	int		flags;		/* flags to find out type of domain */
 406
 407	int		iommu_coherency;/* indicate coherency of iommu access */
 408	int		iommu_snooping; /* indicate snooping control feature*/
 409	int		iommu_count;	/* reference count of iommu */
 410	int		iommu_superpage;/* Level of superpages supported:
 411					   0 == 4KiB (no superpages), 1 == 2MiB,
 412					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
 
 413	u64		max_addr;	/* maximum mapped address */
 414
 415	struct iommu_domain domain;	/* generic domain data structure for
 416					   iommu core */
 417};
 418
 419/* PCI domain-device relationship */
 420struct device_domain_info {
 421	struct list_head link;	/* link to domain siblings */
 422	struct list_head global; /* link to global list */
 
 423	u8 bus;			/* PCI bus number */
 424	u8 devfn;		/* PCI devfn number */
 425	u8 pasid_supported:3;
 426	u8 pasid_enabled:1;
 427	u8 pri_supported:1;
 428	u8 pri_enabled:1;
 429	u8 ats_supported:1;
 430	u8 ats_enabled:1;
 431	u8 ats_qdep;
 432	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
 433	struct intel_iommu *iommu; /* IOMMU used by this device */
 434	struct dmar_domain *domain; /* pointer to domain */
 435};
 436
 437struct dmar_rmrr_unit {
 438	struct list_head list;		/* list of rmrr units	*/
 439	struct acpi_dmar_header *hdr;	/* ACPI header		*/
 440	u64	base_address;		/* reserved base address*/
 441	u64	end_address;		/* reserved end address */
 442	struct dmar_dev_scope *devices;	/* target devices */
 443	int	devices_cnt;		/* target device count */
 444	struct iommu_resv_region *resv; /* reserved region handle */
 445};
 446
 447struct dmar_atsr_unit {
 448	struct list_head list;		/* list of ATSR units */
 449	struct acpi_dmar_header *hdr;	/* ACPI header */
 450	struct dmar_dev_scope *devices;	/* target devices */
 451	int devices_cnt;		/* target device count */
 452	u8 include_all:1;		/* include all ports */
 453};
 454
 455static LIST_HEAD(dmar_atsr_units);
 456static LIST_HEAD(dmar_rmrr_units);
 457
 458#define for_each_rmrr_units(rmrr) \
 459	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
 460
 461/* bitmap for indexing intel_iommus */
 462static int g_num_of_iommus;
 463
 464static void domain_exit(struct dmar_domain *domain);
 465static void domain_remove_dev_info(struct dmar_domain *domain);
 466static void dmar_remove_one_dev_info(struct dmar_domain *domain,
 467				     struct device *dev);
 468static void __dmar_remove_one_dev_info(struct device_domain_info *info);
 469static void domain_context_clear(struct intel_iommu *iommu,
 470				 struct device *dev);
 471static int domain_detach_iommu(struct dmar_domain *domain,
 472			       struct intel_iommu *iommu);
 473
 474#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
 475int dmar_disabled = 0;
 476#else
 477int dmar_disabled = 1;
 478#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
 479
 480int intel_iommu_enabled = 0;
 481EXPORT_SYMBOL_GPL(intel_iommu_enabled);
 482
 483static int dmar_map_gfx = 1;
 484static int dmar_forcedac;
 485static int intel_iommu_strict;
 486static int intel_iommu_superpage = 1;
 487static int intel_iommu_ecs = 1;
 488static int intel_iommu_pasid28;
 489static int iommu_identity_mapping;
 490
 491#define IDENTMAP_ALL		1
 492#define IDENTMAP_GFX		2
 493#define IDENTMAP_AZALIA		4
 494
 495/* Broadwell and Skylake have broken ECS support — normal so-called "second
 496 * level" translation of DMA requests-without-PASID doesn't actually happen
 497 * unless you also set the NESTE bit in an extended context-entry. Which of
 498 * course means that SVM doesn't work because it's trying to do nested
 499 * translation of the physical addresses it finds in the process page tables,
 500 * through the IOVA->phys mapping found in the "second level" page tables.
 501 *
 502 * The VT-d specification was retroactively changed to change the definition
 503 * of the capability bits and pretend that Broadwell/Skylake never happened...
 504 * but unfortunately the wrong bit was changed. It's ECS which is broken, but
 505 * for some reason it was the PASID capability bit which was redefined (from
 506 * bit 28 on BDW/SKL to bit 40 in future).
 507 *
 508 * So our test for ECS needs to eschew those implementations which set the old
 509 * PASID capability bit 28, since those are the ones on which ECS is broken.
 510 * Unless we are working around the 'pasid28' limitations, that is, by putting
 511 * the device into passthrough mode for normal DMA and thus masking the bug.
 512 */
 513#define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
 514			    (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
 515/* PASID support is thus enabled if ECS is enabled and *either* of the old
 516 * or new capability bits are set. */
 517#define pasid_enabled(iommu) (ecs_enabled(iommu) &&			\
 518			      (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
 519
 520int intel_iommu_gfx_mapped;
 521EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
 522
 523#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
 524static DEFINE_SPINLOCK(device_domain_lock);
 525static LIST_HEAD(device_domain_list);
 526
 527const struct iommu_ops intel_iommu_ops;
 528
 529static bool translation_pre_enabled(struct intel_iommu *iommu)
 530{
 531	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
 532}
 533
 534static void clear_translation_pre_enabled(struct intel_iommu *iommu)
 535{
 536	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
 537}
 538
 539static void init_translation_status(struct intel_iommu *iommu)
 540{
 541	u32 gsts;
 542
 543	gsts = readl(iommu->reg + DMAR_GSTS_REG);
 544	if (gsts & DMA_GSTS_TES)
 545		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
 546}
 547
 548/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
 549static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
 550{
 551	return container_of(dom, struct dmar_domain, domain);
 552}
 553
 554static int __init intel_iommu_setup(char *str)
 555{
 556	if (!str)
 557		return -EINVAL;
 558	while (*str) {
 559		if (!strncmp(str, "on", 2)) {
 560			dmar_disabled = 0;
 561			pr_info("IOMMU enabled\n");
 562		} else if (!strncmp(str, "off", 3)) {
 563			dmar_disabled = 1;
 564			pr_info("IOMMU disabled\n");
 565		} else if (!strncmp(str, "igfx_off", 8)) {
 566			dmar_map_gfx = 0;
 567			pr_info("Disable GFX device mapping\n");
 
 568		} else if (!strncmp(str, "forcedac", 8)) {
 569			pr_info("Forcing DAC for PCI devices\n");
 
 570			dmar_forcedac = 1;
 571		} else if (!strncmp(str, "strict", 6)) {
 572			pr_info("Disable batched IOTLB flush\n");
 
 573			intel_iommu_strict = 1;
 574		} else if (!strncmp(str, "sp_off", 6)) {
 575			pr_info("Disable supported super page\n");
 
 576			intel_iommu_superpage = 0;
 577		} else if (!strncmp(str, "ecs_off", 7)) {
 578			printk(KERN_INFO
 579				"Intel-IOMMU: disable extended context table support\n");
 580			intel_iommu_ecs = 0;
 581		} else if (!strncmp(str, "pasid28", 7)) {
 582			printk(KERN_INFO
 583				"Intel-IOMMU: enable pre-production PASID support\n");
 584			intel_iommu_pasid28 = 1;
 585			iommu_identity_mapping |= IDENTMAP_GFX;
 586		} else if (!strncmp(str, "tboot_noforce", 13)) {
 587			printk(KERN_INFO
 588				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
 589			intel_iommu_tboot_noforce = 1;
 590		}
 591
 592		str += strcspn(str, ",");
 593		while (*str == ',')
 594			str++;
 595	}
 596	return 0;
 597}
 598__setup("intel_iommu=", intel_iommu_setup);
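/*
 * Example usage (editor's addition, not part of the original file): the
 * parser above takes a comma separated list on the kernel command line,
 * for instance
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which force-enables the IOMMU, disables batched IOTLB flushing and turns
 * off superpage support in a single option.
 */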
 599
 600static struct kmem_cache *iommu_domain_cache;
 601static struct kmem_cache *iommu_devinfo_cache;
 602
 603static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
 604{
 605	struct dmar_domain **domains;
 606	int idx = did >> 8;
 607
 608	domains = iommu->domains[idx];
 609	if (!domains)
 610		return NULL;
 611
 612	return domains[did & 0xff];
 613}
 614
 615static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
 616			     struct dmar_domain *domain)
 617{
 618	struct dmar_domain **domains;
 619	int idx = did >> 8;
 620
 621	if (!iommu->domains[idx]) {
 622		size_t size = 256 * sizeof(struct dmar_domain *);
 623		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
 624	}
 625
 626	domains = iommu->domains[idx];
 627	if (WARN_ON(!domains))
 628		return;
 629	else
 630		domains[did & 0xff] = domain;
 631}
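/*
 * Worked example (editor's addition, not part of the original file): the
 * domain-id table is two levels of 256 entries each.  Domain id 0x1234 is
 * split into page index 0x1234 >> 8 = 0x12 and slot 0x1234 & 0xff = 0x34,
 * so get_iommu_domain() returns iommu->domains[0x12][0x34], and
 * set_iommu_domain() allocates iommu->domains[0x12] on first use.
 */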
 632
 633static inline void *alloc_pgtable_page(int node)
 634{
 635	struct page *page;
 636	void *vaddr = NULL;
 637
 638	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
 639	if (page)
 640		vaddr = page_address(page);
 641	return vaddr;
 642}
 643
 644static inline void free_pgtable_page(void *vaddr)
 645{
 646	free_page((unsigned long)vaddr);
 647}
 648
 649static inline void *alloc_domain_mem(void)
 650{
 651	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
 652}
 653
 654static void free_domain_mem(void *vaddr)
 655{
 656	kmem_cache_free(iommu_domain_cache, vaddr);
 657}
 658
 659static inline void * alloc_devinfo_mem(void)
 660{
 661	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
 662}
 663
 664static inline void free_devinfo_mem(void *vaddr)
 665{
 666	kmem_cache_free(iommu_devinfo_cache, vaddr);
 667}
 668
 669static inline int domain_type_is_vm(struct dmar_domain *domain)
 670{
 671	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
 672}
 673
 674static inline int domain_type_is_si(struct dmar_domain *domain)
 675{
 676	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
 677}
 678
 679static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
 680{
 681	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
 682				DOMAIN_FLAG_STATIC_IDENTITY);
 683}
 684
 685static inline int domain_pfn_supported(struct dmar_domain *domain,
 686				       unsigned long pfn)
 687{
 688	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
 689
 690	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
 691}
 692
 693static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
 694{
 695	unsigned long sagaw;
 696	int agaw = -1;
 697
 698	sagaw = cap_sagaw(iommu->cap);
 699	for (agaw = width_to_agaw(max_gaw);
 700	     agaw >= 0; agaw--) {
 701		if (test_bit(agaw, &sagaw))
 702			break;
 703	}
 704
 705	return agaw;
 706}
 707
 708/*
 709 * Calculate max SAGAW for each iommu.
 710 */
 711int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
 712{
 713	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
 714}
 715
 716/*
 717 * Calculate agaw for each iommu.
 718 * "SAGAW" may differ across iommus; use a default agaw, and fall back
 719 * to a smaller supported agaw for iommus that don't support the default.
 720 */
 721int iommu_calculate_agaw(struct intel_iommu *iommu)
 722{
 723	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
 724}
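/*
 * Worked example (editor's addition, not part of the original file): with
 * the default domain width of 57 bits, width_to_agaw(57) =
 * DIV_ROUND_UP(57 - 30, 9) = 3, i.e. 5-level tables (agaw_to_level(3) == 5).
 * If bit 3 of the hardware's SAGAW field is clear, __iommu_calculate_agaw()
 * falls back to agaw 2 (48-bit, 4-level) and then agaw 1 (39-bit, 3-level).
 */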
 725
 726/* This function only returns a single iommu in a domain */
 727static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
 728{
 729	int iommu_id;
 730
 731	/* si_domain and vm domain should not get here. */
 732	BUG_ON(domain_type_is_vm_or_si(domain));
 733	for_each_domain_iommu(iommu_id, domain)
 734		break;
 735
 
 736	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
 737		return NULL;
 738
 739	return g_iommus[iommu_id];
 740}
 741
 742static void domain_update_iommu_coherency(struct dmar_domain *domain)
 743{
 744	struct dmar_drhd_unit *drhd;
 745	struct intel_iommu *iommu;
 746	bool found = false;
 747	int i;
 748
 749	domain->iommu_coherency = 1;
 750
 751	for_each_domain_iommu(i, domain) {
 752		found = true;
 753		if (!ecap_coherent(g_iommus[i]->ecap)) {
 754			domain->iommu_coherency = 0;
 755			break;
 756		}
 757	}
 758	if (found)
 759		return;
 760
 761	/* No hardware attached; use lowest common denominator */
 762	rcu_read_lock();
 763	for_each_active_iommu(iommu, drhd) {
 764		if (!ecap_coherent(iommu->ecap)) {
 765			domain->iommu_coherency = 0;
 766			break;
 767		}
 768	}
 769	rcu_read_unlock();
 770}
 771
 772static int domain_update_iommu_snooping(struct intel_iommu *skip)
 773{
 774	struct dmar_drhd_unit *drhd;
 775	struct intel_iommu *iommu;
 776	int ret = 1;
 777
 778	rcu_read_lock();
 779	for_each_active_iommu(iommu, drhd) {
 780		if (iommu != skip) {
 781			if (!ecap_sc_support(iommu->ecap)) {
 782				ret = 0;
 783				break;
 784			}
 785		}
 786	}
 787	rcu_read_unlock();
 788
 789	return ret;
 790}
 791
 792static int domain_update_iommu_superpage(struct intel_iommu *skip)
 793{
 794	struct dmar_drhd_unit *drhd;
 795	struct intel_iommu *iommu;
 796	int mask = 0xf;
 797
 798	if (!intel_iommu_superpage) {
 799		return 0;
 
 800	}
 801
 802	/* set iommu_superpage to the smallest common denominator */
 803	rcu_read_lock();
 804	for_each_active_iommu(iommu, drhd) {
 805		if (iommu != skip) {
 806			mask &= cap_super_page_val(iommu->cap);
 807			if (!mask)
 808				break;
 809		}
 810	}
 811	rcu_read_unlock();
 812
 813	return fls(mask);
 814}
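/*
 * Worked example (editor's addition, not part of the original file):
 * cap_super_page_val() reports 2MiB support in bit 0 and 1GiB support in
 * bit 1.  If one active IOMMU reports 0x3 (2MiB and 1GiB) while another
 * reports 0x1 (2MiB only), the loop above leaves mask == 0x1 and
 * fls(0x1) == 1, so the domain is limited to 2MiB superpages
 * (iommu_superpage == 1, matching the comment in struct dmar_domain).
 */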
 815
 816/* Some capabilities may be different across iommus */
 817static void domain_update_iommu_cap(struct dmar_domain *domain)
 818{
 819	domain_update_iommu_coherency(domain);
 820	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
 821	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
 822}
 823
 824static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
 825						       u8 bus, u8 devfn, int alloc)
 826{
 827	struct root_entry *root = &iommu->root_entry[bus];
 828	struct context_entry *context;
 829	u64 *entry;
 830
 831	entry = &root->lo;
 832	if (ecs_enabled(iommu)) {
 833		if (devfn >= 0x80) {
 834			devfn -= 0x80;
 835			entry = &root->hi;
 836		}
 837		devfn *= 2;
 838	}
 839	if (*entry & 1)
 840		context = phys_to_virt(*entry & VTD_PAGE_MASK);
 841	else {
 842		unsigned long phy_addr;
 843		if (!alloc)
 844			return NULL;
 845
 846		context = alloc_pgtable_page(iommu->node);
 847		if (!context)
 848			return NULL;
 849
 850		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
 851		phy_addr = virt_to_phys((void *)context);
 852		*entry = phy_addr | 1;
 853		__iommu_flush_cache(iommu, entry, sizeof(*entry));
 854	}
 855	return &context[devfn];
 856}
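/*
 * Worked example (editor's addition, not part of the original file): with
 * extended context support the 256 context entries of a bus are split
 * across the two halves of the root entry.  For devfn 0x85 the code above
 * selects root->hi, rebases to 0x05 and doubles it, returning
 * &context[0x0a] of the upper table; devfn 0x05 stays on root->lo and also
 * lands on &context[0x0a], but of the lower table.  Without ECS the devfn
 * indexes a single 256-entry table behind root->lo unchanged.
 */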
 857
 858static int iommu_dummy(struct device *dev)
 859{
 860	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
 861}
 862
 863static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
 864{
 865	struct dmar_drhd_unit *drhd = NULL;
 866	struct intel_iommu *iommu;
 867	struct device *tmp;
 868	struct pci_dev *ptmp, *pdev = NULL;
 869	u16 segment = 0;
 870	int i;
 871
 872	if (iommu_dummy(dev))
 873		return NULL;
 874
 875	if (dev_is_pci(dev)) {
 876		struct pci_dev *pf_pdev;
 877
 878		pdev = to_pci_dev(dev);
 879
 880#ifdef CONFIG_X86
 881		/* VMD child devices currently cannot be handled individually */
 882		if (is_vmd(pdev->bus))
 883			return NULL;
 884#endif
 885
 886		/* VFs aren't listed in scope tables; we need to look up
 887		 * the PF instead to find the IOMMU. */
 888		pf_pdev = pci_physfn(pdev);
 889		dev = &pf_pdev->dev;
 890		segment = pci_domain_nr(pdev->bus);
 891	} else if (has_acpi_companion(dev))
 892		dev = &ACPI_COMPANION(dev)->dev;
 893
 894	rcu_read_lock();
 895	for_each_active_iommu(iommu, drhd) {
 896		if (pdev && segment != drhd->segment)
 897			continue;
 898
 899		for_each_active_dev_scope(drhd->devices,
 900					  drhd->devices_cnt, i, tmp) {
 901			if (tmp == dev) {
 902				/* For a VF use its original BDF# not that of the PF
 903				 * which we used for the IOMMU lookup. Strictly speaking
 904				 * we could do this for all PCI devices; we only need to
 905				 * get the BDF# from the scope table for ACPI matches. */
 906				if (pdev && pdev->is_virtfn)
 907					goto got_pdev;
 908
 909				*bus = drhd->devices[i].bus;
 910				*devfn = drhd->devices[i].devfn;
 911				goto out;
 912			}
 913
 914			if (!pdev || !dev_is_pci(tmp))
 915				continue;
 916
 917			ptmp = to_pci_dev(tmp);
 918			if (ptmp->subordinate &&
 919			    ptmp->subordinate->number <= pdev->bus->number &&
 920			    ptmp->subordinate->busn_res.end >= pdev->bus->number)
 921				goto got_pdev;
 922		}
 923
 924		if (pdev && drhd->include_all) {
 925		got_pdev:
 926			*bus = pdev->bus->number;
 927			*devfn = pdev->devfn;
 928			goto out;
 929		}
 930	}
 931	iommu = NULL;
 932 out:
 933	rcu_read_unlock();
 934
 935	return iommu;
 936}
 937
 938static void domain_flush_cache(struct dmar_domain *domain,
 939			       void *addr, int size)
 940{
 941	if (!domain->iommu_coherency)
 942		clflush_cache_range(addr, size);
 943}
 944
 945static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
 946{
 
 947	struct context_entry *context;
 948	int ret = 0;
 949	unsigned long flags;
 950
 951	spin_lock_irqsave(&iommu->lock, flags);
 952	context = iommu_context_addr(iommu, bus, devfn, 0);
 953	if (context)
 954		ret = context_present(context);
 955	spin_unlock_irqrestore(&iommu->lock, flags);
 956	return ret;
 957}
 958
 959static void free_context_table(struct intel_iommu *iommu)
 960{
 
 961	int i;
 962	unsigned long flags;
 963	struct context_entry *context;
 964
 965	spin_lock_irqsave(&iommu->lock, flags);
 966	if (!iommu->root_entry) {
 967		goto out;
 968	}
 969	for (i = 0; i < ROOT_ENTRY_NR; i++) {
 970		context = iommu_context_addr(iommu, i, 0, 0);
 
 971		if (context)
 972			free_pgtable_page(context);
 973
 974		if (!ecs_enabled(iommu))
 975			continue;
 976
 977		context = iommu_context_addr(iommu, i, 0x80, 0);
 978		if (context)
 979			free_pgtable_page(context);
 980
 981	}
 982	free_pgtable_page(iommu->root_entry);
 983	iommu->root_entry = NULL;
 984out:
 985	spin_unlock_irqrestore(&iommu->lock, flags);
 986}
 987
 988static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
 989				      unsigned long pfn, int *target_level)
 990{
 
 991	struct dma_pte *parent, *pte = NULL;
 992	int level = agaw_to_level(domain->agaw);
 993	int offset;
 994
 995	BUG_ON(!domain->pgd);
 996
 997	if (!domain_pfn_supported(domain, pfn))
 998		/* Address beyond IOMMU's addressing capabilities. */
 999		return NULL;
1000
1001	parent = domain->pgd;
1002
1003	while (1) {
1004		void *tmp_page;
1005
1006		offset = pfn_level_offset(pfn, level);
1007		pte = &parent[offset];
1008		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1009			break;
1010		if (level == *target_level)
1011			break;
1012
1013		if (!dma_pte_present(pte)) {
1014			uint64_t pteval;
1015
1016			tmp_page = alloc_pgtable_page(domain->nid);
1017
1018			if (!tmp_page)
1019				return NULL;
1020
1021			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1022			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1023			if (cmpxchg64(&pte->val, 0ULL, pteval))
1024				/* Someone else set it while we were thinking; use theirs. */
1025				free_pgtable_page(tmp_page);
1026			else
1027				domain_flush_cache(domain, pte, sizeof(*pte));
1028		}
1029		if (level == 1)
1030			break;
1031
1032		parent = phys_to_virt(dma_pte_addr(pte));
1033		level--;
1034	}
1035
1036	if (!*target_level)
1037		*target_level = level;
1038
1039	return pte;
1040}
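/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * how the target_level argument is used.  Passing a non-zero level makes
 * the walk allocate intermediate tables down to that level; passing 0 only
 * looks up whatever entry already terminates the walk (a leaf, a superpage
 * or a non-present entry) and reports its level back to the caller.
 */
static struct dma_pte *example_lookup_existing(struct dmar_domain *domain,
					       unsigned long pfn, int *levelp)
{
	int level = 0;		/* don't allocate, just walk */
	struct dma_pte *pte = pfn_to_dma_pte(domain, pfn, &level);

	*levelp = level;	/* 1 == 4KiB PTE, 2 == 2MiB superpage, ... */
	return pte;
}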
1041
1042
1043/* return address's pte at specific level */
1044static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1045					 unsigned long pfn,
1046					 int level, int *large_page)
1047{
1048	struct dma_pte *parent, *pte = NULL;
1049	int total = agaw_to_level(domain->agaw);
1050	int offset;
1051
1052	parent = domain->pgd;
1053	while (level <= total) {
1054		offset = pfn_level_offset(pfn, total);
1055		pte = &parent[offset];
1056		if (level == total)
1057			return pte;
1058
1059		if (!dma_pte_present(pte)) {
1060			*large_page = total;
1061			break;
1062		}
1063
1064		if (dma_pte_superpage(pte)) {
1065			*large_page = total;
1066			return pte;
1067		}
1068
1069		parent = phys_to_virt(dma_pte_addr(pte));
1070		total--;
1071	}
1072	return NULL;
1073}
1074
1075/* clear last level pte, a tlb flush should be followed */
1076static void dma_pte_clear_range(struct dmar_domain *domain,
1077				unsigned long start_pfn,
1078				unsigned long last_pfn)
1079{
 
1080	unsigned int large_page = 1;
1081	struct dma_pte *first_pte, *pte;
 
1082
1083	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1084	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1085	BUG_ON(start_pfn > last_pfn);
1086
1087	/* we don't need lock here; nobody else touches the iova range */
1088	do {
1089		large_page = 1;
1090		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1091		if (!pte) {
1092			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1093			continue;
1094		}
1095		do {
1096			dma_clear_pte(pte);
1097			start_pfn += lvl_to_nr_pages(large_page);
1098			pte++;
1099		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1100
1101		domain_flush_cache(domain, first_pte,
1102				   (void *)pte - (void *)first_pte);
1103
1104	} while (start_pfn && start_pfn <= last_pfn);
1105}
1106
1107static void dma_pte_free_level(struct dmar_domain *domain, int level,
1108			       int retain_level, struct dma_pte *pte,
1109			       unsigned long pfn, unsigned long start_pfn,
1110			       unsigned long last_pfn)
1111{
1112	pfn = max(start_pfn, pfn);
1113	pte = &pte[pfn_level_offset(pfn, level)];
1114
1115	do {
1116		unsigned long level_pfn;
1117		struct dma_pte *level_pte;
1118
1119		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1120			goto next;
1121
1122		level_pfn = pfn & level_mask(level);
1123		level_pte = phys_to_virt(dma_pte_addr(pte));
1124
1125		if (level > 2) {
1126			dma_pte_free_level(domain, level - 1, retain_level,
1127					   level_pte, level_pfn, start_pfn,
1128					   last_pfn);
1129		}
1130
1131		/*
1132		 * Free the page table if we're below the level we want to
1133		 * retain and the range covers the entire table.
1134		 */
1135		if (level < retain_level && !(start_pfn > level_pfn ||
1136		      last_pfn < level_pfn + level_size(level) - 1)) {
1137			dma_clear_pte(pte);
1138			domain_flush_cache(domain, pte, sizeof(*pte));
1139			free_pgtable_page(level_pte);
1140		}
1141next:
1142		pfn += level_size(level);
1143	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1144}
1145
1146/*
1147 * clear last level (leaf) ptes and free page table pages below the
1148 * level we wish to keep intact.
1149 */
1150static void dma_pte_free_pagetable(struct dmar_domain *domain,
1151				   unsigned long start_pfn,
1152				   unsigned long last_pfn,
1153				   int retain_level)
1154{
1155	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1156	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1157	BUG_ON(start_pfn > last_pfn);
1158
1159	dma_pte_clear_range(domain, start_pfn, last_pfn);
1160
1161	/* We don't need lock here; nobody else touches the iova range */
1162	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1163			   domain->pgd, 0, start_pfn, last_pfn);
1164
1165	/* free pgd */
1166	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1167		free_pgtable_page(domain->pgd);
1168		domain->pgd = NULL;
1169	}
1170}
1171
1172/* When a page at a given level is being unlinked from its parent, we don't
1173   need to *modify* it at all. All we need to do is make a list of all the
1174   pages which can be freed just as soon as we've flushed the IOTLB and we
1175   know the hardware page-walk will no longer touch them.
1176   The 'pte' argument is the *parent* PTE, pointing to the page that is to
1177   be freed. */
1178static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1179					    int level, struct dma_pte *pte,
1180					    struct page *freelist)
1181{
1182	struct page *pg;
1183
1184	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1185	pg->freelist = freelist;
1186	freelist = pg;
1187
1188	if (level == 1)
1189		return freelist;
1190
1191	pte = page_address(pg);
1192	do {
1193		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1194			freelist = dma_pte_list_pagetables(domain, level - 1,
1195							   pte, freelist);
1196		pte++;
1197	} while (!first_pte_in_page(pte));
1198
1199	return freelist;
1200}
1201
1202static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1203					struct dma_pte *pte, unsigned long pfn,
1204					unsigned long start_pfn,
1205					unsigned long last_pfn,
1206					struct page *freelist)
1207{
1208	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1209
1210	pfn = max(start_pfn, pfn);
1211	pte = &pte[pfn_level_offset(pfn, level)];
1212
1213	do {
1214		unsigned long level_pfn;
1215
1216		if (!dma_pte_present(pte))
1217			goto next;
1218
1219		level_pfn = pfn & level_mask(level);
1220
1221		/* If range covers entire pagetable, free it */
1222		if (start_pfn <= level_pfn &&
1223		    last_pfn >= level_pfn + level_size(level) - 1) {
1224			/* These subordinate page tables are going away entirely. Don't
1225			   bother to clear them; we're just going to *free* them. */
1226			if (level > 1 && !dma_pte_superpage(pte))
1227				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1228
1229			dma_clear_pte(pte);
1230			if (!first_pte)
1231				first_pte = pte;
1232			last_pte = pte;
1233		} else if (level > 1) {
1234			/* Recurse down into a level that isn't *entirely* obsolete */
1235			freelist = dma_pte_clear_level(domain, level - 1,
1236						       phys_to_virt(dma_pte_addr(pte)),
1237						       level_pfn, start_pfn, last_pfn,
1238						       freelist);
1239		}
1240next:
1241		pfn += level_size(level);
1242	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1243
1244	if (first_pte)
1245		domain_flush_cache(domain, first_pte,
1246				   (void *)++last_pte - (void *)first_pte);
1247
1248	return freelist;
1249}
1250
1251/* We can't just free the pages because the IOMMU may still be walking
1252   the page tables, and may have cached the intermediate levels. The
1253   pages can only be freed after the IOTLB flush has been done. */
1254static struct page *domain_unmap(struct dmar_domain *domain,
1255				 unsigned long start_pfn,
1256				 unsigned long last_pfn)
1257{
1258	struct page *freelist = NULL;
1259
1260	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1261	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1262	BUG_ON(start_pfn > last_pfn);
1263
1264	/* we don't need lock here; nobody else touches the iova range */
1265	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1266				       domain->pgd, 0, start_pfn, last_pfn, NULL);
1267
1268	/* free pgd */
1269	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1270		struct page *pgd_page = virt_to_page(domain->pgd);
1271		pgd_page->freelist = freelist;
1272		freelist = pgd_page;
1273
1274		domain->pgd = NULL;
1275	}
1276
1277	return freelist;
1278}
1279
1280static void dma_free_pagelist(struct page *freelist)
1281{
1282	struct page *pg;
1283
1284	while ((pg = freelist)) {
1285		freelist = pg->freelist;
1286		free_pgtable_page(page_address(pg));
1287	}
1288}
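/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the deferred-free pattern described above.  Page-table pages collected by
 * domain_unmap() must outlive the hardware's cached walks, so the freelist
 * is only handed to dma_free_pagelist() once every IOMMU serving the domain
 * has had its IOTLB flushed (a domain-selective flush is used here for
 * brevity).
 */
static void example_unmap_and_free(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	struct page *freelist = domain_unmap(domain, start_pfn, last_pfn);
	int idx;

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];

		iommu->flush.flush_iotlb(iommu, domain->iommu_did[iommu->seq_id],
					 0, 0, DMA_TLB_DSI_FLUSH);
	}

	dma_free_pagelist(freelist);
}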
1289
1290static void iova_entry_free(unsigned long data)
1291{
1292	struct page *freelist = (struct page *)data;
1293
1294	dma_free_pagelist(freelist);
1295}
1296
1297/* iommu handling */
1298static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1299{
1300	struct root_entry *root;
1301	unsigned long flags;
1302
1303	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1304	if (!root) {
1305		pr_err("Allocating root entry for %s failed\n",
1306			iommu->name);
1307		return -ENOMEM;
1308	}
1309
1310	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1311
1312	spin_lock_irqsave(&iommu->lock, flags);
1313	iommu->root_entry = root;
1314	spin_unlock_irqrestore(&iommu->lock, flags);
1315
1316	return 0;
1317}
1318
1319static void iommu_set_root_entry(struct intel_iommu *iommu)
1320{
1321	u64 addr;
1322	u32 sts;
1323	unsigned long flag;
1324
1325	addr = virt_to_phys(iommu->root_entry);
1326	if (ecs_enabled(iommu))
1327		addr |= DMA_RTADDR_RTT;
1328
1329	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1330	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1331
1332	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1333
1334	/* Make sure hardware complete it */
1335	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1336		      readl, (sts & DMA_GSTS_RTPS), sts);
1337
1338	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1339}
1340
1341static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1342{
1343	u32 val;
1344	unsigned long flag;
1345
1346	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1347		return;
1348
1349	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1350	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1351
1352	/* Make sure hardware complete it */
1353	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1354		      readl, (!(val & DMA_GSTS_WBFS)), val);
1355
1356	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1357}
1358
1359/* return value determines if we need a write buffer flush */
1360static void __iommu_flush_context(struct intel_iommu *iommu,
1361				  u16 did, u16 source_id, u8 function_mask,
1362				  u64 type)
1363{
1364	u64 val = 0;
1365	unsigned long flag;
1366
1367	switch (type) {
1368	case DMA_CCMD_GLOBAL_INVL:
1369		val = DMA_CCMD_GLOBAL_INVL;
1370		break;
1371	case DMA_CCMD_DOMAIN_INVL:
1372		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1373		break;
1374	case DMA_CCMD_DEVICE_INVL:
1375		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1376			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1377		break;
1378	default:
1379		BUG();
1380	}
1381	val |= DMA_CCMD_ICC;
1382
1383	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1384	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1385
1386	/* Make sure hardware complete it */
1387	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1388		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1389
1390	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1391}
1392
1393/* return value determines if we need a write buffer flush */
1394static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1395				u64 addr, unsigned int size_order, u64 type)
1396{
1397	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1398	u64 val = 0, val_iva = 0;
1399	unsigned long flag;
1400
1401	switch (type) {
1402	case DMA_TLB_GLOBAL_FLUSH:
1403		/* global flush doesn't need set IVA_REG */
1404		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1405		break;
1406	case DMA_TLB_DSI_FLUSH:
1407		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1408		break;
1409	case DMA_TLB_PSI_FLUSH:
1410		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1411		/* IH bit is passed in as part of address */
1412		val_iva = size_order | addr;
1413		break;
1414	default:
1415		BUG();
1416	}
1417	/* Note: set drain read/write */
1418#if 0
1419	/*
1420	 * This is probably to be super secure.. Looks like we can
1421	 * ignore it without any impact.
1422	 */
1423	if (cap_read_drain(iommu->cap))
1424		val |= DMA_TLB_READ_DRAIN;
1425#endif
1426	if (cap_write_drain(iommu->cap))
1427		val |= DMA_TLB_WRITE_DRAIN;
1428
1429	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1430	/* Note: Only uses first TLB reg currently */
1431	if (val_iva)
1432		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1433	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1434
1435	/* Make sure hardware complete it */
1436	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1437		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1438
1439	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1440
1441	/* check IOTLB invalidation granularity */
1442	if (DMA_TLB_IAIG(val) == 0)
1443		pr_err("Flush IOTLB failed\n");
1444	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1445		pr_debug("TLB flush request %Lx, actual %Lx\n",
1446			(unsigned long long)DMA_TLB_IIRG(type),
1447			(unsigned long long)DMA_TLB_IAIG(val));
1448}
1449
1450static struct device_domain_info *
1451iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1452			 u8 bus, u8 devfn)
1453{
1454	struct device_domain_info *info;
 
1455
1456	assert_spin_locked(&device_domain_lock);
 
1457
1458	if (!iommu->qi)
1459		return NULL;
1460
 
1461	list_for_each_entry(info, &domain->devices, link)
1462		if (info->iommu == iommu && info->bus == bus &&
1463		    info->devfn == devfn) {
1464			if (info->ats_supported && info->dev)
1465				return info;
1466			break;
1467		}
 
1468
1469	return NULL;
1470}
1471
1472static void domain_update_iotlb(struct dmar_domain *domain)
1473{
1474	struct device_domain_info *info;
1475	bool has_iotlb_device = false;
1476
1477	assert_spin_locked(&device_domain_lock);
 
1478
1479	list_for_each_entry(info, &domain->devices, link) {
1480		struct pci_dev *pdev;
1481
1482		if (!info->dev || !dev_is_pci(info->dev))
1483			continue;
1484
1485		pdev = to_pci_dev(info->dev);
1486		if (pdev->ats_enabled) {
1487			has_iotlb_device = true;
1488			break;
1489		}
1490	}
1491
1492	domain->has_iotlb_device = has_iotlb_device;
1493}
1494
1495static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1496{
1497	struct pci_dev *pdev;
1498
1499	assert_spin_locked(&device_domain_lock);
1500
1501	if (!info || !dev_is_pci(info->dev))
1502		return;
1503
1504	pdev = to_pci_dev(info->dev);
1505
1506#ifdef CONFIG_INTEL_IOMMU_SVM
1507	/* The PCIe spec, in its wisdom, declares that the behaviour of
1508	   the device if you enable PASID support after ATS support is
1509	   undefined. So always enable PASID support on devices which
1510	   have it, even if we can't yet know if we're ever going to
1511	   use it. */
1512	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1513		info->pasid_enabled = 1;
1514
1515	if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1516		info->pri_enabled = 1;
1517#endif
1518	if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1519		info->ats_enabled = 1;
1520		domain_update_iotlb(info->domain);
1521		info->ats_qdep = pci_ats_queue_depth(pdev);
1522	}
1523}
1524
1525static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1526{
1527	struct pci_dev *pdev;
1528
1529	assert_spin_locked(&device_domain_lock);
1530
1531	if (!dev_is_pci(info->dev))
1532		return;
1533
1534	pdev = to_pci_dev(info->dev);
1535
1536	if (info->ats_enabled) {
1537		pci_disable_ats(pdev);
1538		info->ats_enabled = 0;
1539		domain_update_iotlb(info->domain);
1540	}
1541#ifdef CONFIG_INTEL_IOMMU_SVM
1542	if (info->pri_enabled) {
1543		pci_disable_pri(pdev);
1544		info->pri_enabled = 0;
1545	}
1546	if (info->pasid_enabled) {
1547		pci_disable_pasid(pdev);
1548		info->pasid_enabled = 0;
1549	}
1550#endif
1551}
1552
1553static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1554				  u64 addr, unsigned mask)
1555{
1556	u16 sid, qdep;
1557	unsigned long flags;
1558	struct device_domain_info *info;
1559
1560	if (!domain->has_iotlb_device)
1561		return;
1562
1563	spin_lock_irqsave(&device_domain_lock, flags);
1564	list_for_each_entry(info, &domain->devices, link) {
1565		if (!info->ats_enabled)
1566			continue;
1567
1568		sid = info->bus << 8 | info->devfn;
1569		qdep = info->ats_qdep;
1570		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1571	}
1572	spin_unlock_irqrestore(&device_domain_lock, flags);
1573}
1574
1575static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1576				  struct dmar_domain *domain,
1577				  unsigned long pfn, unsigned int pages,
1578				  int ih, int map)
1579{
1580	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1581	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1582	u16 did = domain->iommu_did[iommu->seq_id];
1583
1584	BUG_ON(pages == 0);
1585
1586	if (ih)
1587		ih = 1 << 6;
1588	/*
1589	 * Fallback to domain selective flush if no PSI support or the size is
1590	 * too big.
1591	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1592	 * aligned to the size
1593	 */
1594	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1595		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1596						DMA_TLB_DSI_FLUSH);
1597	else
1598		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1599						DMA_TLB_PSI_FLUSH);
1600
1601	/*
1602	 * In caching mode, changes of pages from non-present to present require
1603	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1604	 */
1605	if (!cap_caching_mode(iommu->cap) || !map)
1606		iommu_flush_dev_iotlb(domain, addr, mask);
1607}
1608
1609static void iommu_flush_iova(struct iova_domain *iovad)
1610{
1611	struct dmar_domain *domain;
1612	int idx;
1613
1614	domain = container_of(iovad, struct dmar_domain, iovad);
1615
1616	for_each_domain_iommu(idx, domain) {
1617		struct intel_iommu *iommu = g_iommus[idx];
1618		u16 did = domain->iommu_did[iommu->seq_id];
1619
1620		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1621
1622		if (!cap_caching_mode(iommu->cap))
1623			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1624					      0, MAX_AGAW_PFN_WIDTH);
1625	}
1626}
1627
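/*
 * Disable the protected memory regions by clearing the EPM bit and
 * waiting for the PRS status bit to clear.
 */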
1628static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1629{
1630	u32 pmen;
1631	unsigned long flags;
1632
1633	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1635	pmen &= ~DMA_PMEN_EPM;
1636	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1637
1638	/* wait for the protected region status bit to clear */
1639	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1640		readl, !(pmen & DMA_PMEN_PRS), pmen);
1641
1642	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643}
1644
1645static void iommu_enable_translation(struct intel_iommu *iommu)
1646{
1647	u32 sts;
1648	unsigned long flags;
1649
1650	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1651	iommu->gcmd |= DMA_GCMD_TE;
1652	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654	/* Make sure the hardware completes it */
1655	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656		      readl, (sts & DMA_GSTS_TES), sts);
1657
1658	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
 
1659}
1660
1661static void iommu_disable_translation(struct intel_iommu *iommu)
1662{
1663	u32 sts;
1664	unsigned long flag;
1665
1666	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1667	iommu->gcmd &= ~DMA_GCMD_TE;
1668	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1669
1670	/* Make sure the hardware completes it */
1671	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1672		      readl, (!(sts & DMA_GSTS_TES)), sts);
1673
1674	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
 
1675}
1676
1677
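/*
 * Allocate the domain-id bitmap and the two-level domain pointer array
 * (256 entries per chunk) for @iommu.  Domain-id 0 is reserved.
 */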
1678static int iommu_init_domains(struct intel_iommu *iommu)
1679{
1680	u32 ndomains, nlongs;
1681	size_t size;
1682
1683	ndomains = cap_ndoms(iommu->cap);
1684	pr_debug("%s: Number of Domains supported <%d>\n",
1685		 iommu->name, ndomains);
1686	nlongs = BITS_TO_LONGS(ndomains);
1687
1688	spin_lock_init(&iommu->lock);
1689
1690	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1691	if (!iommu->domain_ids) {
1692		pr_err("%s: Allocating domain id array failed\n",
1693		       iommu->name);
1694		return -ENOMEM;
1695	}
1696
1697	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1698	iommu->domains = kzalloc(size, GFP_KERNEL);
1699
1700	if (iommu->domains) {
1701		size = 256 * sizeof(struct dmar_domain *);
1702		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1703	}
1704
1705	if (!iommu->domains || !iommu->domains[0]) {
1706		pr_err("%s: Allocating domain array failed\n",
1707		       iommu->name);
1708		kfree(iommu->domain_ids);
1709		kfree(iommu->domains);
1710		iommu->domain_ids = NULL;
1711		iommu->domains    = NULL;
1712		return -ENOMEM;
1713	}
1714
1715
1716
1717	/*
1718	 * If Caching mode is set, then invalid translations are tagged
1719	 * with domain-id 0, hence we need to pre-allocate it. We also
1720	 * use domain-id 0 as a marker for non-allocated domain-id, so
1721	 * make sure it is not used for a real domain.
1722	 */
1723	set_bit(0, iommu->domain_ids);
1724
1725	return 0;
1726}
1727
1728static void disable_dmar_iommu(struct intel_iommu *iommu)
1729{
1730	struct device_domain_info *info, *tmp;
 
1731	unsigned long flags;
1732
1733	if (!iommu->domains || !iommu->domain_ids)
1734		return;
1735
1736again:
1737	spin_lock_irqsave(&device_domain_lock, flags);
1738	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1739		struct dmar_domain *domain;
1740
1741		if (info->iommu != iommu)
1742			continue;
1743
1744		if (!info->dev || !info->domain)
1745			continue;
1746
1747		domain = info->domain;
1748
1749		__dmar_remove_one_dev_info(info);
1750
1751		if (!domain_type_is_vm_or_si(domain)) {
1752			/*
1753			 * The domain_exit() function can't be called under
1754			 * device_domain_lock, as it takes this lock itself.
1755			 * So release the lock here and re-run the loop
1756			 * afterwards.
1757			 */
1758			spin_unlock_irqrestore(&device_domain_lock, flags);
1759			domain_exit(domain);
1760			goto again;
1761		}
1762	}
1763	spin_unlock_irqrestore(&device_domain_lock, flags);
1764
1765	if (iommu->gcmd & DMA_GCMD_TE)
1766		iommu_disable_translation(iommu);
1767}
1768
1769static void free_dmar_iommu(struct intel_iommu *iommu)
1770{
1771	if ((iommu->domains) && (iommu->domain_ids)) {
1772		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1773		int i;
1774
1775		for (i = 0; i < elems; i++)
1776			kfree(iommu->domains[i]);
1777		kfree(iommu->domains);
1778		kfree(iommu->domain_ids);
1779		iommu->domains = NULL;
1780		iommu->domain_ids = NULL;
1781	}
1782
1783	g_iommus[iommu->seq_id] = NULL;
 
1784
1785	/* free context mapping */
1786	free_context_table(iommu);
1787
1788#ifdef CONFIG_INTEL_IOMMU_SVM
1789	if (pasid_enabled(iommu)) {
1790		if (ecap_prs(iommu->ecap))
1791			intel_svm_finish_prq(iommu);
1792		intel_svm_free_pasid_tables(iommu);
1793	}
1794#endif
1795}
1796
1797static struct dmar_domain *alloc_domain(int flags)
1798{
1799	struct dmar_domain *domain;
1800
1801	domain = alloc_domain_mem();
1802	if (!domain)
1803		return NULL;
1804
1805	memset(domain, 0, sizeof(*domain));
1806	domain->nid = -1;
1807	domain->flags = flags;
1808	domain->has_iotlb_device = false;
1809	INIT_LIST_HEAD(&domain->devices);
1810
1811	return domain;
1812}
1813
1814/* Must be called with iommu->lock */
1815static int domain_attach_iommu(struct dmar_domain *domain,
1816			       struct intel_iommu *iommu)
1817{
 
1818	unsigned long ndomains;
1819	int num;
1820
1821	assert_spin_locked(&device_domain_lock);
1822	assert_spin_locked(&iommu->lock);
1823
1824	domain->iommu_refcnt[iommu->seq_id] += 1;
1825	domain->iommu_count += 1;
1826	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1827		ndomains = cap_ndoms(iommu->cap);
1828		num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1829
1830		if (num >= ndomains) {
1831			pr_err("%s: No free domain ids\n", iommu->name);
1832			domain->iommu_refcnt[iommu->seq_id] -= 1;
1833			domain->iommu_count -= 1;
1834			return -ENOSPC;
1835		}
1836
1837		set_bit(num, iommu->domain_ids);
1838		set_iommu_domain(iommu, num, domain);
1839
1840		domain->iommu_did[iommu->seq_id] = num;
1841		domain->nid			 = iommu->node;
1842
1843		domain_update_iommu_cap(domain);
1844	}
1845
1846	return 0;
1847}
1848
1849static int domain_detach_iommu(struct dmar_domain *domain,
1850			       struct intel_iommu *iommu)
1851{
1852	int num, count = INT_MAX;
 
 
1853
1854	assert_spin_locked(&device_domain_lock);
1855	assert_spin_locked(&iommu->lock);
1856
1857	domain->iommu_refcnt[iommu->seq_id] -= 1;
1858	count = --domain->iommu_count;
1859	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1860		num = domain->iommu_did[iommu->seq_id];
1861		clear_bit(num, iommu->domain_ids);
1862		set_iommu_domain(iommu, num, NULL);
1863
1864		domain_update_iommu_cap(domain);
1865		domain->iommu_did[iommu->seq_id] = 0;
1866	}
1867
1868	return count;
1869}
1870
1871static struct iova_domain reserved_iova_list;
1872static struct lock_class_key reserved_rbtree_key;
1873
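/*
 * Reserve IOVA ranges that must never be handed out to devices: the
 * IOAPIC MMIO window and all PCI MMIO resources (to prevent
 * peer-to-peer accesses).
 */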
1874static int dmar_init_reserved_ranges(void)
1875{
1876	struct pci_dev *pdev = NULL;
1877	struct iova *iova;
1878	int i;
1879
1880	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1881
1882	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1883		&reserved_rbtree_key);
1884
1885	/* IOAPIC ranges shouldn't be accessed by DMA */
1886	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1887		IOVA_PFN(IOAPIC_RANGE_END));
1888	if (!iova) {
1889		pr_err("Reserve IOAPIC range failed\n");
1890		return -ENODEV;
1891	}
1892
1893	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1894	for_each_pci_dev(pdev) {
1895		struct resource *r;
1896
1897		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1898			r = &pdev->resource[i];
1899			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1900				continue;
1901			iova = reserve_iova(&reserved_iova_list,
1902					    IOVA_PFN(r->start),
1903					    IOVA_PFN(r->end));
1904			if (!iova) {
1905				pr_err("Reserve iova failed\n");
1906				return -ENODEV;
1907			}
1908		}
1909	}
1910	return 0;
1911}
1912
1913static void domain_reserve_special_ranges(struct dmar_domain *domain)
1914{
1915	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1916}
1917
1918static inline int guestwidth_to_adjustwidth(int gaw)
1919{
1920	int agaw;
1921	int r = (gaw - 12) % 9;
1922
1923	if (r == 0)
1924		agaw = gaw;
1925	else
1926		agaw = gaw + 9 - r;
1927	if (agaw > 64)
1928		agaw = 64;
1929	return agaw;
1930}
1931
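/*
 * Initialise a freshly allocated domain: set up its IOVA allocator and
 * flush queue, copy the reserved ranges, choose an AGAW suitable for
 * @guest_width and allocate the top-level page directory.
 */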
1932static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1933		       int guest_width)
1934{
 
1935	int adjust_width, agaw;
1936	unsigned long sagaw;
1937	int err;
1938
1939	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1940
1941	err = init_iova_flush_queue(&domain->iovad,
1942				    iommu_flush_iova, iova_entry_free);
1943	if (err)
1944		return err;
1945
1946	domain_reserve_special_ranges(domain);
1947
1948	/* calculate AGAW */
 
1949	if (guest_width > cap_mgaw(iommu->cap))
1950		guest_width = cap_mgaw(iommu->cap);
1951	domain->gaw = guest_width;
1952	adjust_width = guestwidth_to_adjustwidth(guest_width);
1953	agaw = width_to_agaw(adjust_width);
1954	sagaw = cap_sagaw(iommu->cap);
1955	if (!test_bit(agaw, &sagaw)) {
1956		/* hardware doesn't support it, choose a bigger one */
1957		pr_debug("Hardware doesn't support agaw %d\n", agaw);
1958		agaw = find_next_bit(&sagaw, 5, agaw);
1959		if (agaw >= 5)
1960			return -ENODEV;
1961	}
1962	domain->agaw = agaw;
 
1963
1964	if (ecap_coherent(iommu->ecap))
1965		domain->iommu_coherency = 1;
1966	else
1967		domain->iommu_coherency = 0;
1968
1969	if (ecap_sc_support(iommu->ecap))
1970		domain->iommu_snooping = 1;
1971	else
1972		domain->iommu_snooping = 0;
1973
1974	if (intel_iommu_superpage)
1975		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1976	else
1977		domain->iommu_superpage = 0;
1978
1979	domain->nid = iommu->node;
1980
1981	/* always allocate the top pgd */
1982	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1983	if (!domain->pgd)
1984		return -ENOMEM;
1985	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1986	return 0;
1987}
1988
1989static void domain_exit(struct dmar_domain *domain)
1990{
1991	struct page *freelist = NULL;
 
1992
1993	/* Domain 0 is reserved, so don't process it */
1994	if (!domain)
1995		return;
1996
1997	/* Remove associated devices and clear attached or cached domains */
1998	rcu_read_lock();
 
 
1999	domain_remove_dev_info(domain);
2000	rcu_read_unlock();
2001
2002	/* destroy iovas */
2003	put_iova_domain(&domain->iovad);
2004
2005	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006
2007	dma_free_pagelist(freelist);
 
 
2008
2009	free_domain_mem(domain);
2010}
2011
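/*
 * Program the context entry for @bus/@devfn on @iommu so the device is
 * translated through @domain's page tables (or in pass-through mode for
 * the static identity domain when hardware pass-through is available).
 */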
2012static int domain_context_mapping_one(struct dmar_domain *domain,
2013				      struct intel_iommu *iommu,
2014				      u8 bus, u8 devfn)
2015{
2016	u16 did = domain->iommu_did[iommu->seq_id];
2017	int translation = CONTEXT_TT_MULTI_LEVEL;
2018	struct device_domain_info *info = NULL;
2019	struct context_entry *context;
2020	unsigned long flags;
 
2021	struct dma_pte *pgd;
2022	int ret, agaw;
2023
2024	WARN_ON(did == 0);
2025
2026	if (hw_pass_through && domain_type_is_si(domain))
2027		translation = CONTEXT_TT_PASS_THROUGH;
2028
2029	pr_debug("Set context mapping for %02x:%02x.%d\n",
2030		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2031
2032	BUG_ON(!domain->pgd);
 
 
2033
2034	spin_lock_irqsave(&device_domain_lock, flags);
2035	spin_lock(&iommu->lock);
 
2036
2037	ret = -ENOMEM;
2038	context = iommu_context_addr(iommu, bus, devfn, 1);
2039	if (!context)
2040		goto out_unlock;
2041
2042	ret = 0;
2043	if (context_present(context))
2044		goto out_unlock;
2045
2046	/*
2047	 * For kdump cases, old valid entries may be cached due to the
2048	 * in-flight DMA and copied pgtable, but there is no unmapping
2049	 * behaviour for them, thus we need an explicit cache flush for
2050	 * the newly-mapped device. For kdump, at this point, the device
2051	 * is supposed to finish reset at its driver probe stage, so no
2052	 * in-flight DMA will exist, and we don't need to worry anymore
2053	 * hereafter.
2054	 */
2055	if (context_copied(context)) {
2056		u16 did_old = context_domain_id(context);
2057
2058		if (did_old < cap_ndoms(iommu->cap)) {
2059			iommu->flush.flush_context(iommu, did_old,
2060						   (((u16)bus) << 8) | devfn,
2061						   DMA_CCMD_MASK_NOBIT,
2062						   DMA_CCMD_DEVICE_INVL);
2063			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2064						 DMA_TLB_DSI_FLUSH);
 
2065		}
2066	}
2067
2068	pgd = domain->pgd;
2069
2070	context_clear_entry(context);
2071	context_set_domain_id(context, did);
 
 
2072
2073	/*
2074	 * Skip top levels of page tables for iommu which has less agaw
2075	 * than default.  Unnecessary for PT mode.
2076	 */
2077	if (translation != CONTEXT_TT_PASS_THROUGH) {
2078		for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2079			ret = -ENOMEM;
2080			pgd = phys_to_virt(dma_pte_addr(pgd));
2081			if (!dma_pte_present(pgd))
2082				goto out_unlock;
 
 
2083		}
 
2084
2085		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2086		if (info && info->ats_supported)
2087			translation = CONTEXT_TT_DEV_IOTLB;
2088		else
2089			translation = CONTEXT_TT_MULTI_LEVEL;
2090
2091		context_set_address_root(context, virt_to_phys(pgd));
2092		context_set_address_width(context, iommu->agaw);
2093	} else {
2094		/*
2095		 * In pass through mode, AW must be programmed to
2096		 * indicate the largest AGAW value supported by
2097		 * hardware. And ASR is ignored by hardware.
2098		 */
2099		context_set_address_width(context, iommu->msagaw);
2100	}
2101
2102	context_set_translation_type(context, translation);
2103	context_set_fault_enable(context);
2104	context_set_present(context);
2105	domain_flush_cache(domain, context, sizeof(*context));
2106
2107	/*
2108	 * It's a non-present to present mapping. If the hardware doesn't cache
2109	 * non-present entries, we only need to flush the write-buffer. If it
2110	 * _does_ cache non-present entries, then it does so in the special
2111	 * domain #0, which we have to flush:
2112	 */
2113	if (cap_caching_mode(iommu->cap)) {
2114		iommu->flush.flush_context(iommu, 0,
2115					   (((u16)bus) << 8) | devfn,
2116					   DMA_CCMD_MASK_NOBIT,
2117					   DMA_CCMD_DEVICE_INVL);
2118		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2119	} else {
2120		iommu_flush_write_buffer(iommu);
2121	}
2122	iommu_enable_dev_iotlb(info);
 
2123
2124	ret = 0;
2125
2126out_unlock:
2127	spin_unlock(&iommu->lock);
2128	spin_unlock_irqrestore(&device_domain_lock, flags);
2129
2130	return ret;
2131}
2132
2133struct domain_context_mapping_data {
2134	struct dmar_domain *domain;
2135	struct intel_iommu *iommu;
2136};
2137
2138static int domain_context_mapping_cb(struct pci_dev *pdev,
2139				     u16 alias, void *opaque)
2140{
2141	struct domain_context_mapping_data *data = opaque;
2142
2143	return domain_context_mapping_one(data->domain, data->iommu,
2144					  PCI_BUS_NUM(alias), alias & 0xff);
2145}
2146
2147static int
2148domain_context_mapping(struct dmar_domain *domain, struct device *dev)
 
2149{
2150	struct intel_iommu *iommu;
2151	u8 bus, devfn;
2152	struct domain_context_mapping_data data;
2153
2154	iommu = device_to_iommu(dev, &bus, &devfn);
2155	if (!iommu)
2156		return -ENODEV;
 
 
2157
2158	if (!dev_is_pci(dev))
2159		return domain_context_mapping_one(domain, iommu, bus, devfn);
2160
2161	data.domain = domain;
2162	data.iommu = iommu;
2163
2164	return pci_for_each_dma_alias(to_pci_dev(dev),
2165				      &domain_context_mapping_cb, &data);
2166}
2167
2168static int domain_context_mapped_cb(struct pci_dev *pdev,
2169				    u16 alias, void *opaque)
2170{
2171	struct intel_iommu *iommu = opaque;
2172
2173	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174}
2175
2176static int domain_context_mapped(struct device *dev)
2177{
 
 
2178	struct intel_iommu *iommu;
2179	u8 bus, devfn;
2180
2181	iommu = device_to_iommu(dev, &bus, &devfn);
 
2182	if (!iommu)
2183		return -ENODEV;
2184
2185	if (!dev_is_pci(dev))
2186		return device_context_mapped(iommu, bus, devfn);
2187
2188	return !pci_for_each_dma_alias(to_pci_dev(dev),
2189				       domain_context_mapped_cb, iommu);
2190}
2191
2192/* Returns a number of VTD pages, but aligned to MM page size */
2193static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194					    size_t size)
2195{
2196	host_addr &= ~PAGE_MASK;
2197	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198}
2199
2200/* Return largest possible superpage level for a given mapping */
2201static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202					  unsigned long iov_pfn,
2203					  unsigned long phy_pfn,
2204					  unsigned long pages)
2205{
2206	int support, level = 1;
2207	unsigned long pfnmerge;
2208
2209	support = domain->iommu_superpage;
2210
2211	/* To use a large page, the virtual *and* physical addresses
2212	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213	   of them will mean we have to use smaller pages. So just
2214	   merge them and check both at once. */
2215	pfnmerge = iov_pfn | phy_pfn;
2216
2217	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218		pages >>= VTD_STRIDE_SHIFT;
2219		if (!pages)
2220			break;
2221		pfnmerge >>= VTD_STRIDE_SHIFT;
2222		level++;
2223		support--;
2224	}
2225	return level;
2226}
2227
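/*
 * Core mapping routine: fill the domain's page tables for @nr_pages
 * pages starting at @iov_pfn, taking physical addresses either from
 * @sg or from @phys_pfn, and using superpages where hardware support
 * and alignment allow it.
 */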
2228static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229			    struct scatterlist *sg, unsigned long phys_pfn,
2230			    unsigned long nr_pages, int prot)
2231{
2232	struct dma_pte *first_pte = NULL, *pte = NULL;
2233	phys_addr_t uninitialized_var(pteval);
2234	unsigned long sg_res = 0;
 
2235	unsigned int largepage_lvl = 0;
2236	unsigned long lvl_pages = 0;
2237
2238	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2239
2240	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241		return -EINVAL;
2242
2243	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244
2245	if (!sg) {
2246		sg_res = nr_pages;
 
 
2247		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248	}
2249
2250	while (nr_pages > 0) {
2251		uint64_t tmp;
2252
2253		if (!sg_res) {
2254			unsigned int pgoff = sg->offset & ~PAGE_MASK;
2255
2256			sg_res = aligned_nrpages(sg->offset, sg->length);
2257			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258			sg->dma_length = sg->length;
2259			pteval = (sg_phys(sg) - pgoff) | prot;
2260			phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261		}
2262
2263		if (!pte) {
2264			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2265
2266			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267			if (!pte)
2268				return -ENOMEM;
2269			/* It is a large page */
2270			if (largepage_lvl > 1) {
2271				unsigned long nr_superpages, end_pfn;
2272
2273				pteval |= DMA_PTE_LARGE_PAGE;
2274				lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276				nr_superpages = sg_res / lvl_pages;
2277				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278
2279				/*
2280				 * Ensure that old small page tables are
2281				 * removed to make room for superpage(s).
2282				 * We're adding new large pages, so make sure
2283				 * we don't remove their parent tables.
2284				 */
2285				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286						       largepage_lvl + 1);
2287			} else {
2288				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289			}
2290
2291		}
2292		/* We don't need a lock here; nobody else
2293		 * touches the iova range
2294		 */
2295		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296		if (tmp) {
2297			static int dumps = 5;
2298			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299				iov_pfn, tmp, (unsigned long long)pteval);
2300			if (dumps) {
2301				dumps--;
2302				debug_dma_dump_mappings(NULL);
2303			}
2304			WARN_ON(1);
2305		}
2306
2307		lvl_pages = lvl_to_nr_pages(largepage_lvl);
2308
2309		BUG_ON(nr_pages < lvl_pages);
2310		BUG_ON(sg_res < lvl_pages);
2311
2312		nr_pages -= lvl_pages;
2313		iov_pfn += lvl_pages;
2314		phys_pfn += lvl_pages;
2315		pteval += lvl_pages * VTD_PAGE_SIZE;
2316		sg_res -= lvl_pages;
2317
2318		/* If the next PTE would be the first in a new page, then we
2319		   need to flush the cache on the entries we've just written.
2320		   And then we'll need to recalculate 'pte', so clear it and
2321		   let it get set again in the if (!pte) block above.
2322
2323		   If we're done (!nr_pages) we need to flush the cache too.
2324
2325		   Also if we've been setting superpages, we may need to
2326		   recalculate 'pte' and switch back to smaller pages for the
2327		   end of the mapping, if the trailing size is not enough to
2328		   use another superpage (i.e. sg_res < lvl_pages). */
2329		pte++;
2330		if (!nr_pages || first_pte_in_page(pte) ||
2331		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332			domain_flush_cache(domain, first_pte,
2333					   (void *)pte - (void *)first_pte);
2334			pte = NULL;
2335		}
2336
2337		if (!sg_res && nr_pages)
2338			sg = sg_next(sg);
2339	}
2340	return 0;
2341}
2342
2343static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344				    struct scatterlist *sg, unsigned long nr_pages,
2345				    int prot)
2346{
2347	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2348}
2349
2350static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2351				     unsigned long phys_pfn, unsigned long nr_pages,
2352				     int prot)
2353{
2354	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2355}
2356
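/*
 * Clear the context entry for @bus/@devfn and invalidate the
 * context-cache and IOTLB entries tagged with its old domain-id.
 */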
2357static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2358{
2359	unsigned long flags;
2360	struct context_entry *context;
2361	u16 did_old;
2362
2363	if (!iommu)
2364		return;
2365
2366	spin_lock_irqsave(&iommu->lock, flags);
2367	context = iommu_context_addr(iommu, bus, devfn, 0);
2368	if (!context) {
2369		spin_unlock_irqrestore(&iommu->lock, flags);
2370		return;
2371	}
2372	did_old = context_domain_id(context);
2373	context_clear_entry(context);
2374	__iommu_flush_cache(iommu, context, sizeof(*context));
2375	spin_unlock_irqrestore(&iommu->lock, flags);
2376	iommu->flush.flush_context(iommu,
2377				   did_old,
2378				   (((u16)bus) << 8) | devfn,
2379				   DMA_CCMD_MASK_NOBIT,
2380				   DMA_CCMD_DEVICE_INVL);
2381	iommu->flush.flush_iotlb(iommu,
2382				 did_old,
2383				 0,
2384				 0,
2385				 DMA_TLB_DSI_FLUSH);
2386}
2387
2388static inline void unlink_domain_info(struct device_domain_info *info)
2389{
2390	assert_spin_locked(&device_domain_lock);
2391	list_del(&info->link);
2392	list_del(&info->global);
2393	if (info->dev)
2394		info->dev->archdata.iommu = NULL;
2395}
2396
2397static void domain_remove_dev_info(struct dmar_domain *domain)
2398{
2399	struct device_domain_info *info, *tmp;
2400	unsigned long flags;
 
2401
2402	spin_lock_irqsave(&device_domain_lock, flags);
2403	list_for_each_entry_safe(info, tmp, &domain->devices, link)
2404		__dmar_remove_one_dev_info(info);
2405	spin_unlock_irqrestore(&device_domain_lock, flags);
2406}
2407
2408/*
2409 * find_domain
2410 * Note: we use struct device->archdata.iommu to store the info
2411 */
2412static struct dmar_domain *find_domain(struct device *dev)
 
2413{
2414	struct device_domain_info *info;
2415
2416	/* No lock here, assumes no domain exit in normal case */
2417	info = dev->archdata.iommu;
2418	if (likely(info))
2419		return info->domain;
2420	return NULL;
2421}
2422
2423static inline struct device_domain_info *
2424dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2425{
2426	struct device_domain_info *info;
2427
2428	list_for_each_entry(info, &device_domain_list, global)
2429		if (info->iommu->segment == segment && info->bus == bus &&
2430		    info->devfn == devfn)
2431			return info;
2432
2433	return NULL;
2434}
2435
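/*
 * Allocate and link a device_domain_info for @bus/@devfn (and @dev, if
 * any), attach @domain to @iommu and set up the context mapping.  If
 * the device or one of its DMA aliases already has a domain, that
 * existing domain is returned and the caller must free @domain.
 */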
2436static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2437						    int bus, int devfn,
2438						    struct device *dev,
2439						    struct dmar_domain *domain)
2440{
2441	struct dmar_domain *found = NULL;
2442	struct device_domain_info *info;
2443	unsigned long flags;
 
 
2444	int ret;
2445
2446	info = alloc_devinfo_mem();
2447	if (!info)
2448		return NULL;
2449
2450	info->bus = bus;
2451	info->devfn = devfn;
2452	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2453	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2454	info->ats_qdep = 0;
2455	info->dev = dev;
2456	info->domain = domain;
2457	info->iommu = iommu;
2458
2459	if (dev && dev_is_pci(dev)) {
2460		struct pci_dev *pdev = to_pci_dev(info->dev);
2461
2462		if (ecap_dev_iotlb_support(iommu->ecap) &&
2463		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2464		    dmar_find_matched_atsr_unit(pdev))
2465			info->ats_supported = 1;
2466
2467		if (ecs_enabled(iommu)) {
2468			if (pasid_enabled(iommu)) {
2469				int features = pci_pasid_features(pdev);
2470				if (features >= 0)
2471					info->pasid_supported = features | 1;
2472			}
2473
2474			if (info->ats_supported && ecap_prs(iommu->ecap) &&
2475			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2476				info->pri_supported = 1;
 
 
2477		}
2478	}
2479
2480	spin_lock_irqsave(&device_domain_lock, flags);
2481	if (dev)
2482		found = find_domain(dev);
2483
2484	if (!found) {
2485		struct device_domain_info *info2;
2486		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2487		if (info2) {
2488			found      = info2->domain;
2489			info2->dev = dev;
2490		}
2491	}
2492
2493	if (found) {
2494		spin_unlock_irqrestore(&device_domain_lock, flags);
2495		free_devinfo_mem(info);
2496		/* Caller must free the original domain */
2497		return found;
2498	}
 
2499
2500	spin_lock(&iommu->lock);
2501	ret = domain_attach_iommu(domain, iommu);
2502	spin_unlock(&iommu->lock);
2503
2504	if (ret) {
2505		spin_unlock_irqrestore(&device_domain_lock, flags);
2506		free_devinfo_mem(info);
2507		return NULL;
2508	}
2509
2510	list_add(&info->link, &domain->devices);
2511	list_add(&info->global, &device_domain_list);
2512	if (dev)
2513		dev->archdata.iommu = info;
2514	spin_unlock_irqrestore(&device_domain_lock, flags);
2515
2516	if (dev && domain_context_mapping(domain, dev)) {
2517		pr_err("Domain context map for %s failed\n", dev_name(dev));
2518		dmar_remove_one_dev_info(domain, dev);
2519		return NULL;
2520	}
2521
2522	return domain;
2523}
2524
2525static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2526{
2527	*(u16 *)opaque = alias;
2528	return 0;
2529}
2530
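/*
 * Return the domain already used by one of @dev's DMA aliases, or
 * allocate and initialise a new one with the requested address width.
 */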
2531static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2532{
2533	struct device_domain_info *info = NULL;
2534	struct dmar_domain *domain = NULL;
2535	struct intel_iommu *iommu;
2536	u16 req_id, dma_alias;
2537	unsigned long flags;
2538	u8 bus, devfn;
2539
2540	iommu = device_to_iommu(dev, &bus, &devfn);
2541	if (!iommu)
2542		return NULL;
2543
2544	req_id = ((u16)bus << 8) | devfn;
2545
2546	if (dev_is_pci(dev)) {
2547		struct pci_dev *pdev = to_pci_dev(dev);
2548
2549		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2550
 
 
2551		spin_lock_irqsave(&device_domain_lock, flags);
2552		info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2553						      PCI_BUS_NUM(dma_alias),
2554						      dma_alias & 0xff);
2555		if (info) {
2556			iommu = info->iommu;
2557			domain = info->domain;
2558		}
2559		spin_unlock_irqrestore(&device_domain_lock, flags);
2560
2561		/* DMA alias already has a domain, use it */
2562		if (info)
2563			goto out;
2564	}
2565
2566	/* Allocate and initialize new domain for the device */
2567	domain = alloc_domain(0);
2568	if (!domain)
2569		return NULL;
2570	if (domain_init(domain, iommu, gaw)) {
2571		domain_exit(domain);
2572		return NULL;
2573	}
2574
2575out:
2576
2577	return domain;
2578}
2579
2580static struct dmar_domain *set_domain_for_dev(struct device *dev,
2581					      struct dmar_domain *domain)
2582{
2583	struct intel_iommu *iommu;
2584	struct dmar_domain *tmp;
2585	u16 req_id, dma_alias;
2586	u8 bus, devfn;
2587
2588	iommu = device_to_iommu(dev, &bus, &devfn);
2589	if (!iommu)
2590		return NULL;
2591
2592	req_id = ((u16)bus << 8) | devfn;
2593
2594	if (dev_is_pci(dev)) {
2595		struct pci_dev *pdev = to_pci_dev(dev);
2596
2597		pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2598
2599		/* register PCI DMA alias device */
2600		if (req_id != dma_alias) {
2601			tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2602					dma_alias & 0xff, NULL, domain);
2603
2604			if (!tmp || tmp != domain)
2605				return tmp;
2606		}
 
 
2607	}
2608
2609	tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2610	if (!tmp || tmp != domain)
2611		return tmp;
2612
2613	return domain;
2614}
2615
2616static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2617{
2618	struct dmar_domain *domain, *tmp;
2619
2620	domain = find_domain(dev);
2621	if (domain)
2622		goto out;
2623
2624	domain = find_or_alloc_domain(dev, gaw);
2625	if (!domain)
2626		goto out;
2627
2628	tmp = set_domain_for_dev(dev, domain);
2629	if (!tmp || domain != tmp) {
2630		domain_exit(domain);
2631		domain = tmp;
2632	}
2633
2634out:
2635
2636	return domain;
2637}
2638
2639static int iommu_domain_identity_map(struct dmar_domain *domain,
2640				     unsigned long long start,
2641				     unsigned long long end)
2642{
2643	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2644	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2645
2646	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2647			  dma_to_mm_pfn(last_vpfn))) {
2648		pr_err("Reserving iova failed\n");
2649		return -ENOMEM;
2650	}
2651
2652	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
 
2653	/*
2654	 * RMRR range might have overlap with physical memory range,
2655	 * clear it first
2656	 */
2657	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2658
2659	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2660				  last_vpfn - first_vpfn + 1,
2661				  DMA_PTE_READ|DMA_PTE_WRITE);
2662}
2663
2664static int domain_prepare_identity_map(struct device *dev,
2665				       struct dmar_domain *domain,
2666				       unsigned long long start,
2667				       unsigned long long end)
2668{
2669	/* For _hardware_ passthrough, don't bother. But for software
2670	   passthrough, we do it anyway -- it may indicate a memory
2671	   range which is reserved in E820, and thus didn't get set
2672	   up to start with in si_domain */
2673	if (domain == si_domain && hw_pass_through) {
2674		pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2675			dev_name(dev), start, end);
2676		return 0;
2677	}
2678
2679	pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2680		dev_name(dev), start, end);
2681
 
2682	if (end < start) {
2683		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2684			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2685			dmi_get_system_info(DMI_BIOS_VENDOR),
2686			dmi_get_system_info(DMI_BIOS_VERSION),
2687		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2688		return -EIO;
 
2689	}
2690
2691	if (end >> agaw_to_width(domain->agaw)) {
2692		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2693		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2694		     agaw_to_width(domain->agaw),
2695		     dmi_get_system_info(DMI_BIOS_VENDOR),
2696		     dmi_get_system_info(DMI_BIOS_VERSION),
2697		     dmi_get_system_info(DMI_PRODUCT_VERSION));
2698		return -EIO;
 
2699	}
2700
2701	return iommu_domain_identity_map(domain, start, end);
2702}
 
2703
2704static int iommu_prepare_identity_map(struct device *dev,
2705				      unsigned long long start,
2706				      unsigned long long end)
2707{
2708	struct dmar_domain *domain;
2709	int ret;
2710
2711	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2712	if (!domain)
2713		return -ENOMEM;
2714
2715	ret = domain_prepare_identity_map(dev, domain, start, end);
2716	if (ret)
2717		domain_exit(domain);
2718
 
 
2719	return ret;
2720}
2721
2722static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2723					 struct device *dev)
2724{
2725	if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2726		return 0;
2727	return iommu_prepare_identity_map(dev, rmrr->base_address,
2728					  rmrr->end_address);
2729}
2730
2731#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2732static inline void iommu_prepare_isa(void)
2733{
2734	struct pci_dev *pdev;
2735	int ret;
2736
2737	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2738	if (!pdev)
2739		return;
2740
2741	pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2742	ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2743
2744	if (ret)
2745		pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
 
2746
2747	pci_dev_put(pdev);
2748}
2749#else
2750static inline void iommu_prepare_isa(void)
2751{
2752	return;
2753}
2754#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2755
2756static int md_domain_init(struct dmar_domain *domain, int guest_width);
2757
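/*
 * Set up the static identity (si) domain and, unless hardware
 * pass-through is in use, identity-map all usable physical memory
 * into it.
 */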
2758static int __init si_domain_init(int hw)
2759{
 
 
2760	int nid, ret = 0;
2761
2762	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2763	if (!si_domain)
2764		return -EFAULT;
2765
2766	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2767		domain_exit(si_domain);
2768		return -EFAULT;
2769	}
2770
2771	pr_debug("Identity mapping domain allocated\n");
2772
2773	if (hw)
2774		return 0;
2775
2776	for_each_online_node(nid) {
2777		unsigned long start_pfn, end_pfn;
2778		int i;
2779
2780		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2781			ret = iommu_domain_identity_map(si_domain,
2782					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2783			if (ret)
2784				return ret;
2785		}
2786	}
2787
2788	return 0;
2789}
2790
2791static int identity_mapping(struct device *dev)
 
 
2792{
2793	struct device_domain_info *info;
2794
2795	if (likely(!iommu_identity_mapping))
2796		return 0;
2797
2798	info = dev->archdata.iommu;
2799	if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2800		return (info->domain == si_domain);
2801
2802	return 0;
2803}
2804
2805static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
 
 
2806{
2807	struct dmar_domain *ndomain;
2808	struct intel_iommu *iommu;
2809	u8 bus, devfn;
2810
2811	iommu = device_to_iommu(dev, &bus, &devfn);
2812	if (!iommu)
2813		return -ENODEV;
2814
2815	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2816	if (ndomain != domain)
2817		return -EBUSY;
2818
2819	return 0;
2820}
2821
2822static bool device_has_rmrr(struct device *dev)
2823{
2824	struct dmar_rmrr_unit *rmrr;
2825	struct device *tmp;
2826	int i;
2827
2828	rcu_read_lock();
2829	for_each_rmrr_units(rmrr) {
2830		/*
2831		 * Return TRUE if this RMRR contains the device that
2832		 * is passed in.
2833		 */
2834		for_each_active_dev_scope(rmrr->devices,
2835					  rmrr->devices_cnt, i, tmp)
2836			if (tmp == dev) {
2837				rcu_read_unlock();
2838				return true;
2839			}
2840	}
2841	rcu_read_unlock();
2842	return false;
2843}
2844
2845/*
2846 * There are a couple cases where we need to restrict the functionality of
2847 * devices associated with RMRRs.  The first is when evaluating a device for
2848 * identity mapping because problems exist when devices are moved in and out
2849 * of domains and their respective RMRR information is lost.  This means that
2850 * a device with associated RMRRs will never be in a "passthrough" domain.
2851 * The second is use of the device through the IOMMU API.  This interface
2852 * expects to have full control of the IOVA space for the device.  We cannot
2853 * satisfy both the requirement that RMRR access is maintained and have an
2854 * unencumbered IOVA space.  We also have no ability to quiesce the device's
2855 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2856 * We therefore prevent devices associated with an RMRR from participating in
2857 * the IOMMU API, which eliminates them from device assignment.
2858 *
2859 * In both cases we assume that PCI USB devices with RMRRs have them largely
2860 * for historical reasons and that the RMRR space is not actively used post
2861 * boot.  This exclusion may change if vendors begin to abuse it.
2862 *
2863 * The same exception is made for graphics devices, with the requirement that
2864 * any use of the RMRR regions will be torn down before assigning the device
2865 * to a guest.
2866 */
2867static bool device_is_rmrr_locked(struct device *dev)
2868{
2869	if (!device_has_rmrr(dev))
2870		return false;
2871
2872	if (dev_is_pci(dev)) {
2873		struct pci_dev *pdev = to_pci_dev(dev);
2874
2875		if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2876			return false;
2877	}
2878
2879	return true;
2880}
2881
2882static int iommu_should_identity_map(struct device *dev, int startup)
2883{
 
 
2884
2885	if (dev_is_pci(dev)) {
2886		struct pci_dev *pdev = to_pci_dev(dev);
2887
2888		if (device_is_rmrr_locked(dev))
2889			return 0;
2890
2891		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2892			return 1;
2893
2894		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2895			return 1;
2896
2897		if (!(iommu_identity_mapping & IDENTMAP_ALL))
2898			return 0;
2899
2900		/*
2901		 * We want to start off with all devices in the 1:1 domain, and
2902		 * take them out later if we find they can't access all of memory.
2903		 *
2904		 * However, we can't do this for PCI devices behind bridges,
2905		 * because all PCI devices behind the same bridge will end up
2906		 * with the same source-id on their transactions.
2907		 *
2908		 * Practically speaking, we can't change things around for these
2909		 * devices at run-time, because we can't be sure there'll be no
2910		 * DMA transactions in flight for any of their siblings.
2911		 *
2912		 * So PCI devices (unless they're on the root bus) as well as
2913		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2914		 * the 1:1 domain, just in _case_ one of their siblings turns out
2915		 * not to be able to map all of memory.
2916		 */
2917		if (!pci_is_pcie(pdev)) {
2918			if (!pci_is_root_bus(pdev->bus))
2919				return 0;
2920			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2921				return 0;
2922		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2923			return 0;
2924	} else {
2925		if (device_has_rmrr(dev))
2926			return 0;
2927	}
2928
2929	/*
2930	 * At boot time, we don't yet know if devices will be 64-bit capable.
2931	 * Assume that they will — if they turn out not to be, then we can
2932	 * take them out of the 1:1 domain later.
2933	 */
2934	if (!startup) {
2935		/*
2936		 * If the device's dma_mask is less than the system's memory
2937		 * size then this is not a candidate for identity mapping.
2938		 */
2939		u64 dma_mask = *dev->dma_mask;
2940
2941		if (dev->coherent_dma_mask &&
2942		    dev->coherent_dma_mask < dma_mask)
2943			dma_mask = dev->coherent_dma_mask;
2944
2945		return dma_mask >= dma_get_required_mask(dev);
2946	}
2947
2948	return 1;
2949}
2950
2951static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2952{
 
2953	int ret;
2954
2955	if (!iommu_should_identity_map(dev, 1))
2956		return 0;
2957
2958	ret = domain_add_dev_info(si_domain, dev);
2959	if (!ret)
2960		pr_info("%s identity mapping for device %s\n",
2961			hw ? "Hardware" : "Software", dev_name(dev));
2962	else if (ret == -ENODEV)
2963		/* device not associated with an iommu */
2964		ret = 0;
2965
2966	return ret;
2967}
2968
2969
2970static int __init iommu_prepare_static_identity_mapping(int hw)
2971{
2972	struct pci_dev *pdev = NULL;
2973	struct dmar_drhd_unit *drhd;
2974	struct intel_iommu *iommu;
2975	struct device *dev;
2976	int i;
2977	int ret = 0;
2978
2979	for_each_pci_dev(pdev) {
2980		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2981		if (ret)
2982			return ret;
2983	}
2984
2985	for_each_active_iommu(iommu, drhd)
2986		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2987			struct acpi_device_physical_node *pn;
2988			struct acpi_device *adev;
2989
2990			if (dev->bus != &acpi_bus_type)
2991				continue;
2992
2993			adev = to_acpi_device(dev);
2994			mutex_lock(&adev->physical_node_lock);
2995			list_for_each_entry(pn, &adev->physical_node_list, node) {
2996				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2997				if (ret)
2998					break;
2999			}
3000			mutex_unlock(&adev->physical_node_lock);
3001			if (ret)
3002				return ret;
3003		}
 
3004
3005	return 0;
3006}
3007
3008static void intel_iommu_init_qi(struct intel_iommu *iommu)
3009{
3010	/*
3011	 * Start from the sane iommu hardware state.
3012	 * If the queued invalidation is already initialized by us
3013	 * (for example, while enabling interrupt-remapping) then
3014	 * we got the things already rolling from a sane state.
3015	 */
3016	if (!iommu->qi) {
3017		/*
3018		 * Clear any previous faults.
3019		 */
3020		dmar_fault(-1, iommu);
3021		/*
3022		 * Disable queued invalidation if supported and already enabled
3023		 * before OS handover.
3024		 */
3025		dmar_disable_qi(iommu);
3026	}
3027
3028	if (dmar_enable_qi(iommu)) {
3029		/*
3030		 * Queued Invalidate not enabled, use Register Based Invalidate
3031		 */
3032		iommu->flush.flush_context = __iommu_flush_context;
3033		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3034		pr_info("%s: Using Register based invalidation\n",
3035			iommu->name);
3036	} else {
3037		iommu->flush.flush_context = qi_flush_context;
3038		iommu->flush.flush_iotlb = qi_flush_iotlb;
3039		pr_info("%s: Using Queued invalidation\n", iommu->name);
3040	}
3041}
3042
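/*
 * Copy one bus's context table from the previous (kdump) kernel into
 * freshly allocated pages, marking each entry as copied and reserving
 * the domain-ids it references.
 */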
3043static int copy_context_table(struct intel_iommu *iommu,
3044			      struct root_entry *old_re,
3045			      struct context_entry **tbl,
3046			      int bus, bool ext)
3047{
3048	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3049	struct context_entry *new_ce = NULL, ce;
3050	struct context_entry *old_ce = NULL;
3051	struct root_entry re;
3052	phys_addr_t old_ce_phys;
3053
3054	tbl_idx = ext ? bus * 2 : bus;
3055	memcpy(&re, old_re, sizeof(re));
3056
3057	for (devfn = 0; devfn < 256; devfn++) {
3058		/* First calculate the correct index */
3059		idx = (ext ? devfn * 2 : devfn) % 256;
3060
3061		if (idx == 0) {
3062			/* First save what we may have and clean up */
3063			if (new_ce) {
3064				tbl[tbl_idx] = new_ce;
3065				__iommu_flush_cache(iommu, new_ce,
3066						    VTD_PAGE_SIZE);
3067				pos = 1;
3068			}
3069
3070			if (old_ce)
3071				iounmap(old_ce);
3072
3073			ret = 0;
3074			if (devfn < 0x80)
3075				old_ce_phys = root_entry_lctp(&re);
3076			else
3077				old_ce_phys = root_entry_uctp(&re);
3078
3079			if (!old_ce_phys) {
3080				if (ext && devfn == 0) {
3081					/* No LCTP, try UCTP */
3082					devfn = 0x7f;
3083					continue;
3084				} else {
3085					goto out;
3086				}
3087			}
3088
3089			ret = -ENOMEM;
3090			old_ce = memremap(old_ce_phys, PAGE_SIZE,
3091					MEMREMAP_WB);
3092			if (!old_ce)
3093				goto out;
3094
3095			new_ce = alloc_pgtable_page(iommu->node);
3096			if (!new_ce)
3097				goto out_unmap;
3098
3099			ret = 0;
3100		}
3101
3102		/* Now copy the context entry */
3103		memcpy(&ce, old_ce + idx, sizeof(ce));
3104
3105		if (!__context_present(&ce))
3106			continue;
3107
3108		did = context_domain_id(&ce);
3109		if (did >= 0 && did < cap_ndoms(iommu->cap))
3110			set_bit(did, iommu->domain_ids);
3111
3112		/*
3113		 * We need a marker for copied context entries. This
3114		 * marker needs to work for the old format as well as
3115		 * for extended context entries.
3116		 *
3117		 * Bit 67 of the context entry is used. In the old
3118		 * format this bit is available to software, in the
3119		 * extended format it is the PGE bit, but PGE is ignored
3120		 * by HW if PASIDs are disabled (and thus still
3121		 * available).
3122		 *
3123		 * So disable PASIDs first and then mark the entry
3124		 * copied. This means that we don't copy PASID
3125		 * translations from the old kernel, but this is fine as
3126		 * faults there are not fatal.
3127		 */
3128		context_clear_pasid_enable(&ce);
3129		context_set_copied(&ce);
3130
3131		new_ce[idx] = ce;
3132	}
3133
3134	tbl[tbl_idx + pos] = new_ce;
3135
3136	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3137
3138out_unmap:
3139	memunmap(old_ce);
3140
3141out:
3142	return ret;
3143}
3144
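/*
 * In the kdump case, take over the translation structures left by the
 * previous kernel: map its root table, copy every context table and
 * hook the copies into our own root entries.
 */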
3145static int copy_translation_tables(struct intel_iommu *iommu)
3146{
3147	struct context_entry **ctxt_tbls;
3148	struct root_entry *old_rt;
3149	phys_addr_t old_rt_phys;
3150	int ctxt_table_entries;
3151	unsigned long flags;
3152	u64 rtaddr_reg;
3153	int bus, ret;
3154	bool new_ext, ext;
3155
3156	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3157	ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3158	new_ext    = !!ecap_ecs(iommu->ecap);
3159
3160	/*
3161	 * The RTT bit can only be changed when translation is disabled,
3162	 * but disabling translation means to open a window for data
3163	 * corruption. So bail out and don't copy anything if we would
3164	 * have to change the bit.
3165	 */
3166	if (new_ext != ext)
3167		return -EINVAL;
3168
3169	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3170	if (!old_rt_phys)
3171		return -EINVAL;
3172
3173	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3174	if (!old_rt)
3175		return -ENOMEM;
3176
3177	/* This is too big for the stack - allocate it from slab */
3178	ctxt_table_entries = ext ? 512 : 256;
3179	ret = -ENOMEM;
3180	ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3181	if (!ctxt_tbls)
3182		goto out_unmap;
3183
3184	for (bus = 0; bus < 256; bus++) {
3185		ret = copy_context_table(iommu, &old_rt[bus],
3186					 ctxt_tbls, bus, ext);
3187		if (ret) {
3188			pr_err("%s: Failed to copy context table for bus %d\n",
3189				iommu->name, bus);
3190			continue;
3191		}
3192	}
3193
3194	spin_lock_irqsave(&iommu->lock, flags);
3195
3196	/* Context tables are copied, now write them to the root_entry table */
3197	for (bus = 0; bus < 256; bus++) {
3198		int idx = ext ? bus * 2 : bus;
3199		u64 val;
3200
3201		if (ctxt_tbls[idx]) {
3202			val = virt_to_phys(ctxt_tbls[idx]) | 1;
3203			iommu->root_entry[bus].lo = val;
3204		}
3205
3206		if (!ext || !ctxt_tbls[idx + 1])
3207			continue;
3208
3209		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3210		iommu->root_entry[bus].hi = val;
3211	}
3212
3213	spin_unlock_irqrestore(&iommu->lock, flags);
3214
3215	kfree(ctxt_tbls);
3216
3217	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3218
3219	ret = 0;
3220
3221out_unmap:
3222	memunmap(old_rt);
3223
3224	return ret;
3225}
3226
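/*
 * Boot-time initialisation: allocate per-IOMMU state, set up root and
 * context tables (copying them from the previous kernel in the kdump
 * case), create identity mappings for RMRRs, graphics and ISA where
 * requested, and finally enable translation.
 */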
3227static int __init init_dmars(void)
3228{
3229	struct dmar_drhd_unit *drhd;
3230	struct dmar_rmrr_unit *rmrr;
3231	bool copied_tables = false;
3232	struct device *dev;
3233	struct intel_iommu *iommu;
3234	int i, ret;
3235
3236	/*
3237	 * for each drhd
3238	 *    allocate root
3239	 *    initialize and program root entry to not present
3240	 * endfor
3241	 */
3242	for_each_drhd_unit(drhd) {
 
3243		/*
3244		 * lock not needed as this is only incremented in the single-
3245		 * threaded kernel __init code path; all other accesses are
3246		 * read only
3247		 */
3248		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3249			g_num_of_iommus++;
3250			continue;
3251		}
3252		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3253	}
3254
3255	/* Preallocate enough resources for IOMMU hot-addition */
3256	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3257		g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3258
3259	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3260			GFP_KERNEL);
3261	if (!g_iommus) {
3262		pr_err("Allocating global iommu array failed\n");
3263		ret = -ENOMEM;
3264		goto error;
3265	}
3266
3267	for_each_active_iommu(iommu, drhd) {
3268		g_iommus[iommu->seq_id] = iommu;
3269
3270		intel_iommu_init_qi(iommu);
3271
3272		ret = iommu_init_domains(iommu);
3273		if (ret)
3274			goto free_iommu;
3275
3276		init_translation_status(iommu);
3277
3278		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3279			iommu_disable_translation(iommu);
3280			clear_translation_pre_enabled(iommu);
3281			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3282				iommu->name);
3283		}
3284
3285		/*
3286		 * TBD:
3287		 * we could share the same root & context tables
3288		 * among all IOMMUs. Need to split it later.
3289		 */
3290		ret = iommu_alloc_root_entry(iommu);
3291		if (ret)
3292			goto free_iommu;
3293
3294		if (translation_pre_enabled(iommu)) {
3295			pr_info("Translation already enabled - trying to copy translation structures\n");
3296
3297			ret = copy_translation_tables(iommu);
3298			if (ret) {
3299				/*
3300				 * We found the IOMMU with translation
3301				 * enabled - but failed to copy over the
3302				 * old root-entry table. Try to proceed
3303				 * by disabling translation now and
3304				 * allocating a clean root-entry table.
3305				 * This might cause DMAR faults, but
3306				 * probably the dump will still succeed.
3307				 */
3308				pr_err("Failed to copy translation tables from previous kernel for %s\n",
3309				       iommu->name);
3310				iommu_disable_translation(iommu);
3311				clear_translation_pre_enabled(iommu);
3312			} else {
3313				pr_info("Copied translation tables from previous kernel for %s\n",
3314					iommu->name);
3315				copied_tables = true;
3316			}
3317		}
3318
3319		if (!ecap_pass_through(iommu->ecap))
3320			hw_pass_through = 0;
3321#ifdef CONFIG_INTEL_IOMMU_SVM
3322		if (pasid_enabled(iommu))
3323			intel_svm_alloc_pasid_tables(iommu);
3324#endif
3325	}
3326
3327	/*
3328	 * Now that qi is enabled on all iommus, set the root entry and flush
3329	 * caches. This is required on some Intel X58 chipsets, otherwise the
3330	 * flush_context function will loop forever and the boot hangs.
3331	 */
3332	for_each_active_iommu(iommu, drhd) {
3333		iommu_flush_write_buffer(iommu);
3334		iommu_set_root_entry(iommu);
3335		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3336		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3337	}
3338
3339	if (iommu_pass_through)
3340		iommu_identity_mapping |= IDENTMAP_ALL;
3341
3342#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3343	iommu_identity_mapping |= IDENTMAP_GFX;
3344#endif
3345
3346	check_tylersburg_isoch();
3347
3348	if (iommu_identity_mapping) {
3349		ret = si_domain_init(hw_pass_through);
3350		if (ret)
3351			goto free_iommu;
3352	}
3353
3354
3355	/*
3356	 * If we copied translations from a previous kernel in the kdump
3357	 * case, we can not assign the devices to domains now, as that
3358	 * would eliminate the old mappings. So skip this part and defer
3359	 * the assignment to device driver initialization time.
3360	 */
3361	if (copied_tables)
3362		goto domains_done;
3363
3364	/*
3365	 * If pass through is not set or not enabled, set up context entries for
3366	 * identity mappings for rmrr, gfx, and isa and may fall back to static
3367	 * identity mapping if iommu_identity_mapping is set.
3368	 */
3369	if (iommu_identity_mapping) {
3370		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3371		if (ret) {
3372			pr_crit("Failed to setup IOMMU pass-through\n");
3373			goto free_iommu;
3374		}
3375	}
3376	/*
3377	 * For each rmrr
3378	 *   for each dev attached to rmrr
3379	 *   do
3380	 *     locate drhd for dev, alloc domain for dev
3381	 *     allocate free domain
3382	 *     allocate page table entries for rmrr
3383	 *     if context not allocated for bus
3384	 *           allocate and init context
3385	 *           set present in root table for this bus
3386	 *     init context with domain, translation etc
3387	 *    endfor
3388	 * endfor
3389	 */
3390	pr_info("Setting RMRR:\n");
3391	for_each_rmrr_units(rmrr) {
3392		/* some BIOSes list non-existent devices in the DMAR table. */
3393		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3394					  i, dev) {
3395			ret = iommu_prepare_rmrr_dev(rmrr, dev);
3396			if (ret)
3397				pr_err("Mapping reserved region failed\n");
 
3398		}
3399	}
3400
3401	iommu_prepare_isa();
3402
3403domains_done:
3404
3405	/*
3406	 * for each drhd
3407	 *   enable fault log
3408	 *   global invalidate context cache
3409	 *   global invalidate iotlb
3410	 *   enable translation
3411	 */
3412	for_each_iommu(iommu, drhd) {
3413		if (drhd->ignored) {
3414			/*
3415			 * we always have to disable PMRs or DMA may fail on
3416			 * this device
3417			 */
3418			if (force_on)
3419				iommu_disable_protect_mem_regions(iommu);
3420			continue;
3421		}
 
3422
3423		iommu_flush_write_buffer(iommu);
3424
3425#ifdef CONFIG_INTEL_IOMMU_SVM
3426		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3427			ret = intel_svm_enable_prq(iommu);
3428			if (ret)
3429				goto free_iommu;
3430		}
3431#endif
3432		ret = dmar_set_interrupt(iommu);
3433		if (ret)
3434			goto free_iommu;
3435
3436		if (!translation_pre_enabled(iommu))
3437			iommu_enable_translation(iommu);
3438
3439		iommu_disable_protect_mem_regions(iommu);
3440	}
3441
3442	return 0;
3443
3444free_iommu:
3445	for_each_active_iommu(iommu, drhd) {
3446		disable_dmar_iommu(iommu);
3447		free_dmar_iommu(iommu);
 
3448	}
3449
3450	kfree(g_iommus);
3451
3452error:
3453	return ret;
3454}
3455
3456/* This takes a number of _MM_ pages, not VTD pages */
3457static unsigned long intel_alloc_iova(struct device *dev,
3458				     struct dmar_domain *domain,
3459				     unsigned long nrpages, uint64_t dma_mask)
3460{
3461	unsigned long iova_pfn = 0;
 
3462
3463	/* Restrict dma_mask to the width that the iommu can handle */
3464	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3465	/* Ensure we reserve the whole size-aligned region */
3466	nrpages = __roundup_pow_of_two(nrpages);
3467
3468	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3469		/*
3470		 * First try to allocate an io virtual address in
3471		 * DMA_BIT_MASK(32) and if that fails then try allocating
3472		 * from higher range
3473		 */
3474		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3475					   IOVA_PFN(DMA_BIT_MASK(32)), false);
3476		if (iova_pfn)
3477			return iova_pfn;
3478	}
3479	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3480				   IOVA_PFN(dma_mask), true);
3481	if (unlikely(!iova_pfn)) {
3482		pr_err("Allocating %ld-page iova for %s failed\n",
3483		       nrpages, dev_name(dev));
3484		return 0;
3485	}
3486
3487	return iova_pfn;
3488}
3489
3490static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3491{
3492	struct dmar_domain *domain, *tmp;
3493	struct dmar_rmrr_unit *rmrr;
3494	struct device *i_dev;
3495	int i, ret;
3496
3497	domain = find_domain(dev);
3498	if (domain)
3499		goto out;
3500
3501	domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3502	if (!domain)
3503		goto out;
3504
3505	/* We have a new domain - setup possible RMRRs for the device */
3506	rcu_read_lock();
3507	for_each_rmrr_units(rmrr) {
3508		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3509					  i, i_dev) {
3510			if (i_dev != dev)
3511				continue;
3512
3513			ret = domain_prepare_identity_map(dev, domain,
3514							  rmrr->base_address,
3515							  rmrr->end_address);
3516			if (ret)
3517				dev_err(dev, "Mapping reserved region failed\n");
3518		}
3519	}
3520	rcu_read_unlock();
3521
3522	tmp = set_domain_for_dev(dev, domain);
3523	if (!tmp || domain != tmp) {
3524		domain_exit(domain);
3525		domain = tmp;
3526	}
3527
3528out:
3529
3530	if (!domain)
3531		pr_err("Allocating domain for %s failed\n", dev_name(dev));
3532
3533
3534	return domain;
3535}
3536
3537/* Check if the dev needs to go through the non-identity map and unmap process. */
3538static int iommu_no_mapping(struct device *dev)
3539{
 
3540	int found;
3541
3542	if (iommu_dummy(dev))
3543		return 1;
3544
3545	if (!iommu_identity_mapping)
3546		return 0;
3547
3548	found = identity_mapping(dev);
3549	if (found) {
3550		if (iommu_should_identity_map(dev, 0))
3551			return 1;
3552		else {
3553			/*
3554			 * The 32 bit DMA device is removed from si_domain and
3555			 * falls back to non-identity mapping.
3556			 */
3557			dmar_remove_one_dev_info(si_domain, dev);
3558			pr_info("32bit %s uses non-identity mapping\n",
3559				dev_name(dev));
3560			return 0;
3561		}
3562	} else {
3563		/*
3564		 * If a 64 bit DMA device was detached from a VM, the device
3565		 * is put back into si_domain for identity mapping.
3566		 */
3567		if (iommu_should_identity_map(dev, 0)) {
3568			int ret;
3569			ret = domain_add_dev_info(si_domain, dev);
3570			if (!ret) {
3571				pr_info("64bit %s uses identity mapping\n",
3572					dev_name(dev));
3573				return 1;
3574			}
3575		}
3576	}
3577
3578	return 0;
3579}
3580
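/*
 * Map one physically contiguous buffer for streaming DMA: allocate an IOVA
 * range below @dma_mask, install the page table entries, flush the IOTLB
 * (or the write buffer) as required, and return the bus address including
 * the offset of @paddr within its page.
 */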
3581static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3582				     size_t size, int dir, u64 dma_mask)
3583{
 
3584	struct dmar_domain *domain;
3585	phys_addr_t start_paddr;
3586	unsigned long iova_pfn;
3587	int prot = 0;
3588	int ret;
3589	struct intel_iommu *iommu;
3590	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3591
3592	BUG_ON(dir == DMA_NONE);
3593
3594	if (iommu_no_mapping(dev))
3595		return paddr;
3596
3597	domain = get_valid_domain_for_dev(dev);
3598	if (!domain)
3599		return 0;
3600
3601	iommu = domain_get_iommu(domain);
3602	size = aligned_nrpages(paddr, size);
3603
3604	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3605	if (!iova_pfn)
3606		goto error;
3607
3608	/*
3609	 * Check if DMAR supports zero-length reads on write only
3610	 * mappings.
3611	 */
3612	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3613			!cap_zlr(iommu->cap))
3614		prot |= DMA_PTE_READ;
3615	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3616		prot |= DMA_PTE_WRITE;
3617	/*
3618	 * paddr to (paddr + size) might cover only part of a page; map the
3619	 * whole page.  Note: if two parts of one page are mapped separately,
3620	 * we might end up with two guest addresses mapping to the same host
3621	 * paddr, but this is not a big problem.
3622	 */
3623	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3624				 mm_to_dma_pfn(paddr_pfn), size, prot);
3625	if (ret)
3626		goto error;
3627
3628	/* It's a non-present to present mapping. Only flush the IOTLB in caching mode. */
3629	if (cap_caching_mode(iommu->cap))
3630		iommu_flush_iotlb_psi(iommu, domain,
3631				      mm_to_dma_pfn(iova_pfn),
3632				      size, 0, 1);
3633	else
3634		iommu_flush_write_buffer(iommu);
3635
3636	start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3637	start_paddr += paddr & ~PAGE_MASK;
3638	return start_paddr;
3639
3640error:
3641	if (iova_pfn)
3642		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3643	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3644		dev_name(dev), size, (unsigned long long)paddr, dir);
3645	return 0;
3646}
3647
3648static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3649				 unsigned long offset, size_t size,
3650				 enum dma_data_direction dir,
3651				 unsigned long attrs)
3652{
3653	return __intel_map_single(dev, page_to_phys(page) + offset, size,
3654				  dir, *dev->dma_mask);
3655}
3656
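/*
 * Tear down a streaming DMA mapping: clear the page tables for the IOVA
 * range, then either flush the IOTLB and free the range immediately
 * (intel_iommu_strict) or defer both via the flush queue.
 */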
3657static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3658{
 
3659	struct dmar_domain *domain;
3660	unsigned long start_pfn, last_pfn;
3661	unsigned long nrpages;
3662	unsigned long iova_pfn;
3663	struct intel_iommu *iommu;
3664	struct page *freelist;
3665
3666	if (iommu_no_mapping(dev))
3667		return;
3668
3669	domain = find_domain(dev);
3670	BUG_ON(!domain);
3671
3672	iommu = domain_get_iommu(domain);
3673
3674	iova_pfn = IOVA_PFN(dev_addr);
3675
3676	nrpages = aligned_nrpages(dev_addr, size);
3677	start_pfn = mm_to_dma_pfn(iova_pfn);
3678	last_pfn = start_pfn + nrpages - 1;
3679
3680	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3681		 dev_name(dev), start_pfn, last_pfn);
3682
3683	freelist = domain_unmap(domain, start_pfn, last_pfn);
 
3684
3685	if (intel_iommu_strict) {
3686		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3687				      nrpages, !freelist, 0);
3688		/* free iova */
3689		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3690		dma_free_pagelist(freelist);
3691	} else {
3692		queue_iova(&domain->iovad, iova_pfn, nrpages,
3693			   (unsigned long)freelist);
3694		/*
3695		 * Queue up the release of the unmap to save the 1/6th of the
3696		 * CPU time otherwise used up by the iotlb flush operation.
3697		 */
3698	}
3699}
3700
3701static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3702			     size_t size, enum dma_data_direction dir,
3703			     unsigned long attrs)
3704{
3705	intel_unmap(dev, dev_addr, size);
3706}
3707
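/*
 * Coherent allocations come from dma_direct and are then mapped through
 * the IOMMU, unless the device bypasses translation entirely.
 */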
3708static void *intel_alloc_coherent(struct device *dev, size_t size,
3709				  dma_addr_t *dma_handle, gfp_t flags,
3710				  unsigned long attrs)
3711{
3712	void *vaddr;
3713
3714	vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3715	if (iommu_no_mapping(dev) || !vaddr)
3716		return vaddr;
3717
3718	*dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3719			PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3720			dev->coherent_dma_mask);
3721	if (!*dma_handle)
3722		goto out_free_pages;
3723	return vaddr;
3724
3725out_free_pages:
3726	dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3727	return NULL;
3728}
3729
3730static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3731				dma_addr_t dma_handle, unsigned long attrs)
3732{
3733	if (!iommu_no_mapping(dev))
3734		intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3735	dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3736}
3737
3738static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3739			   int nelems, enum dma_data_direction dir,
3740			   unsigned long attrs)
3741{
3742	dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3743	unsigned long nrpages = 0;
3744	struct scatterlist *sg;
3745	int i;
3746
3747	for_each_sg(sglist, sg, nelems, i) {
3748		nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3749	}
3750
3751	intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3752}
3753
3754static int intel_nontranslate_map_sg(struct device *hddev,
3755	struct scatterlist *sglist, int nelems, int dir)
3756{
3757	int i;
3758	struct scatterlist *sg;
3759
3760	for_each_sg(sglist, sg, nelems, i) {
3761		BUG_ON(!sg_page(sg));
3762		sg->dma_address = sg_phys(sg);
3763		sg->dma_length = sg->length;
3764	}
3765	return nelems;
3766}
3767
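/*
 * Map a scatterlist: sum the page count over all segments, allocate one
 * IOVA range large enough for the whole list, and map every segment into
 * that range with a single domain_sg_mapping() call.
 */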
3768static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3769			enum dma_data_direction dir, unsigned long attrs)
3770{
3771	int i;
 
3772	struct dmar_domain *domain;
3773	size_t size = 0;
3774	int prot = 0;
3775	unsigned long iova_pfn;
3776	int ret;
3777	struct scatterlist *sg;
3778	unsigned long start_vpfn;
3779	struct intel_iommu *iommu;
3780
3781	BUG_ON(dir == DMA_NONE);
3782	if (iommu_no_mapping(dev))
3783		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3784
3785	domain = get_valid_domain_for_dev(dev);
3786	if (!domain)
3787		return 0;
3788
3789	iommu = domain_get_iommu(domain);
3790
3791	for_each_sg(sglist, sg, nelems, i)
3792		size += aligned_nrpages(sg->offset, sg->length);
3793
3794	iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3795				*dev->dma_mask);
3796	if (!iova_pfn) {
3797		sglist->dma_length = 0;
3798		return 0;
3799	}
3800
3801	/*
3802	 * Check if DMAR supports zero-length reads on write only
3803	 * mappings..
3804	 */
3805	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3806			!cap_zlr(iommu->cap))
3807		prot |= DMA_PTE_READ;
3808	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3809		prot |= DMA_PTE_WRITE;
3810
3811	start_vpfn = mm_to_dma_pfn(iova_pfn);
3812
3813	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3814	if (unlikely(ret)) {
3815		dma_pte_free_pagetable(domain, start_vpfn,
3816				       start_vpfn + size - 1,
3817				       agaw_to_level(domain->agaw) + 1);
3818		free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3819		return 0;
3820	}
3821
3822	/* It's a non-present to present mapping. Only flush the IOTLB in caching mode. */
3823	if (cap_caching_mode(iommu->cap))
3824		iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
3825	else
3826		iommu_flush_write_buffer(iommu);
3827
3828	return nelems;
3829}
3830
3831static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3832{
3833	return !dma_addr;
3834}
3835
3836const struct dma_map_ops intel_dma_ops = {
3837	.alloc = intel_alloc_coherent,
3838	.free = intel_free_coherent,
3839	.map_sg = intel_map_sg,
3840	.unmap_sg = intel_unmap_sg,
3841	.map_page = intel_map_page,
3842	.unmap_page = intel_unmap_page,
3843	.mapping_error = intel_mapping_error,
3844#ifdef CONFIG_X86
3845	.dma_supported = dma_direct_supported,
3846#endif
3847};
3848
3849static inline int iommu_domain_cache_init(void)
3850{
3851	int ret = 0;
3852
3853	iommu_domain_cache = kmem_cache_create("iommu_domain",
3854					 sizeof(struct dmar_domain),
3855					 0,
3856					 SLAB_HWCACHE_ALIGN,
3857
3858					 NULL);
3859	if (!iommu_domain_cache) {
3860		pr_err("Couldn't create iommu_domain cache\n");
3861		ret = -ENOMEM;
3862	}
3863
3864	return ret;
3865}
3866
3867static inline int iommu_devinfo_cache_init(void)
3868{
3869	int ret = 0;
3870
3871	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3872					 sizeof(struct device_domain_info),
3873					 0,
3874					 SLAB_HWCACHE_ALIGN,
3875					 NULL);
3876	if (!iommu_devinfo_cache) {
3877		pr_err("Couldn't create devinfo cache\n");
3878		ret = -ENOMEM;
3879	}
3880
3881	return ret;
3882}
3883
3884static int __init iommu_init_mempool(void)
3885{
3886	int ret;
3887	ret = iova_cache_get();
3888	if (ret)
3889		return ret;
3890
3891	ret = iommu_domain_cache_init();
3892	if (ret)
3893		goto domain_error;
3894
3895	ret = iommu_devinfo_cache_init();
3896	if (!ret)
3897		return ret;
3898
3899	kmem_cache_destroy(iommu_domain_cache);
3900domain_error:
3901	iova_cache_put();
3902
3903	return -ENOMEM;
3904}
3905
3906static void __init iommu_exit_mempool(void)
3907{
3908	kmem_cache_destroy(iommu_devinfo_cache);
3909	kmem_cache_destroy(iommu_domain_cache);
3910	iova_cache_put();
 
3911}
3912
3913static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3914{
3915	struct dmar_drhd_unit *drhd;
3916	u32 vtbar;
3917	int rc;
3918
3919	/* We know that this device on this chipset has its own IOMMU.
3920	 * If we find it under a different IOMMU, then the BIOS is lying
3921	 * to us. Hope that the IOMMU for this device is actually
3922	 * disabled, and it needs no translation...
3923	 */
3924	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3925	if (rc) {
3926		/* "can't" happen */
3927		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3928		return;
3929	}
3930	vtbar &= 0xffff0000;
3931
3932	/* we know that this iommu should be at offset 0xa000 from vtbar */
3933	drhd = dmar_find_matched_drhd_unit(pdev);
3934	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3935			    TAINT_FIRMWARE_WORKAROUND,
3936			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3937		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3938}
3939DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3940
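/*
 * Mark DRHD units that can be ignored: units whose device scope lists no
 * existing devices, and units covering only graphics devices when
 * dmar_map_gfx is disabled.  Devices under an ignored graphics-only unit
 * are given a dummy domain so they bypass translation.
 */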
3941static void __init init_no_remapping_devices(void)
3942{
3943	struct dmar_drhd_unit *drhd;
3944	struct device *dev;
3945	int i;
3946
3947	for_each_drhd_unit(drhd) {
3948		if (!drhd->include_all) {
3949			for_each_active_dev_scope(drhd->devices,
3950						  drhd->devices_cnt, i, dev)
3951				break;
3952			/* ignore DMAR unit if no devices exist */
 
3953			if (i == drhd->devices_cnt)
3954				drhd->ignored = 1;
3955		}
3956	}
3957
3958	for_each_active_drhd_unit(drhd) {
3959		if (drhd->include_all)
 
3960			continue;
3961
3962		for_each_active_dev_scope(drhd->devices,
3963					  drhd->devices_cnt, i, dev)
3964			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3965				break;
 
3966		if (i < drhd->devices_cnt)
3967			continue;
3968
3969		/* This IOMMU has *only* gfx devices. Either bypass it or
3970		   set the gfx_mapped flag, as appropriate */
3971		if (dmar_map_gfx) {
3972			intel_iommu_gfx_mapped = 1;
3973		} else {
3974			drhd->ignored = 1;
3975			for_each_active_dev_scope(drhd->devices,
3976						  drhd->devices_cnt, i, dev)
3977				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3978		}
3979	}
3980}
3981
3982#ifdef CONFIG_SUSPEND
3983static int init_iommu_hw(void)
3984{
3985	struct dmar_drhd_unit *drhd;
3986	struct intel_iommu *iommu = NULL;
3987
3988	for_each_active_iommu(iommu, drhd)
3989		if (iommu->qi)
3990			dmar_reenable_qi(iommu);
3991
3992	for_each_iommu(iommu, drhd) {
3993		if (drhd->ignored) {
3994			/*
3995			 * we always have to disable PMRs or DMA may fail on
3996			 * this device
3997			 */
3998			if (force_on)
3999				iommu_disable_protect_mem_regions(iommu);
4000			continue;
4001		}
4002
4003		iommu_flush_write_buffer(iommu);
4004
4005		iommu_set_root_entry(iommu);
4006
4007		iommu->flush.flush_context(iommu, 0, 0, 0,
4008					   DMA_CCMD_GLOBAL_INVL);
4009		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4010		iommu_enable_translation(iommu);
4011		iommu_disable_protect_mem_regions(iommu);
4012	}
4013
4014	return 0;
4015}
4016
4017static void iommu_flush_all(void)
4018{
4019	struct dmar_drhd_unit *drhd;
4020	struct intel_iommu *iommu;
4021
4022	for_each_active_iommu(iommu, drhd) {
4023		iommu->flush.flush_context(iommu, 0, 0, 0,
4024					   DMA_CCMD_GLOBAL_INVL);
4025		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4026					 DMA_TLB_GLOBAL_FLUSH);
4027	}
4028}
4029
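/*
 * Suspend path: flush all caches, disable translation and save the
 * fault-event registers so iommu_resume() can restore them after S3.
 */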
4030static int iommu_suspend(void)
4031{
4032	struct dmar_drhd_unit *drhd;
4033	struct intel_iommu *iommu = NULL;
4034	unsigned long flag;
4035
4036	for_each_active_iommu(iommu, drhd) {
4037		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
4038						 GFP_ATOMIC);
4039		if (!iommu->iommu_state)
4040			goto nomem;
4041	}
4042
4043	iommu_flush_all();
4044
4045	for_each_active_iommu(iommu, drhd) {
4046		iommu_disable_translation(iommu);
4047
4048		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4049
4050		iommu->iommu_state[SR_DMAR_FECTL_REG] =
4051			readl(iommu->reg + DMAR_FECTL_REG);
4052		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4053			readl(iommu->reg + DMAR_FEDATA_REG);
4054		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4055			readl(iommu->reg + DMAR_FEADDR_REG);
4056		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4057			readl(iommu->reg + DMAR_FEUADDR_REG);
4058
4059		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4060	}
4061	return 0;
4062
4063nomem:
4064	for_each_active_iommu(iommu, drhd)
4065		kfree(iommu->iommu_state);
4066
4067	return -ENOMEM;
4068}
4069
4070static void iommu_resume(void)
4071{
4072	struct dmar_drhd_unit *drhd;
4073	struct intel_iommu *iommu = NULL;
4074	unsigned long flag;
4075
4076	if (init_iommu_hw()) {
4077		if (force_on)
4078			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4079		else
4080			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4081		return;
4082	}
4083
4084	for_each_active_iommu(iommu, drhd) {
4085
4086		raw_spin_lock_irqsave(&iommu->register_lock, flag);
4087
4088		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4089			iommu->reg + DMAR_FECTL_REG);
4090		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4091			iommu->reg + DMAR_FEDATA_REG);
4092		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4093			iommu->reg + DMAR_FEADDR_REG);
4094		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4095			iommu->reg + DMAR_FEUADDR_REG);
4096
4097		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4098	}
4099
4100	for_each_active_iommu(iommu, drhd)
4101		kfree(iommu->iommu_state);
4102}
4103
4104static struct syscore_ops iommu_syscore_ops = {
4105	.resume		= iommu_resume,
4106	.suspend	= iommu_suspend,
4107};
4108
4109static void __init init_iommu_pm_ops(void)
4110{
4111	register_syscore_ops(&iommu_syscore_ops);
4112}
4113
4114#else
4115static inline void init_iommu_pm_ops(void) {}
4116#endif	/* CONFIG_SUSPEND */
4117
4118
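/*
 * Parse one RMRR entry from the DMAR table: record the reserved range,
 * pre-allocate the matching IOMMU_RESV_DIRECT region and capture the
 * device scope it applies to.
 */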
4119int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4120{
4121	struct acpi_dmar_reserved_memory *rmrr;
4122	int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4123	struct dmar_rmrr_unit *rmrru;
4124	size_t length;
4125
4126	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4127	if (!rmrru)
4128		goto out;
4129
4130	rmrru->hdr = header;
4131	rmrr = (struct acpi_dmar_reserved_memory *)header;
4132	rmrru->base_address = rmrr->base_address;
4133	rmrru->end_address = rmrr->end_address;
4134
4135	length = rmrr->end_address - rmrr->base_address + 1;
4136	rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4137					      IOMMU_RESV_DIRECT);
4138	if (!rmrru->resv)
4139		goto free_rmrru;
4140
4141	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4142				((void *)rmrr) + rmrr->header.length,
4143				&rmrru->devices_cnt);
4144	if (rmrru->devices_cnt && rmrru->devices == NULL)
4145		goto free_all;
4146
4147	list_add(&rmrru->list, &dmar_rmrr_units);
4148
4149	return 0;
4150free_all:
4151	kfree(rmrru->resv);
4152free_rmrru:
4153	kfree(rmrru);
4154out:
4155	return -ENOMEM;
4156}
4157
4158static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4159{
4160	struct dmar_atsr_unit *atsru;
4161	struct acpi_dmar_atsr *tmp;
4162
4163	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4164		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4165		if (atsr->segment != tmp->segment)
4166			continue;
4167		if (atsr->header.length != tmp->header.length)
4168			continue;
4169		if (memcmp(atsr, tmp, atsr->header.length) == 0)
4170			return atsru;
4171	}
4172
4173	return NULL;
4174}
4175
4176int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4177{
4178	struct acpi_dmar_atsr *atsr;
4179	struct dmar_atsr_unit *atsru;
4180
4181	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4182		return 0;
4183
4184	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4185	atsru = dmar_find_atsr(atsr);
4186	if (atsru)
4187		return 0;
4188
4189	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4190	if (!atsru)
4191		return -ENOMEM;
4192
4193	/*
4194	 * If memory is allocated from slab by ACPI _DSM method, we need to
4195	 * copy the memory content because the memory buffer will be freed
4196	 * on return.
4197	 */
4198	atsru->hdr = (void *)(atsru + 1);
4199	memcpy(atsru->hdr, hdr, hdr->length);
4200	atsru->include_all = atsr->flags & 0x1;
4201	if (!atsru->include_all) {
4202		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4203				(void *)atsr + atsr->header.length,
4204				&atsru->devices_cnt);
4205		if (atsru->devices_cnt && atsru->devices == NULL) {
4206			kfree(atsru);
4207			return -ENOMEM;
4208		}
4209	}
4210
4211	list_add_rcu(&atsru->list, &dmar_atsr_units);
4212
4213	return 0;
4214}
4215
4216static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4217{
4218	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4219	kfree(atsru);
4220}
4221
4222int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4223{
4224	struct acpi_dmar_atsr *atsr;
4225	struct dmar_atsr_unit *atsru;
4226
4227	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4228	atsru = dmar_find_atsr(atsr);
4229	if (atsru) {
4230		list_del_rcu(&atsru->list);
4231		synchronize_rcu();
4232		intel_iommu_free_atsr(atsru);
4233	}
4234
4235	return 0;
4236}
4237
4238int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4239{
4240	int i;
4241	struct device *dev;
4242	struct acpi_dmar_atsr *atsr;
4243	struct dmar_atsr_unit *atsru;
4244
4245	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4246	atsru = dmar_find_atsr(atsr);
4247	if (!atsru)
4248		return 0;
4249
4250	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4251		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4252					  i, dev)
4253			return -EBUSY;
4254	}
4255
4256	return 0;
4257}
4258
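/*
 * Bring up a hot-added DMAR unit: check that it supports the features the
 * running configuration relies on (pass-through, snooping, superpages),
 * allocate its domain and root-entry state, then enable translation.
 */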
4259static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4260{
4261	int sp, ret = 0;
4262	struct intel_iommu *iommu = dmaru->iommu;
4263
4264	if (g_iommus[iommu->seq_id])
4265		return 0;
4266
4267	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4268		pr_warn("%s: Doesn't support hardware pass through.\n",
4269			iommu->name);
4270		return -ENXIO;
4271	}
4272	if (!ecap_sc_support(iommu->ecap) &&
4273	    domain_update_iommu_snooping(iommu)) {
4274		pr_warn("%s: Doesn't support snooping.\n",
4275			iommu->name);
4276		return -ENXIO;
4277	}
4278	sp = domain_update_iommu_superpage(iommu) - 1;
4279	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4280		pr_warn("%s: Doesn't support large page.\n",
4281			iommu->name);
4282		return -ENXIO;
4283	}
4284
4285	/*
4286	 * Disable translation if already enabled prior to OS handover.
4287	 */
4288	if (iommu->gcmd & DMA_GCMD_TE)
4289		iommu_disable_translation(iommu);
4290
4291	g_iommus[iommu->seq_id] = iommu;
4292	ret = iommu_init_domains(iommu);
4293	if (ret == 0)
4294		ret = iommu_alloc_root_entry(iommu);
4295	if (ret)
4296		goto out;
4297
4298#ifdef CONFIG_INTEL_IOMMU_SVM
4299	if (pasid_enabled(iommu))
4300		intel_svm_alloc_pasid_tables(iommu);
4301#endif
4302
4303	if (dmaru->ignored) {
4304		/*
4305		 * we always have to disable PMRs or DMA may fail on this device
4306		 */
4307		if (force_on)
4308			iommu_disable_protect_mem_regions(iommu);
4309		return 0;
4310	}
4311
4312	intel_iommu_init_qi(iommu);
4313	iommu_flush_write_buffer(iommu);
4314
4315#ifdef CONFIG_INTEL_IOMMU_SVM
4316	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4317		ret = intel_svm_enable_prq(iommu);
4318		if (ret)
4319			goto disable_iommu;
4320	}
4321#endif
4322	ret = dmar_set_interrupt(iommu);
4323	if (ret)
4324		goto disable_iommu;
4325
4326	iommu_set_root_entry(iommu);
4327	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4328	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4329	iommu_enable_translation(iommu);
4330
4331	iommu_disable_protect_mem_regions(iommu);
4332	return 0;
4333
4334disable_iommu:
4335	disable_dmar_iommu(iommu);
4336out:
4337	free_dmar_iommu(iommu);
4338	return ret;
4339}
4340
4341int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4342{
4343	int ret = 0;
4344	struct intel_iommu *iommu = dmaru->iommu;
4345
4346	if (!intel_iommu_enabled)
4347		return 0;
4348	if (iommu == NULL)
4349		return -EINVAL;
4350
4351	if (insert) {
4352		ret = intel_iommu_add(dmaru);
4353	} else {
4354		disable_dmar_iommu(iommu);
4355		free_dmar_iommu(iommu);
4356	}
4357
4358	return ret;
4359}
4360
4361static void intel_iommu_free_dmars(void)
4362{
4363	struct dmar_rmrr_unit *rmrru, *rmrr_n;
4364	struct dmar_atsr_unit *atsru, *atsr_n;
4365
4366	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4367		list_del(&rmrru->list);
4368		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4369		kfree(rmrru->resv);
4370		kfree(rmrru);
4371	}
4372
4373	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4374		list_del(&atsru->list);
4375		intel_iommu_free_atsr(atsru);
4376	}
4377}
4378
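/*
 * Decide whether ATS may be used for @dev: walk up to its root port and
 * check whether that port (or an include-all entry) appears in the ATSR
 * device scopes for the device's PCI segment.
 */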
4379int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4380{
4381	int i, ret = 1;
4382	struct pci_bus *bus;
4383	struct pci_dev *bridge = NULL;
4384	struct device *tmp;
4385	struct acpi_dmar_atsr *atsr;
4386	struct dmar_atsr_unit *atsru;
4387
4388	dev = pci_physfn(dev);
4389	for (bus = dev->bus; bus; bus = bus->parent) {
4390		bridge = bus->self;
4391		/* If it's an integrated device, allow ATS */
4392		if (!bridge)
4393			return 1;
4394		/* Connected via non-PCIe: no ATS */
4395		if (!pci_is_pcie(bridge) ||
4396		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4397			return 0;
4398		/* If we found the root port, look it up in the ATSR */
4399		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4400			break;
4401	}
4402
4403	rcu_read_lock();
4404	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4405		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4406		if (atsr->segment != pci_domain_nr(dev->bus))
4407			continue;
4408
4409		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4410			if (tmp == &bridge->dev)
4411				goto out;
4412
4413		if (atsru->include_all)
4414			goto out;
4415	}
4416	ret = 0;
4417out:
4418	rcu_read_unlock();
4419
4420	return ret;
4421}
4422
4423int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4424{
4425	int ret = 0;
4426	struct dmar_rmrr_unit *rmrru;
4427	struct dmar_atsr_unit *atsru;
4428	struct acpi_dmar_atsr *atsr;
4429	struct acpi_dmar_reserved_memory *rmrr;
4430
4431	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4432		return 0;
4433
4434	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4435		rmrr = container_of(rmrru->hdr,
4436				    struct acpi_dmar_reserved_memory, header);
4437		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4438			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4439				((void *)rmrr) + rmrr->header.length,
4440				rmrr->segment, rmrru->devices,
4441				rmrru->devices_cnt);
4442			if (ret < 0)
4443				return ret;
4444		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4445			dmar_remove_dev_scope(info, rmrr->segment,
4446				rmrru->devices, rmrru->devices_cnt);
4447		}
4448	}
4449
4450	list_for_each_entry(atsru, &dmar_atsr_units, list) {
4451		if (atsru->include_all)
4452			continue;
4453
4454		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4455		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4456			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4457					(void *)atsr + atsr->header.length,
4458					atsr->segment, atsru->devices,
4459					atsru->devices_cnt);
4460			if (ret > 0)
4461				break;
4462			else if (ret < 0)
4463				return ret;
4464		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4465			if (dmar_remove_dev_scope(info, atsr->segment,
4466					atsru->devices, atsru->devices_cnt))
4467				break;
4468		}
4469	}
4470
4471	return 0;
4472}
4473
4474/*
4475 * Here we only respond to a device being unbound from its driver.
4476 *
4477 * An added device is not attached to its DMAR domain here yet. That will
4478 * happen when the device is first mapped to an iova.
4479 */
4480static int device_notifier(struct notifier_block *nb,
4481				  unsigned long action, void *data)
4482{
4483	struct device *dev = data;
 
4484	struct dmar_domain *domain;
4485
4486	if (iommu_dummy(dev))
4487		return 0;
4488
4489	if (action != BUS_NOTIFY_REMOVED_DEVICE)
 
4490		return 0;
4491
4492	domain = find_domain(dev);
4493	if (!domain)
4494		return 0;
4495
4496	dmar_remove_one_dev_info(domain, dev);
4497	if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4498		domain_exit(domain);
4499
4500	return 0;
4501}
4502
4503static struct notifier_block device_nb = {
4504	.notifier_call = device_notifier,
4505};
4506
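/*
 * Keep the si_domain identity map in sync with memory hotplug: ranges
 * going online are identity mapped, and offlined ranges are unmapped,
 * flushed from the IOTLBs and returned to the iova allocator.
 */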
4507static int intel_iommu_memory_notifier(struct notifier_block *nb,
4508				       unsigned long val, void *v)
4509{
4510	struct memory_notify *mhp = v;
4511	unsigned long long start, end;
4512	unsigned long start_vpfn, last_vpfn;
4513
4514	switch (val) {
4515	case MEM_GOING_ONLINE:
4516		start = mhp->start_pfn << PAGE_SHIFT;
4517		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4518		if (iommu_domain_identity_map(si_domain, start, end)) {
4519			pr_warn("Failed to build identity map for [%llx-%llx]\n",
4520				start, end);
4521			return NOTIFY_BAD;
4522		}
4523		break;
4524
4525	case MEM_OFFLINE:
4526	case MEM_CANCEL_ONLINE:
4527		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4528		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4529		while (start_vpfn <= last_vpfn) {
4530			struct iova *iova;
4531			struct dmar_drhd_unit *drhd;
4532			struct intel_iommu *iommu;
4533			struct page *freelist;
4534
4535			iova = find_iova(&si_domain->iovad, start_vpfn);
4536			if (iova == NULL) {
4537				pr_debug("Failed to get IOVA for PFN %lx\n",
4538					 start_vpfn);
4539				break;
4540			}
4541
4542			iova = split_and_remove_iova(&si_domain->iovad, iova,
4543						     start_vpfn, last_vpfn);
4544			if (iova == NULL) {
4545				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4546					start_vpfn, last_vpfn);
4547				return NOTIFY_BAD;
4548			}
4549
4550			freelist = domain_unmap(si_domain, iova->pfn_lo,
4551					       iova->pfn_hi);
4552
4553			rcu_read_lock();
4554			for_each_active_iommu(iommu, drhd)
4555				iommu_flush_iotlb_psi(iommu, si_domain,
4556					iova->pfn_lo, iova_size(iova),
4557					!freelist, 0);
4558			rcu_read_unlock();
4559			dma_free_pagelist(freelist);
4560
4561			start_vpfn = iova->pfn_hi + 1;
4562			free_iova_mem(iova);
4563		}
4564		break;
4565	}
4566
4567	return NOTIFY_OK;
4568}
4569
4570static struct notifier_block intel_iommu_memory_nb = {
4571	.notifier_call = intel_iommu_memory_notifier,
4572	.priority = 0
4573};
4574
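/*
 * Called when a CPU goes offline: return that CPU's per-CPU cached iovas
 * from every domain on every IOMMU back to the global iova pools.
 */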
4575static void free_all_cpu_cached_iovas(unsigned int cpu)
4576{
4577	int i;
4578
4579	for (i = 0; i < g_num_of_iommus; i++) {
4580		struct intel_iommu *iommu = g_iommus[i];
4581		struct dmar_domain *domain;
4582		int did;
4583
4584		if (!iommu)
4585			continue;
4586
4587		for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4588			domain = get_iommu_domain(iommu, (u16)did);
4589
4590			if (!domain)
4591				continue;
4592			free_cpu_cached_iovas(cpu, &domain->iovad);
4593		}
4594	}
4595}
4596
4597static int intel_iommu_cpu_dead(unsigned int cpu)
4598{
4599	free_all_cpu_cached_iovas(cpu);
4600	return 0;
4601}
4602
4603static void intel_disable_iommus(void)
4604{
4605	struct intel_iommu *iommu = NULL;
4606	struct dmar_drhd_unit *drhd;
4607
4608	for_each_iommu(iommu, drhd)
4609		iommu_disable_translation(iommu);
4610}
4611
4612static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4613{
4614	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4615
4616	return container_of(iommu_dev, struct intel_iommu, iommu);
4617}
4618
4619static ssize_t intel_iommu_show_version(struct device *dev,
4620					struct device_attribute *attr,
4621					char *buf)
4622{
4623	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4624	u32 ver = readl(iommu->reg + DMAR_VER_REG);
4625	return sprintf(buf, "%d:%d\n",
4626		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4627}
4628static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4629
4630static ssize_t intel_iommu_show_address(struct device *dev,
4631					struct device_attribute *attr,
4632					char *buf)
4633{
4634	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4635	return sprintf(buf, "%llx\n", iommu->reg_phys);
4636}
4637static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4638
4639static ssize_t intel_iommu_show_cap(struct device *dev,
4640				    struct device_attribute *attr,
4641				    char *buf)
4642{
4643	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4644	return sprintf(buf, "%llx\n", iommu->cap);
4645}
4646static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4647
4648static ssize_t intel_iommu_show_ecap(struct device *dev,
4649				    struct device_attribute *attr,
4650				    char *buf)
4651{
4652	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4653	return sprintf(buf, "%llx\n", iommu->ecap);
4654}
4655static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4656
4657static ssize_t intel_iommu_show_ndoms(struct device *dev,
4658				      struct device_attribute *attr,
4659				      char *buf)
4660{
4661	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4662	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4663}
4664static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4665
4666static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4667					   struct device_attribute *attr,
4668					   char *buf)
4669{
4670	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4671	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4672						  cap_ndoms(iommu->cap)));
4673}
4674static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4675
4676static struct attribute *intel_iommu_attrs[] = {
4677	&dev_attr_version.attr,
4678	&dev_attr_address.attr,
4679	&dev_attr_cap.attr,
4680	&dev_attr_ecap.attr,
4681	&dev_attr_domains_supported.attr,
4682	&dev_attr_domains_used.attr,
4683	NULL,
4684};
4685
4686static struct attribute_group intel_iommu_group = {
4687	.name = "intel-iommu",
4688	.attrs = intel_iommu_attrs,
4689};
4690
4691const struct attribute_group *intel_iommu_groups[] = {
4692	&intel_iommu_group,
4693	NULL,
4694};
4695
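/*
 * Main VT-d initialization: parse the DMAR table, build the domains and
 * identity maps via init_dmars(), install intel_dma_ops as the DMA API
 * backend, and register the IOMMU driver, sysfs entries and notifiers.
 */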
4696int __init intel_iommu_init(void)
4697{
4698	int ret = -ENODEV;
4699	struct dmar_drhd_unit *drhd;
4700	struct intel_iommu *iommu;
4701
4702	/* VT-d is required for a TXT/tboot launch, so enforce that */
4703	force_on = tboot_force_iommu();
4704
4705	if (iommu_init_mempool()) {
4706		if (force_on)
4707			panic("tboot: Failed to initialize iommu memory\n");
4708		return -ENOMEM;
4709	}
4710
4711	down_write(&dmar_global_lock);
4712	if (dmar_table_init()) {
4713		if (force_on)
4714			panic("tboot: Failed to initialize DMAR table\n");
4715		goto out_free_dmar;
4716	}
4717
4718	if (dmar_dev_scope_init() < 0) {
4719		if (force_on)
4720			panic("tboot: Failed to initialize DMAR device scope\n");
4721		goto out_free_dmar;
4722	}
4723
4724	up_write(&dmar_global_lock);
4725
4726	/*
4727	 * The bus notifier takes the dmar_global_lock, so lockdep would
4728	 * complain later if we registered it while holding the lock.
4729	 */
4730	dmar_register_bus_notifier();
 
4731
4732	down_write(&dmar_global_lock);
4733
4734	if (no_iommu || dmar_disabled) {
4735		/*
4736		 * We exit the function here to ensure the IOMMU's remapping and
4737		 * mempool aren't set up, which means that the IOMMU's PMRs
4738		 * won't be disabled via the call to init_dmars(). So disable
4739		 * them explicitly here. The PMRs were set up by tboot prior to
4740		 * calling SENTER, but the kernel is expected to reset/tear
4741		 * them down.
4742		 */
4743		if (intel_iommu_tboot_noforce) {
4744			for_each_iommu(iommu, drhd)
4745				iommu_disable_protect_mem_regions(iommu);
4746		}
4747
4748		/*
4749		 * Make sure the IOMMUs are switched off, even when we
4750		 * boot into a kexec kernel and the previous kernel left
4751		 * them enabled
4752		 */
4753		intel_disable_iommus();
4754		goto out_free_dmar;
4755	}
4756
4757	if (list_empty(&dmar_rmrr_units))
4758		pr_info("No RMRR found\n");
4759
4760	if (list_empty(&dmar_atsr_units))
4761		pr_info("No ATSR found\n");
4762
4763	if (dmar_init_reserved_ranges()) {
4764		if (force_on)
4765			panic("tboot: Failed to reserve iommu ranges\n");
4766		goto out_free_reserved_range;
4767	}
4768
4769	init_no_remapping_devices();
4770
4771	ret = init_dmars();
4772	if (ret) {
4773		if (force_on)
4774			panic("tboot: Failed to initialize DMARs\n");
4775		pr_err("Initialization failed\n");
4776		goto out_free_reserved_range;
4777	}
4778	up_write(&dmar_global_lock);
4779	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4780
4781#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
 
4782	swiotlb = 0;
4783#endif
4784	dma_ops = &intel_dma_ops;
4785
4786	init_iommu_pm_ops();
4787
4788	for_each_active_iommu(iommu, drhd) {
4789		iommu_device_sysfs_add(&iommu->iommu, NULL,
4790				       intel_iommu_groups,
4791				       "%s", iommu->name);
4792		iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4793		iommu_device_register(&iommu->iommu);
4794	}
4795
4796	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4797	bus_register_notifier(&pci_bus_type, &device_nb);
4798	if (si_domain && !hw_pass_through)
4799		register_memory_notifier(&intel_iommu_memory_nb);
4800	cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4801			  intel_iommu_cpu_dead);
4802	intel_iommu_enabled = 1;
4803
4804	return 0;
4805
4806out_free_reserved_range:
4807	put_iova_domain(&reserved_iova_list);
4808out_free_dmar:
4809	intel_iommu_free_dmars();
4810	up_write(&dmar_global_lock);
4811	iommu_exit_mempool();
4812	return ret;
4813}
4814
4815static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4816{
4817	struct intel_iommu *iommu = opaque;
4818
4819	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4820	return 0;
4821}
4822
4823/*
4824 * NB - intel-iommu lacks any sort of reference counting for the users of
4825 * dependent devices.  If multiple endpoints have intersecting dependent
4826 * devices, unbinding the driver from any one of them will possibly leave
4827 * the others unable to operate.
4828 */
4829static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4830{
4831	if (!iommu || !dev || !dev_is_pci(dev))
4832		return;
4833
4834	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4835}
4836
4837static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4838{
4839	struct intel_iommu *iommu;
4840	unsigned long flags;
4841
4842	assert_spin_locked(&device_domain_lock);
4843
4844	if (WARN_ON(!info))
4845		return;
4846
4847	iommu = info->iommu;
4848
4849	if (info->dev) {
4850		iommu_disable_dev_iotlb(info);
4851		domain_context_clear(iommu, info->dev);
4852	}
4853
4854	unlink_domain_info(info);
4855
4856	spin_lock_irqsave(&iommu->lock, flags);
4857	domain_detach_iommu(info->domain, iommu);
4858	spin_unlock_irqrestore(&iommu->lock, flags);
4859
4860	free_devinfo_mem(info);
4861}
4862
4863static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4864				     struct device *dev)
4865{
4866	struct device_domain_info *info;
4867	unsigned long flags;
4868
4869	spin_lock_irqsave(&device_domain_lock, flags);
4870	info = dev->archdata.iommu;
4871	__dmar_remove_one_dev_info(info);
4872	spin_unlock_irqrestore(&device_domain_lock, flags);
4873}
4874
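/*
 * Initialize a domain created through the IOMMU API: set up its iova
 * allocator and reserved ranges, derive the adjusted guest address width,
 * and allocate the top-level page directory.
 */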
4875static int md_domain_init(struct dmar_domain *domain, int guest_width)
4876{
4877	int adjust_width;
4878
4879	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4880	domain_reserve_special_ranges(domain);
4881
4882	/* calculate AGAW */
4883	domain->gaw = guest_width;
4884	adjust_width = guestwidth_to_adjustwidth(guest_width);
4885	domain->agaw = width_to_agaw(adjust_width);
4886
4887	domain->iommu_coherency = 0;
4888	domain->iommu_snooping = 0;
4889	domain->iommu_superpage = 0;
4890	domain->max_addr = 0;
 
4891
4892	/* always allocate the top pgd */
4893	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4894	if (!domain->pgd)
4895		return -ENOMEM;
4896	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4897	return 0;
4898}
4899
4900static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4901{
4902	struct dmar_domain *dmar_domain;
4903	struct iommu_domain *domain;
4904
4905	if (type != IOMMU_DOMAIN_UNMANAGED)
4906		return NULL;
4907
4908	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4909	if (!dmar_domain) {
4910		pr_err("Can't allocate dmar_domain\n");
4911		return NULL;
 
4912	}
4913	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4914		pr_err("Domain initialization failed\n");
4915		domain_exit(dmar_domain);
4916		return NULL;
 
4917	}
4918	domain_update_iommu_cap(dmar_domain);
 
4919
4920	domain = &dmar_domain->domain;
4921	domain->geometry.aperture_start = 0;
4922	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4923	domain->geometry.force_aperture = true;
4924
4925	return domain;
4926}
4927
4928static void intel_iommu_domain_free(struct iommu_domain *domain)
4929{
4930	domain_exit(to_dmar_domain(domain));
4931}
4932
4933static int intel_iommu_attach_device(struct iommu_domain *domain,
4934				     struct device *dev)
4935{
4936	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 
4937	struct intel_iommu *iommu;
4938	int addr_width;
4939	u8 bus, devfn;
4940
4941	if (device_is_rmrr_locked(dev)) {
4942		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4943		return -EPERM;
4944	}
4945
4946	/* normally dev is not mapped */
4947	if (unlikely(domain_context_mapped(dev))) {
4948		struct dmar_domain *old_domain;
4949
4950		old_domain = find_domain(dev);
4951		if (old_domain) {
4952			rcu_read_lock();
4953			dmar_remove_one_dev_info(old_domain, dev);
4954			rcu_read_unlock();
4955
4956			if (!domain_type_is_vm_or_si(old_domain) &&
4957			     list_empty(&old_domain->devices))
4958				domain_exit(old_domain);
4959		}
4960	}
4961
4962	iommu = device_to_iommu(dev, &bus, &devfn);
 
4963	if (!iommu)
4964		return -ENODEV;
4965
4966	/* check if this iommu agaw is sufficient for max mapped address */
4967	addr_width = agaw_to_width(iommu->agaw);
4968	if (addr_width > cap_mgaw(iommu->cap))
4969		addr_width = cap_mgaw(iommu->cap);
4970
4971	if (dmar_domain->max_addr > (1LL << addr_width)) {
4972		pr_err("%s: iommu width (%d) is not "
4973		       "sufficient for the mapped address (%llx)\n",
4974		       __func__, addr_width, dmar_domain->max_addr);
4975		return -EFAULT;
4976	}
4977	dmar_domain->gaw = addr_width;
4978
4979	/*
4980	 * Knock out extra levels of page tables if necessary
4981	 */
4982	while (iommu->agaw < dmar_domain->agaw) {
4983		struct dma_pte *pte;
4984
4985		pte = dmar_domain->pgd;
4986		if (dma_pte_present(pte)) {
4987			dmar_domain->pgd = (struct dma_pte *)
4988				phys_to_virt(dma_pte_addr(pte));
4989			free_pgtable_page(pte);
4990		}
4991		dmar_domain->agaw--;
4992	}
4993
4994	return domain_add_dev_info(dmar_domain, dev);
4995}
4996
4997static void intel_iommu_detach_device(struct iommu_domain *domain,
4998				      struct device *dev)
4999{
5000	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5001}
5002
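/*
 * IOMMU API map callback: translate IOMMU_* protection flags into VT-d PTE
 * bits, grow the tracked max_addr after checking that it still fits the
 * domain's address width, and install the mapping.
 */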
5003static int intel_iommu_map(struct iommu_domain *domain,
5004			   unsigned long iova, phys_addr_t hpa,
5005			   size_t size, int iommu_prot)
5006{
5007	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5008	u64 max_addr;
5009	int prot = 0;
 
5010	int ret;
5011
5012	if (iommu_prot & IOMMU_READ)
5013		prot |= DMA_PTE_READ;
5014	if (iommu_prot & IOMMU_WRITE)
5015		prot |= DMA_PTE_WRITE;
5016	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5017		prot |= DMA_PTE_SNP;
5018
 
5019	max_addr = iova + size;
5020	if (dmar_domain->max_addr < max_addr) {
5021		u64 end;
5022
5023		/* check if minimum agaw is sufficient for mapped address */
5024		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5025		if (end < max_addr) {
5026			pr_err("%s: iommu width (%d) is not "
5027			       "sufficient for the mapped address (%llx)\n",
5028			       __func__, dmar_domain->gaw, max_addr);
5029			return -EFAULT;
5030		}
5031		dmar_domain->max_addr = max_addr;
5032	}
5033	/* Round up size to next multiple of PAGE_SIZE, if it and
5034	   the low bits of hpa would take us onto the next page */
5035	size = aligned_nrpages(hpa, size);
5036	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5037				 hpa >> VTD_PAGE_SHIFT, size, prot);
5038	return ret;
5039}
5040
5041static size_t intel_iommu_unmap(struct iommu_domain *domain,
5042				unsigned long iova, size_t size)
5043{
5044	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5045	struct page *freelist = NULL;
5046	unsigned long start_pfn, last_pfn;
5047	unsigned int npages;
5048	int iommu_id, level = 0;
5049
5050	/* Cope with horrid API which requires us to unmap more than the
5051	   size argument if it happens to be a large-page mapping. */
5052	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5053
5054	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5055		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5056
5057	start_pfn = iova >> VTD_PAGE_SHIFT;
5058	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5059
5060	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5061
5062	npages = last_pfn - start_pfn + 1;
5063
5064	for_each_domain_iommu(iommu_id, dmar_domain)
5065		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5066				      start_pfn, npages, !freelist, 0);
5067
5068	dma_free_pagelist(freelist);
5069
5070	if (dmar_domain->max_addr == iova + size)
5071		dmar_domain->max_addr = iova;
5072
5073	return size;
5074}
5075
5076static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5077					    dma_addr_t iova)
5078{
5079	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5080	struct dma_pte *pte;
5081	int level = 0;
5082	u64 phys = 0;
5083
5084	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5085	if (pte)
5086		phys = dma_pte_addr(pte);
5087
5088	return phys;
5089}
5090
5091static bool intel_iommu_capable(enum iommu_cap cap)
5092{
5093	if (cap == IOMMU_CAP_CACHE_COHERENCY)
5094		return domain_update_iommu_snooping(NULL) == 1;
5095	if (cap == IOMMU_CAP_INTR_REMAP)
5096		return irq_remapping_enabled == 1;
5097
5098	return false;
5099}
5100
5101static int intel_iommu_add_device(struct device *dev)
5102{
5103	struct intel_iommu *iommu;
5104	struct iommu_group *group;
5105	u8 bus, devfn;
5106
5107	iommu = device_to_iommu(dev, &bus, &devfn);
5108	if (!iommu)
5109		return -ENODEV;
5110
5111	iommu_device_link(&iommu->iommu, dev);
5112
5113	group = iommu_group_get_for_dev(dev);
5114
5115	if (IS_ERR(group))
5116		return PTR_ERR(group);
5117
5118	iommu_group_put(group);
5119	return 0;
5120}
5121
5122static void intel_iommu_remove_device(struct device *dev)
5123{
5124	struct intel_iommu *iommu;
5125	u8 bus, devfn;
5126
5127	iommu = device_to_iommu(dev, &bus, &devfn);
5128	if (!iommu)
5129		return;
5130
5131	iommu_group_remove_device(dev);
5132
5133	iommu_device_unlink(&iommu->iommu, dev);
5134}
5135
5136static void intel_iommu_get_resv_regions(struct device *device,
5137					 struct list_head *head)
5138{
5139	struct iommu_resv_region *reg;
5140	struct dmar_rmrr_unit *rmrr;
5141	struct device *i_dev;
5142	int i;
5143
5144	rcu_read_lock();
5145	for_each_rmrr_units(rmrr) {
5146		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5147					  i, i_dev) {
5148			if (i_dev != device)
5149				continue;
5150
5151			list_add_tail(&rmrr->resv->list, head);
5152		}
5153	}
5154	rcu_read_unlock();
5155
5156	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5157				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5158				      0, IOMMU_RESV_MSI);
5159	if (!reg)
5160		return;
5161	list_add_tail(&reg->list, head);
5162}
5163
5164static void intel_iommu_put_resv_regions(struct device *dev,
5165					 struct list_head *head)
5166{
5167	struct iommu_resv_region *entry, *next;
5168
5169	list_for_each_entry_safe(entry, next, head, list) {
5170		if (entry->type == IOMMU_RESV_RESERVED)
5171			kfree(entry);
5172	}
5173}
5174
5175#ifdef CONFIG_INTEL_IOMMU_SVM
5176#define MAX_NR_PASID_BITS (20)
5177static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5178{
5179	/*
5180	 * Convert ecap_pss to the extended context entry pts encoding, also
5181	 * respect the soft pasid_max value set by the iommu.
5182	 * - number of PASID bits = ecap_pss + 1
5183	 * - number of PASID table entries = 2^(pts + 5)
5184	 * Therefore, pts = ecap_pss - 4
5185	 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5186	 */
5187	if (ecap_pss(iommu->ecap) < 5)
5188		return 0;
5189
5190	/* pasid_max is encoded as actual number of entries not the bits */
5191	return find_first_bit((unsigned long *)&iommu->pasid_max,
5192			MAX_NR_PASID_BITS) - 5;
5193}
5194
5195int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5196{
5197	struct device_domain_info *info;
5198	struct context_entry *context;
5199	struct dmar_domain *domain;
5200	unsigned long flags;
5201	u64 ctx_lo;
5202	int ret;
5203
5204	domain = get_valid_domain_for_dev(sdev->dev);
5205	if (!domain)
5206		return -EINVAL;
5207
5208	spin_lock_irqsave(&device_domain_lock, flags);
5209	spin_lock(&iommu->lock);
5210
5211	ret = -EINVAL;
5212	info = sdev->dev->archdata.iommu;
5213	if (!info || !info->pasid_supported)
5214		goto out;
5215
5216	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5217	if (WARN_ON(!context))
5218		goto out;
5219
5220	ctx_lo = context[0].lo;
5221
5222	sdev->did = domain->iommu_did[iommu->seq_id];
5223	sdev->sid = PCI_DEVID(info->bus, info->devfn);
5224
5225	if (!(ctx_lo & CONTEXT_PASIDE)) {
5226		if (iommu->pasid_state_table)
5227			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5228		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5229			intel_iommu_get_pts(iommu);
5230
5231		wmb();
5232		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5233		 * extended to permit requests-with-PASID if the PASIDE bit
5234		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5235		 * however, the PASIDE bit is ignored and requests-with-PASID
5236		 * are unconditionally blocked. Which makes less sense.
5237		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5238		 * "guest mode" translation types depending on whether ATS
5239		 * is available or not. Annoyingly, we can't use the new
5240		 * modes *unless* PASIDE is set. */
5241		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5242			ctx_lo &= ~CONTEXT_TT_MASK;
5243			if (info->ats_supported)
5244				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5245			else
5246				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5247		}
5248		ctx_lo |= CONTEXT_PASIDE;
5249		if (iommu->pasid_state_table)
5250			ctx_lo |= CONTEXT_DINVE;
5251		if (info->pri_supported)
5252			ctx_lo |= CONTEXT_PRS;
5253		context[0].lo = ctx_lo;
5254		wmb();
5255		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5256					   DMA_CCMD_MASK_NOBIT,
5257					   DMA_CCMD_DEVICE_INVL);
5258	}
5259
5260	/* Enable PASID support in the device, if it wasn't already */
5261	if (!info->pasid_enabled)
5262		iommu_enable_dev_iotlb(info);
5263
5264	if (info->ats_enabled) {
5265		sdev->dev_iotlb = 1;
5266		sdev->qdep = info->ats_qdep;
5267		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5268			sdev->qdep = 0;
5269	}
5270	ret = 0;
5271
5272 out:
5273	spin_unlock(&iommu->lock);
5274	spin_unlock_irqrestore(&device_domain_lock, flags);
5275
5276	return ret;
5277}
5278
5279struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5280{
5281	struct intel_iommu *iommu;
5282	u8 bus, devfn;
5283
5284	if (iommu_dummy(dev)) {
5285		dev_warn(dev,
5286			 "No IOMMU translation for device; cannot enable SVM\n");
5287		return NULL;
5288	}
5289
5290	iommu = device_to_iommu(dev, &bus, &devfn);
5291	if (!iommu) {
5292		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5293		return NULL;
5294	}
5295
5296	if (!iommu->pasid_table) {
5297		dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5298		return NULL;
5299	}
5300
5301	return iommu;
5302}
5303#endif /* CONFIG_INTEL_IOMMU_SVM */
5304
5305const struct iommu_ops intel_iommu_ops = {
5306	.capable		= intel_iommu_capable,
5307	.domain_alloc		= intel_iommu_domain_alloc,
5308	.domain_free		= intel_iommu_domain_free,
5309	.attach_dev		= intel_iommu_attach_device,
5310	.detach_dev		= intel_iommu_detach_device,
5311	.map			= intel_iommu_map,
5312	.unmap			= intel_iommu_unmap,
5313	.map_sg			= default_iommu_map_sg,
5314	.iova_to_phys		= intel_iommu_iova_to_phys,
5315	.add_device		= intel_iommu_add_device,
5316	.remove_device		= intel_iommu_remove_device,
5317	.get_resv_regions	= intel_iommu_get_resv_regions,
5318	.put_resv_regions	= intel_iommu_put_resv_regions,
5319	.device_group		= pci_device_group,
5320	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
5321};
5322
5323static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5324{
5325	/* G4x/GM45 integrated gfx dmar support is totally busted. */
5326	pr_info("Disabling IOMMU for graphics on this chipset\n");
5327	dmar_map_gfx = 0;
5328}
5329
5330DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5331DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5332DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5333DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5334DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5335DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5336DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5337
5338static void quirk_iommu_rwbf(struct pci_dev *dev)
5339{
5340	/*
5341	 * Mobile 4 Series Chipset neglects to set RWBF capability,
5342	 * but needs it. Same seems to hold for the desktop versions.
5343	 */
5344	pr_info("Forcing write-buffer flush capability\n");
5345	rwbf_quirk = 1;
5346}
5347
5348DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5349DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5350DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5351DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5352DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5353DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5354DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5355
5356#define GGC 0x52
5357#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
5358#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
5359#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
5360#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
5361#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
5362#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
5363#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
5364#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5365
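/*
 * The integrated graphics on these chipsets can only be safely remapped if
 * the BIOS allocated VT-d aware stolen memory (GGC_MEMORY_VT_ENABLED);
 * otherwise graphics must be excluded from remapping.  When it is
 * remapped, batched IOTLB flushing has to be disabled.
 */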
5366static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5367{
5368	unsigned short ggc;
5369
5370	if (pci_read_config_word(dev, GGC, &ggc))
5371		return;
5372
5373	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5374		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5375		dmar_map_gfx = 0;
5376	} else if (dmar_map_gfx) {
5377		/* we have to ensure the gfx device is idle before we flush */
5378		pr_info("Disabling batched IOTLB flush on Ironlake\n");
5379		intel_iommu_strict = 1;
5380	}
5381}
5382DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5383DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5384DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5385DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5386
5387/* On Tylersburg chipsets, some BIOSes have been known to enable the
5388   ISOCH DMAR unit for the Azalia sound device, but not give it any
5389   TLB entries, which causes it to deadlock. Check for that.  We do
5390   this in a function called from init_dmars(), instead of in a PCI
5391   quirk, because we don't want to print the obnoxious "BIOS broken"
5392   message if VT-d is actually disabled.
5393*/
5394static void __init check_tylersburg_isoch(void)
5395{
5396	struct pci_dev *pdev;
5397	uint32_t vtisochctrl;
5398
5399	/* If there's no Azalia in the system anyway, forget it. */
5400	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5401	if (!pdev)
5402		return;
5403	pci_dev_put(pdev);
5404
5405	/* System Management Registers. Might be hidden, in which case
5406	   we can't do the sanity check. But that's OK, because the
5407	   known-broken BIOSes _don't_ actually hide it, so far. */
5408	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5409	if (!pdev)
5410		return;
5411
5412	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5413		pci_dev_put(pdev);
5414		return;
5415	}
5416
5417	pci_dev_put(pdev);
5418
5419	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5420	if (vtisochctrl & 1)
5421		return;
5422
5423	/* Drop all bits other than the number of TLB entries */
5424	vtisochctrl &= 0x1c;
5425
5426	/* If we have the recommended number of TLB entries (16), fine. */
5427	if (vtisochctrl == 0x10)
5428		return;
5429
5430	/* Zero TLB entries? You get to ride the short bus to school. */
5431	if (!vtisochctrl) {
5432		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5433		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5434		     dmi_get_system_info(DMI_BIOS_VENDOR),
5435		     dmi_get_system_info(DMI_BIOS_VERSION),
5436		     dmi_get_system_info(DMI_PRODUCT_VERSION));
5437		iommu_identity_mapping |= IDENTMAP_AZALIA;
5438		return;
5439	}
5440
5441	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5442	       vtisochctrl);
5443}