1/*
2 * Copyright (c) 2006, Intel Corporation.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 *
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
16 *
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
22 */
23
24#include <linux/init.h>
25#include <linux/bitmap.h>
26#include <linux/debugfs.h>
27#include <linux/slab.h>
28#include <linux/irq.h>
29#include <linux/interrupt.h>
30#include <linux/spinlock.h>
31#include <linux/pci.h>
32#include <linux/dmar.h>
33#include <linux/dma-mapping.h>
34#include <linux/mempool.h>
35#include <linux/timer.h>
36#include <linux/iova.h>
37#include <linux/iommu.h>
38#include <linux/intel-iommu.h>
39#include <linux/syscore_ops.h>
40#include <linux/tboot.h>
41#include <linux/dmi.h>
42#include <linux/pci-ats.h>
43#include <asm/cacheflush.h>
44#include <asm/iommu.h>
45
46#define ROOT_SIZE VTD_PAGE_SIZE
47#define CONTEXT_SIZE VTD_PAGE_SIZE
48
49#define IS_BRIDGE_HOST_DEVICE(pdev) \
50 ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
51#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55#define IOAPIC_RANGE_START (0xfee00000)
56#define IOAPIC_RANGE_END (0xfeefffff)
57#define IOVA_START_ADDR (0x1000)
58
59#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61#define MAX_AGAW_WIDTH 64
62
63#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73#define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74#define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
75
76/* page table handling */
77#define LEVEL_STRIDE (9)
78#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
79
80static inline int agaw_to_level(int agaw)
81{
82 return agaw + 2;
83}
84
85static inline int agaw_to_width(int agaw)
86{
87 return 30 + agaw * LEVEL_STRIDE;
88}
89
90static inline int width_to_agaw(int width)
91{
92 return (width - 30) / LEVEL_STRIDE;
93}
94
95static inline unsigned int level_to_offset_bits(int level)
96{
97 return (level - 1) * LEVEL_STRIDE;
98}
99
100static inline int pfn_level_offset(unsigned long pfn, int level)
101{
102 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
103}
104
105static inline unsigned long level_mask(int level)
106{
107 return -1UL << level_to_offset_bits(level);
108}
109
110static inline unsigned long level_size(int level)
111{
112 return 1UL << level_to_offset_bits(level);
113}
114
115static inline unsigned long align_to_level(unsigned long pfn, int level)
116{
117 return (pfn + level_size(level) - 1) & level_mask(level);
118}
119
120static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
121{
122 return 1 << ((lvl - 1) * LEVEL_STRIDE);
123}
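
/*
 * A quick worked example of the helpers above (a sketch, assuming the
 * LEVEL_STRIDE of 9 defined earlier): agaw 2 describes a 4-level table
 * (agaw_to_level(2) == 4) covering 48 bits (agaw_to_width(2) == 48),
 * and a level-2 entry spans 512 4KiB pages (lvl_to_nr_pages(2) == 512).
 */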
124
125/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
126 are never going to work. */
127static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
128{
129 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
130}
131
132static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
133{
134 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
135}
136static inline unsigned long page_to_dma_pfn(struct page *pg)
137{
138 return mm_to_dma_pfn(page_to_pfn(pg));
139}
140static inline unsigned long virt_to_dma_pfn(void *p)
141{
142 return page_to_dma_pfn(virt_to_page(p));
143}
144
145/* global iommu list, set NULL for ignored DMAR units */
146static struct intel_iommu **g_iommus;
147
148static void __init check_tylersburg_isoch(void);
149static int rwbf_quirk;
150
151/*
152 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
153 * (used when the kernel is launched with TXT)
154 */
155static int force_on = 0;
156
157/*
158 * 0: Present
159 * 1-11: Reserved
160 * 12-63: Context Ptr (12 - (haw-1))
161 * 64-127: Reserved
162 */
163struct root_entry {
164 u64 val;
165 u64 rsvd1;
166};
167#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
168static inline bool root_present(struct root_entry *root)
169{
170 return (root->val & 1);
171}
172static inline void set_root_present(struct root_entry *root)
173{
174 root->val |= 1;
175}
176static inline void set_root_value(struct root_entry *root, unsigned long value)
177{
178 root->val |= value & VTD_PAGE_MASK;
179}
180
181static inline struct context_entry *
182get_context_addr_from_root(struct root_entry *root)
183{
184 return (struct context_entry *)
185 (root_present(root)?phys_to_virt(
186 root->val & VTD_PAGE_MASK) :
187 NULL);
188}
189
190/*
191 * low 64 bits:
192 * 0: present
193 * 1: fault processing disable
194 * 2-3: translation type
195 * 12-63: address space root
196 * high 64 bits:
197 * 0-2: address width
198 * 3-6: aval
199 * 8-23: domain id
200 */
201struct context_entry {
202 u64 lo;
203 u64 hi;
204};
205
206static inline bool context_present(struct context_entry *context)
207{
208 return (context->lo & 1);
209}
210static inline void context_set_present(struct context_entry *context)
211{
212 context->lo |= 1;
213}
214
215static inline void context_set_fault_enable(struct context_entry *context)
216{
217 context->lo &= (((u64)-1) << 2) | 1;
218}
219
220static inline void context_set_translation_type(struct context_entry *context,
221 unsigned long value)
222{
223 context->lo &= (((u64)-1) << 4) | 3;
224 context->lo |= (value & 3) << 2;
225}
226
227static inline void context_set_address_root(struct context_entry *context,
228 unsigned long value)
229{
230 context->lo |= value & VTD_PAGE_MASK;
231}
232
233static inline void context_set_address_width(struct context_entry *context,
234 unsigned long value)
235{
236 context->hi |= value & 7;
237}
238
239static inline void context_set_domain_id(struct context_entry *context,
240 unsigned long value)
241{
242 context->hi |= (value & ((1 << 16) - 1)) << 8;
243}
244
245static inline void context_clear_entry(struct context_entry *context)
246{
247 context->lo = 0;
248 context->hi = 0;
249}
250
251/*
252 * 0: readable
253 * 1: writable
254 * 2-6: reserved
255 * 7: super page
256 * 8-10: available
257 * 11: snoop behavior
258 * 12-63: Host physical address
259 */
260struct dma_pte {
261 u64 val;
262};
263
264static inline void dma_clear_pte(struct dma_pte *pte)
265{
266 pte->val = 0;
267}
268
269static inline void dma_set_pte_readable(struct dma_pte *pte)
270{
271 pte->val |= DMA_PTE_READ;
272}
273
274static inline void dma_set_pte_writable(struct dma_pte *pte)
275{
276 pte->val |= DMA_PTE_WRITE;
277}
278
279static inline void dma_set_pte_snp(struct dma_pte *pte)
280{
281 pte->val |= DMA_PTE_SNP;
282}
283
284static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
285{
286 pte->val = (pte->val & ~3) | (prot & 3);
287}
288
289static inline u64 dma_pte_addr(struct dma_pte *pte)
290{
291#ifdef CONFIG_64BIT
292 return pte->val & VTD_PAGE_MASK;
293#else
294 /* Must have a full atomic 64-bit read */
295 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
296#endif
297}
298
299static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
300{
301 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
302}
303
304static inline bool dma_pte_present(struct dma_pte *pte)
305{
306 return (pte->val & 3) != 0;
307}
308
309static inline bool dma_pte_superpage(struct dma_pte *pte)
310{
311 return (pte->val & (1 << 7));
312}
313
314static inline int first_pte_in_page(struct dma_pte *pte)
315{
316 return !((unsigned long)pte & ~VTD_PAGE_MASK);
317}
318
319/*
320 * This domain is a static identity mapping domain.
321 * 1. This domain creates a static 1:1 mapping to all usable memory.
322 * 2. It maps to each iommu if successful.
323 * 3. Each iommu maps to this domain if successful.
324 */
325static struct dmar_domain *si_domain;
326static int hw_pass_through = 1;
327
328/* devices under the same p2p bridge are owned in one domain */
329#define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
330
331/* domain represents a virtual machine; more than one device
332 * across iommus may be owned by one domain, e.g. a kvm guest.
333 */
334#define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
335
336/* si_domain contains multiple devices */
337#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2)
338
339struct dmar_domain {
340 int id; /* domain id */
341 int nid; /* node id */
342 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
343
344 struct list_head devices; /* all devices' list */
345 struct iova_domain iovad; /* iova's that belong to this domain */
346
347 struct dma_pte *pgd; /* virtual address */
348 int gaw; /* max guest address width */
349
350 /* adjusted guest address width, 0 is level 2 30-bit */
351 int agaw;
352
353 int flags; /* flags to find out type of domain */
354
355 int iommu_coherency;/* indicate coherency of iommu access */
356 int iommu_snooping; /* indicate snooping control feature*/
357 int iommu_count; /* reference count of iommu */
358 int iommu_superpage;/* Level of superpages supported:
359 0 == 4KiB (no superpages), 1 == 2MiB,
360 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
361 spinlock_t iommu_lock; /* protect iommu set in domain */
362 u64 max_addr; /* maximum mapped address */
363};
364
365/* PCI domain-device relationship */
366struct device_domain_info {
367 struct list_head link; /* link to domain siblings */
368 struct list_head global; /* link to global list */
369 int segment; /* PCI domain */
370 u8 bus; /* PCI bus number */
371 u8 devfn; /* PCI devfn number */
372 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
373 struct intel_iommu *iommu; /* IOMMU used by this device */
374 struct dmar_domain *domain; /* pointer to domain */
375};
376
377static void flush_unmaps_timeout(unsigned long data);
378
379DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
380
381#define HIGH_WATER_MARK 250
382struct deferred_flush_tables {
383 int next;
384 struct iova *iova[HIGH_WATER_MARK];
385 struct dmar_domain *domain[HIGH_WATER_MARK];
386};
387
388static struct deferred_flush_tables *deferred_flush;
389
390/* bitmap for indexing intel_iommus */
391static int g_num_of_iommus;
392
393static DEFINE_SPINLOCK(async_umap_flush_lock);
394static LIST_HEAD(unmaps_to_do);
395
396static int timer_on;
397static long list_size;
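
/*
 * Rough sketch of the lazy-unmap path (see flush_unmaps_timeout()): freed
 * IOVAs are queued per-iommu in deferred_flush[] and released in batches,
 * driven either by unmap_timer firing or by the queue length crossing
 * HIGH_WATER_MARK; booting with "intel_iommu=strict" disables this batching.
 */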
398
399static void domain_remove_dev_info(struct dmar_domain *domain);
400
401#ifdef CONFIG_DMAR_DEFAULT_ON
402int dmar_disabled = 0;
403#else
404int dmar_disabled = 1;
405#endif /*CONFIG_DMAR_DEFAULT_ON*/
406
407static int dmar_map_gfx = 1;
408static int dmar_forcedac;
409static int intel_iommu_strict;
410static int intel_iommu_superpage = 1;
411
412int intel_iommu_gfx_mapped;
413EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
414
415#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
416static DEFINE_SPINLOCK(device_domain_lock);
417static LIST_HEAD(device_domain_list);
418
419static struct iommu_ops intel_iommu_ops;
420
421static int __init intel_iommu_setup(char *str)
422{
423 if (!str)
424 return -EINVAL;
425 while (*str) {
426 if (!strncmp(str, "on", 2)) {
427 dmar_disabled = 0;
428 printk(KERN_INFO "Intel-IOMMU: enabled\n");
429 } else if (!strncmp(str, "off", 3)) {
430 dmar_disabled = 1;
431 printk(KERN_INFO "Intel-IOMMU: disabled\n");
432 } else if (!strncmp(str, "igfx_off", 8)) {
433 dmar_map_gfx = 0;
434 printk(KERN_INFO
435 "Intel-IOMMU: disable GFX device mapping\n");
436 } else if (!strncmp(str, "forcedac", 8)) {
437 printk(KERN_INFO
438 "Intel-IOMMU: Forcing DAC for PCI devices\n");
439 dmar_forcedac = 1;
440 } else if (!strncmp(str, "strict", 6)) {
441 printk(KERN_INFO
442 "Intel-IOMMU: disable batched IOTLB flush\n");
443 intel_iommu_strict = 1;
444 } else if (!strncmp(str, "sp_off", 6)) {
445 printk(KERN_INFO
446 "Intel-IOMMU: disable supported super page\n");
447 intel_iommu_superpage = 0;
448 }
449
450 str += strcspn(str, ",");
451 while (*str == ',')
452 str++;
453 }
454 return 0;
455}
456__setup("intel_iommu=", intel_iommu_setup);
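
/*
 * Illustrative command lines for the parser above: "intel_iommu=on,strict"
 * enables translation with per-unmap IOTLB flushing, while
 * "intel_iommu=igfx_off" leaves the integrated graphics device untranslated.
 */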
457
458static struct kmem_cache *iommu_domain_cache;
459static struct kmem_cache *iommu_devinfo_cache;
460static struct kmem_cache *iommu_iova_cache;
461
462static inline void *alloc_pgtable_page(int node)
463{
464 struct page *page;
465 void *vaddr = NULL;
466
467 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
468 if (page)
469 vaddr = page_address(page);
470 return vaddr;
471}
472
473static inline void free_pgtable_page(void *vaddr)
474{
475 free_page((unsigned long)vaddr);
476}
477
478static inline void *alloc_domain_mem(void)
479{
480 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
481}
482
483static void free_domain_mem(void *vaddr)
484{
485 kmem_cache_free(iommu_domain_cache, vaddr);
486}
487
488static inline void * alloc_devinfo_mem(void)
489{
490 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
491}
492
493static inline void free_devinfo_mem(void *vaddr)
494{
495 kmem_cache_free(iommu_devinfo_cache, vaddr);
496}
497
498struct iova *alloc_iova_mem(void)
499{
500 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
501}
502
503void free_iova_mem(struct iova *iova)
504{
505 kmem_cache_free(iommu_iova_cache, iova);
506}
507
508
509static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
510{
511 unsigned long sagaw;
512 int agaw = -1;
513
514 sagaw = cap_sagaw(iommu->cap);
515 for (agaw = width_to_agaw(max_gaw);
516 agaw >= 0; agaw--) {
517 if (test_bit(agaw, &sagaw))
518 break;
519 }
520
521 return agaw;
522}
523
524/*
525 * Calculate max SAGAW for each iommu.
526 */
527int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
528{
529 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
530}
531
532/*
533 * Calculate agaw for each iommu.
534 * "SAGAW" may differ across iommus, so use a default agaw and fall back
535 * to a smaller supported agaw for iommus that don't support the default.
536 */
537int iommu_calculate_agaw(struct intel_iommu *iommu)
538{
539 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
540}
541
542/* This function only returns a single iommu in a domain */
543static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
544{
545 int iommu_id;
546
547 /* si_domain and vm domain should not get here. */
548 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
549 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
550
551 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
552 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
553 return NULL;
554
555 return g_iommus[iommu_id];
556}
557
558static void domain_update_iommu_coherency(struct dmar_domain *domain)
559{
560 int i;
561
562 domain->iommu_coherency = 1;
563
564 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
565 if (!ecap_coherent(g_iommus[i]->ecap)) {
566 domain->iommu_coherency = 0;
567 break;
568 }
569 }
570}
571
572static void domain_update_iommu_snooping(struct dmar_domain *domain)
573{
574 int i;
575
576 domain->iommu_snooping = 1;
577
578 for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
579 if (!ecap_sc_support(g_iommus[i]->ecap)) {
580 domain->iommu_snooping = 0;
581 break;
582 }
583 }
584}
585
586static void domain_update_iommu_superpage(struct dmar_domain *domain)
587{
588 struct dmar_drhd_unit *drhd;
589 struct intel_iommu *iommu = NULL;
590 int mask = 0xf;
591
592 if (!intel_iommu_superpage) {
593 domain->iommu_superpage = 0;
594 return;
595 }
596
597 /* set iommu_superpage to the smallest common denominator */
598 for_each_active_iommu(iommu, drhd) {
599 mask &= cap_super_page_val(iommu->cap);
600 if (!mask) {
601 break;
602 }
603 }
604 domain->iommu_superpage = fls(mask);
605}
606
607/* Some capabilities may be different across iommus */
608static void domain_update_iommu_cap(struct dmar_domain *domain)
609{
610 domain_update_iommu_coherency(domain);
611 domain_update_iommu_snooping(domain);
612 domain_update_iommu_superpage(domain);
613}
614
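/*
 * Map a (segment, bus, devfn) triple to the DMAR unit that covers it:
 * either the device is listed in the unit's device scope, it sits on a
 * bus behind a listed bridge, or the unit is the segment's include_all
 * unit.
 */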
615static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
616{
617 struct dmar_drhd_unit *drhd = NULL;
618 int i;
619
620 for_each_drhd_unit(drhd) {
621 if (drhd->ignored)
622 continue;
623 if (segment != drhd->segment)
624 continue;
625
626 for (i = 0; i < drhd->devices_cnt; i++) {
627 if (drhd->devices[i] &&
628 drhd->devices[i]->bus->number == bus &&
629 drhd->devices[i]->devfn == devfn)
630 return drhd->iommu;
631 if (drhd->devices[i] &&
632 drhd->devices[i]->subordinate &&
633 drhd->devices[i]->subordinate->number <= bus &&
634 drhd->devices[i]->subordinate->subordinate >= bus)
635 return drhd->iommu;
636 }
637
638 if (drhd->include_all)
639 return drhd->iommu;
640 }
641
642 return NULL;
643}
644
645static void domain_flush_cache(struct dmar_domain *domain,
646 void *addr, int size)
647{
648 if (!domain->iommu_coherency)
649 clflush_cache_range(addr, size);
650}
651
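/*
 * Root/context table layout walked below (sketch): the root table has
 * one root_entry per PCI bus number; a present root entry points to a
 * per-bus context table indexed by devfn, and the context entry in turn
 * holds the domain id and the address of the domain's page table.
 */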
652/* Gets context entry for a given bus and devfn */
653static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
654 u8 bus, u8 devfn)
655{
656 struct root_entry *root;
657 struct context_entry *context;
658 unsigned long phy_addr;
659 unsigned long flags;
660
661 spin_lock_irqsave(&iommu->lock, flags);
662 root = &iommu->root_entry[bus];
663 context = get_context_addr_from_root(root);
664 if (!context) {
665 context = (struct context_entry *)
666 alloc_pgtable_page(iommu->node);
667 if (!context) {
668 spin_unlock_irqrestore(&iommu->lock, flags);
669 return NULL;
670 }
671 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
672 phy_addr = virt_to_phys((void *)context);
673 set_root_value(root, phy_addr);
674 set_root_present(root);
675 __iommu_flush_cache(iommu, root, sizeof(*root));
676 }
677 spin_unlock_irqrestore(&iommu->lock, flags);
678 return &context[devfn];
679}
680
681static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
682{
683 struct root_entry *root;
684 struct context_entry *context;
685 int ret;
686 unsigned long flags;
687
688 spin_lock_irqsave(&iommu->lock, flags);
689 root = &iommu->root_entry[bus];
690 context = get_context_addr_from_root(root);
691 if (!context) {
692 ret = 0;
693 goto out;
694 }
695 ret = context_present(&context[devfn]);
696out:
697 spin_unlock_irqrestore(&iommu->lock, flags);
698 return ret;
699}
700
701static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
702{
703 struct root_entry *root;
704 struct context_entry *context;
705 unsigned long flags;
706
707 spin_lock_irqsave(&iommu->lock, flags);
708 root = &iommu->root_entry[bus];
709 context = get_context_addr_from_root(root);
710 if (context) {
711 context_clear_entry(&context[devfn]);
712 __iommu_flush_cache(iommu, &context[devfn], \
713 sizeof(*context));
714 }
715 spin_unlock_irqrestore(&iommu->lock, flags);
716}
717
718static void free_context_table(struct intel_iommu *iommu)
719{
720 struct root_entry *root;
721 int i;
722 unsigned long flags;
723 struct context_entry *context;
724
725 spin_lock_irqsave(&iommu->lock, flags);
726 if (!iommu->root_entry) {
727 goto out;
728 }
729 for (i = 0; i < ROOT_ENTRY_NR; i++) {
730 root = &iommu->root_entry[i];
731 context = get_context_addr_from_root(root);
732 if (context)
733 free_pgtable_page(context);
734 }
735 free_pgtable_page(iommu->root_entry);
736 iommu->root_entry = NULL;
737out:
738 spin_unlock_irqrestore(&iommu->lock, flags);
739}
740
741static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
742 unsigned long pfn, int target_level)
743{
744 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
745 struct dma_pte *parent, *pte = NULL;
746 int level = agaw_to_level(domain->agaw);
747 int offset;
748
749 BUG_ON(!domain->pgd);
750 BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
751 parent = domain->pgd;
752
753 while (level > 0) {
754 void *tmp_page;
755
756 offset = pfn_level_offset(pfn, level);
757 pte = &parent[offset];
758 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
759 break;
760 if (level == target_level)
761 break;
762
763 if (!dma_pte_present(pte)) {
764 uint64_t pteval;
765
766 tmp_page = alloc_pgtable_page(domain->nid);
767
768 if (!tmp_page)
769 return NULL;
770
771 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
772 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
773 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
774 /* Someone else set it while we were thinking; use theirs. */
775 free_pgtable_page(tmp_page);
776 } else {
777 dma_pte_addr(pte);
778 domain_flush_cache(domain, pte, sizeof(*pte));
779 }
780 }
781 parent = phys_to_virt(dma_pte_addr(pte));
782 level--;
783 }
784
785 return pte;
786}
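
/*
 * Note on the walk above: intermediate page-table pages are allocated
 * lazily, and cmpxchg64() is used so that two CPUs racing to populate
 * the same slot simply free the losing page rather than taking a lock.
 */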
787
788
789/* return address's pte at specific level */
790static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
791 unsigned long pfn,
792 int level, int *large_page)
793{
794 struct dma_pte *parent, *pte = NULL;
795 int total = agaw_to_level(domain->agaw);
796 int offset;
797
798 parent = domain->pgd;
799 while (level <= total) {
800 offset = pfn_level_offset(pfn, total);
801 pte = &parent[offset];
802 if (level == total)
803 return pte;
804
805 if (!dma_pte_present(pte)) {
806 *large_page = total;
807 break;
808 }
809
810 if (pte->val & DMA_PTE_LARGE_PAGE) {
811 *large_page = total;
812 return pte;
813 }
814
815 parent = phys_to_virt(dma_pte_addr(pte));
816 total--;
817 }
818 return NULL;
819}
820
821/* clear last level pte; a tlb flush should follow */
822static int dma_pte_clear_range(struct dmar_domain *domain,
823 unsigned long start_pfn,
824 unsigned long last_pfn)
825{
826 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
827 unsigned int large_page = 1;
828 struct dma_pte *first_pte, *pte;
829 int order;
830
831 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
832 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
833 BUG_ON(start_pfn > last_pfn);
834
835 /* we don't need lock here; nobody else touches the iova range */
836 do {
837 large_page = 1;
838 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
839 if (!pte) {
840 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
841 continue;
842 }
843 do {
844 dma_clear_pte(pte);
845 start_pfn += lvl_to_nr_pages(large_page);
846 pte++;
847 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
848
849 domain_flush_cache(domain, first_pte,
850 (void *)pte - (void *)first_pte);
851
852 } while (start_pfn && start_pfn <= last_pfn);
853
854 order = (large_page - 1) * 9;
855 return order;
856}
857
858/* free page table pages. last level pte should already be cleared */
859static void dma_pte_free_pagetable(struct dmar_domain *domain,
860 unsigned long start_pfn,
861 unsigned long last_pfn)
862{
863 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
864 struct dma_pte *first_pte, *pte;
865 int total = agaw_to_level(domain->agaw);
866 int level;
867 unsigned long tmp;
868 int large_page = 2;
869
870 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
871 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
872 BUG_ON(start_pfn > last_pfn);
873
874 /* We don't need lock here; nobody else touches the iova range */
875 level = 2;
876 while (level <= total) {
877 tmp = align_to_level(start_pfn, level);
878
879 /* If we can't even clear one PTE at this level, we're done */
880 if (tmp + level_size(level) - 1 > last_pfn)
881 return;
882
883 do {
884 large_page = level;
885 first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
886 if (large_page > level)
887 level = large_page + 1;
888 if (!pte) {
889 tmp = align_to_level(tmp + 1, level + 1);
890 continue;
891 }
892 do {
893 if (dma_pte_present(pte)) {
894 free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
895 dma_clear_pte(pte);
896 }
897 pte++;
898 tmp += level_size(level);
899 } while (!first_pte_in_page(pte) &&
900 tmp + level_size(level) - 1 <= last_pfn);
901
902 domain_flush_cache(domain, first_pte,
903 (void *)pte - (void *)first_pte);
904
905 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
906 level++;
907 }
908 /* free pgd */
909 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
910 free_pgtable_page(domain->pgd);
911 domain->pgd = NULL;
912 }
913}
914
915/* iommu handling */
916static int iommu_alloc_root_entry(struct intel_iommu *iommu)
917{
918 struct root_entry *root;
919 unsigned long flags;
920
921 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
922 if (!root)
923 return -ENOMEM;
924
925 __iommu_flush_cache(iommu, root, ROOT_SIZE);
926
927 spin_lock_irqsave(&iommu->lock, flags);
928 iommu->root_entry = root;
929 spin_unlock_irqrestore(&iommu->lock, flags);
930
931 return 0;
932}
933
934static void iommu_set_root_entry(struct intel_iommu *iommu)
935{
936 void *addr;
937 u32 sts;
938 unsigned long flag;
939
940 addr = iommu->root_entry;
941
942 spin_lock_irqsave(&iommu->register_lock, flag);
943 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
944
945 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
946
947 /* Make sure hardware completes it */
948 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
949 readl, (sts & DMA_GSTS_RTPS), sts);
950
951 spin_unlock_irqrestore(&iommu->register_lock, flag);
952}
953
954static void iommu_flush_write_buffer(struct intel_iommu *iommu)
955{
956 u32 val;
957 unsigned long flag;
958
959 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
960 return;
961
962 spin_lock_irqsave(&iommu->register_lock, flag);
963 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
964
965 /* Make sure hardware completes it */
966 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
967 readl, (!(val & DMA_GSTS_WBFS)), val);
968
969 spin_unlock_irqrestore(&iommu->register_lock, flag);
970}
971
972/* return value determines if we need a write buffer flush */
973static void __iommu_flush_context(struct intel_iommu *iommu,
974 u16 did, u16 source_id, u8 function_mask,
975 u64 type)
976{
977 u64 val = 0;
978 unsigned long flag;
979
980 switch (type) {
981 case DMA_CCMD_GLOBAL_INVL:
982 val = DMA_CCMD_GLOBAL_INVL;
983 break;
984 case DMA_CCMD_DOMAIN_INVL:
985 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
986 break;
987 case DMA_CCMD_DEVICE_INVL:
988 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
989 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
990 break;
991 default:
992 BUG();
993 }
994 val |= DMA_CCMD_ICC;
995
996 spin_lock_irqsave(&iommu->register_lock, flag);
997 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
998
999 /* Make sure hardware completes it */
1000 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1001 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1002
1003 spin_unlock_irqrestore(&iommu->register_lock, flag);
1004}
1005
1006/* return value determines if we need a write buffer flush */
1007static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1008 u64 addr, unsigned int size_order, u64 type)
1009{
1010 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1011 u64 val = 0, val_iva = 0;
1012 unsigned long flag;
1013
1014 switch (type) {
1015 case DMA_TLB_GLOBAL_FLUSH:
1016 /* global flush doesn't need to set IVA_REG */
1017 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1018 break;
1019 case DMA_TLB_DSI_FLUSH:
1020 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1021 break;
1022 case DMA_TLB_PSI_FLUSH:
1023 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1024 /* Note: always flush non-leaf currently */
1025 val_iva = size_order | addr;
1026 break;
1027 default:
1028 BUG();
1029 }
1030 /* Note: set drain read/write */
1031#if 0
1032 /*
1033 * This is probably meant to be extra safe. Looks like we can
1034 * ignore it without any impact.
1035 */
1036 if (cap_read_drain(iommu->cap))
1037 val |= DMA_TLB_READ_DRAIN;
1038#endif
1039 if (cap_write_drain(iommu->cap))
1040 val |= DMA_TLB_WRITE_DRAIN;
1041
1042 spin_lock_irqsave(&iommu->register_lock, flag);
1043 /* Note: Only uses first TLB reg currently */
1044 if (val_iva)
1045 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1046 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1047
1048 /* Make sure hardware completes it */
1049 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1050 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1051
1052 spin_unlock_irqrestore(&iommu->register_lock, flag);
1053
1054 /* check IOTLB invalidation granularity */
1055 if (DMA_TLB_IAIG(val) == 0)
1056 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1057 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1058 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1059 (unsigned long long)DMA_TLB_IIRG(type),
1060 (unsigned long long)DMA_TLB_IAIG(val));
1061}
1062
1063static struct device_domain_info *iommu_support_dev_iotlb(
1064 struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1065{
1066 int found = 0;
1067 unsigned long flags;
1068 struct device_domain_info *info;
1069 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1070
1071 if (!ecap_dev_iotlb_support(iommu->ecap))
1072 return NULL;
1073
1074 if (!iommu->qi)
1075 return NULL;
1076
1077 spin_lock_irqsave(&device_domain_lock, flags);
1078 list_for_each_entry(info, &domain->devices, link)
1079 if (info->bus == bus && info->devfn == devfn) {
1080 found = 1;
1081 break;
1082 }
1083 spin_unlock_irqrestore(&device_domain_lock, flags);
1084
1085 if (!found || !info->dev)
1086 return NULL;
1087
1088 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1089 return NULL;
1090
1091 if (!dmar_find_matched_atsr_unit(info->dev))
1092 return NULL;
1093
1094 info->iommu = iommu;
1095
1096 return info;
1097}
1098
1099static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1100{
1101 if (!info)
1102 return;
1103
1104 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1105}
1106
1107static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1108{
1109 if (!info->dev || !pci_ats_enabled(info->dev))
1110 return;
1111
1112 pci_disable_ats(info->dev);
1113}
1114
1115static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1116 u64 addr, unsigned mask)
1117{
1118 u16 sid, qdep;
1119 unsigned long flags;
1120 struct device_domain_info *info;
1121
1122 spin_lock_irqsave(&device_domain_lock, flags);
1123 list_for_each_entry(info, &domain->devices, link) {
1124 if (!info->dev || !pci_ats_enabled(info->dev))
1125 continue;
1126
1127 sid = info->bus << 8 | info->devfn;
1128 qdep = pci_ats_queue_depth(info->dev);
1129 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1130 }
1131 spin_unlock_irqrestore(&device_domain_lock, flags);
1132}
1133
1134static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1135 unsigned long pfn, unsigned int pages, int map)
1136{
1137 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1138 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1139
1140 BUG_ON(pages == 0);
1141
1142 /*
1143 * Fall back to domain-selective flush if there is no PSI support or
1144 * the size is too big.
1145 * PSI requires the page size to be 2^x, and the base address to be
1146 * naturally aligned to that size.
1147 */
1148 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1149 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1150 DMA_TLB_DSI_FLUSH);
1151 else
1152 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1153 DMA_TLB_PSI_FLUSH);
1154
1155 /*
1156 * In caching mode, changes of pages from non-present to present require
1157 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1158 */
1159 if (!cap_caching_mode(iommu->cap) || !map)
1160 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1161}
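
/*
 * Example of the mask math above: flushing 5 pages rounds up to
 * 2^3 == 8 pages (mask == 3), since page-selective invalidation only
 * handles naturally aligned power-of-two ranges.
 */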
1162
1163static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1164{
1165 u32 pmen;
1166 unsigned long flags;
1167
1168 spin_lock_irqsave(&iommu->register_lock, flags);
1169 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1170 pmen &= ~DMA_PMEN_EPM;
1171 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1172
1173 /* wait for the protected region status bit to clear */
1174 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1175 readl, !(pmen & DMA_PMEN_PRS), pmen);
1176
1177 spin_unlock_irqrestore(&iommu->register_lock, flags);
1178}
1179
1180static int iommu_enable_translation(struct intel_iommu *iommu)
1181{
1182 u32 sts;
1183 unsigned long flags;
1184
1185 spin_lock_irqsave(&iommu->register_lock, flags);
1186 iommu->gcmd |= DMA_GCMD_TE;
1187 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1188
1189 /* Make sure hardware completes it */
1190 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1191 readl, (sts & DMA_GSTS_TES), sts);
1192
1193 spin_unlock_irqrestore(&iommu->register_lock, flags);
1194 return 0;
1195}
1196
1197static int iommu_disable_translation(struct intel_iommu *iommu)
1198{
1199 u32 sts;
1200 unsigned long flag;
1201
1202 spin_lock_irqsave(&iommu->register_lock, flag);
1203 iommu->gcmd &= ~DMA_GCMD_TE;
1204 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1205
1206 /* Make sure hardware completes it */
1207 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1208 readl, (!(sts & DMA_GSTS_TES)), sts);
1209
1210 spin_unlock_irqrestore(&iommu->register_lock, flag);
1211 return 0;
1212}
1213
1214
1215static int iommu_init_domains(struct intel_iommu *iommu)
1216{
1217 unsigned long ndomains;
1218 unsigned long nlongs;
1219
1220 ndomains = cap_ndoms(iommu->cap);
1221 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1222 ndomains);
1223 nlongs = BITS_TO_LONGS(ndomains);
1224
1225 spin_lock_init(&iommu->lock);
1226
1227 /* TBD: there might be 64K domains,
1228 * consider other allocation for future chips
1229 */
1230 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1231 if (!iommu->domain_ids) {
1232 printk(KERN_ERR "Allocating domain id array failed\n");
1233 return -ENOMEM;
1234 }
1235 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1236 GFP_KERNEL);
1237 if (!iommu->domains) {
1238 printk(KERN_ERR "Allocating domain array failed\n");
1239 return -ENOMEM;
1240 }
1241
1242 /*
1243 * if Caching mode is set, then invalid translations are tagged
1244 * with domain id 0. Hence we need to pre-allocate it.
1245 */
1246 if (cap_caching_mode(iommu->cap))
1247 set_bit(0, iommu->domain_ids);
1248 return 0;
1249}
1250
1251
1252static void domain_exit(struct dmar_domain *domain);
1253static void vm_domain_exit(struct dmar_domain *domain);
1254
1255void free_dmar_iommu(struct intel_iommu *iommu)
1256{
1257 struct dmar_domain *domain;
1258 int i;
1259 unsigned long flags;
1260
1261 if ((iommu->domains) && (iommu->domain_ids)) {
1262 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1263 domain = iommu->domains[i];
1264 clear_bit(i, iommu->domain_ids);
1265
1266 spin_lock_irqsave(&domain->iommu_lock, flags);
1267 if (--domain->iommu_count == 0) {
1268 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1269 vm_domain_exit(domain);
1270 else
1271 domain_exit(domain);
1272 }
1273 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1274 }
1275 }
1276
1277 if (iommu->gcmd & DMA_GCMD_TE)
1278 iommu_disable_translation(iommu);
1279
1280 if (iommu->irq) {
1281 irq_set_handler_data(iommu->irq, NULL);
1282 /* This will mask the irq */
1283 free_irq(iommu->irq, iommu);
1284 destroy_irq(iommu->irq);
1285 }
1286
1287 kfree(iommu->domains);
1288 kfree(iommu->domain_ids);
1289
1290 g_iommus[iommu->seq_id] = NULL;
1291
1292 /* if all iommus are freed, free g_iommus */
1293 for (i = 0; i < g_num_of_iommus; i++) {
1294 if (g_iommus[i])
1295 break;
1296 }
1297
1298 if (i == g_num_of_iommus)
1299 kfree(g_iommus);
1300
1301 /* free context mapping */
1302 free_context_table(iommu);
1303}
1304
1305static struct dmar_domain *alloc_domain(void)
1306{
1307 struct dmar_domain *domain;
1308
1309 domain = alloc_domain_mem();
1310 if (!domain)
1311 return NULL;
1312
1313 domain->nid = -1;
1314 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1315 domain->flags = 0;
1316
1317 return domain;
1318}
1319
1320static int iommu_attach_domain(struct dmar_domain *domain,
1321 struct intel_iommu *iommu)
1322{
1323 int num;
1324 unsigned long ndomains;
1325 unsigned long flags;
1326
1327 ndomains = cap_ndoms(iommu->cap);
1328
1329 spin_lock_irqsave(&iommu->lock, flags);
1330
1331 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1332 if (num >= ndomains) {
1333 spin_unlock_irqrestore(&iommu->lock, flags);
1334 printk(KERN_ERR "IOMMU: no free domain ids\n");
1335 return -ENOMEM;
1336 }
1337
1338 domain->id = num;
1339 set_bit(num, iommu->domain_ids);
1340 set_bit(iommu->seq_id, &domain->iommu_bmp);
1341 iommu->domains[num] = domain;
1342 spin_unlock_irqrestore(&iommu->lock, flags);
1343
1344 return 0;
1345}
1346
1347static void iommu_detach_domain(struct dmar_domain *domain,
1348 struct intel_iommu *iommu)
1349{
1350 unsigned long flags;
1351 int num, ndomains;
1352 int found = 0;
1353
1354 spin_lock_irqsave(&iommu->lock, flags);
1355 ndomains = cap_ndoms(iommu->cap);
1356 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1357 if (iommu->domains[num] == domain) {
1358 found = 1;
1359 break;
1360 }
1361 }
1362
1363 if (found) {
1364 clear_bit(num, iommu->domain_ids);
1365 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1366 iommu->domains[num] = NULL;
1367 }
1368 spin_unlock_irqrestore(&iommu->lock, flags);
1369}
1370
1371static struct iova_domain reserved_iova_list;
1372static struct lock_class_key reserved_rbtree_key;
1373
1374static int dmar_init_reserved_ranges(void)
1375{
1376 struct pci_dev *pdev = NULL;
1377 struct iova *iova;
1378 int i;
1379
1380 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1381
1382 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1383 &reserved_rbtree_key);
1384
1385 /* IOAPIC ranges shouldn't be accessed by DMA */
1386 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1387 IOVA_PFN(IOAPIC_RANGE_END));
1388 if (!iova) {
1389 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1390 return -ENODEV;
1391 }
1392
1393 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1394 for_each_pci_dev(pdev) {
1395 struct resource *r;
1396
1397 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1398 r = &pdev->resource[i];
1399 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1400 continue;
1401 iova = reserve_iova(&reserved_iova_list,
1402 IOVA_PFN(r->start),
1403 IOVA_PFN(r->end));
1404 if (!iova) {
1405 printk(KERN_ERR "Reserve iova failed\n");
1406 return -ENODEV;
1407 }
1408 }
1409 }
1410 return 0;
1411}
1412
1413static void domain_reserve_special_ranges(struct dmar_domain *domain)
1414{
1415 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1416}
1417
1418static inline int guestwidth_to_adjustwidth(int gaw)
1419{
1420 int agaw;
1421 int r = (gaw - 12) % 9;
1422
1423 if (r == 0)
1424 agaw = gaw;
1425 else
1426 agaw = gaw + 9 - r;
1427 if (agaw > 64)
1428 agaw = 64;
1429 return agaw;
1430}
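
/*
 * Example: guestwidth_to_adjustwidth(48) == 48 (48 - 12 is already a
 * multiple of 9), while a 40-bit guest width rounds up to 48 so that
 * the page-table levels stay whole.
 */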
1431
1432static int domain_init(struct dmar_domain *domain, int guest_width)
1433{
1434 struct intel_iommu *iommu;
1435 int adjust_width, agaw;
1436 unsigned long sagaw;
1437
1438 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1439 spin_lock_init(&domain->iommu_lock);
1440
1441 domain_reserve_special_ranges(domain);
1442
1443 /* calculate AGAW */
1444 iommu = domain_get_iommu(domain);
1445 if (guest_width > cap_mgaw(iommu->cap))
1446 guest_width = cap_mgaw(iommu->cap);
1447 domain->gaw = guest_width;
1448 adjust_width = guestwidth_to_adjustwidth(guest_width);
1449 agaw = width_to_agaw(adjust_width);
1450 sagaw = cap_sagaw(iommu->cap);
1451 if (!test_bit(agaw, &sagaw)) {
1452 /* hardware doesn't support it, choose a bigger one */
1453 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1454 agaw = find_next_bit(&sagaw, 5, agaw);
1455 if (agaw >= 5)
1456 return -ENODEV;
1457 }
1458 domain->agaw = agaw;
1459 INIT_LIST_HEAD(&domain->devices);
1460
1461 if (ecap_coherent(iommu->ecap))
1462 domain->iommu_coherency = 1;
1463 else
1464 domain->iommu_coherency = 0;
1465
1466 if (ecap_sc_support(iommu->ecap))
1467 domain->iommu_snooping = 1;
1468 else
1469 domain->iommu_snooping = 0;
1470
1471 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1472 domain->iommu_count = 1;
1473 domain->nid = iommu->node;
1474
1475 /* always allocate the top pgd */
1476 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1477 if (!domain->pgd)
1478 return -ENOMEM;
1479 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1480 return 0;
1481}
1482
1483static void domain_exit(struct dmar_domain *domain)
1484{
1485 struct dmar_drhd_unit *drhd;
1486 struct intel_iommu *iommu;
1487
1488 /* Domain 0 is reserved, so don't process it */
1489 if (!domain)
1490 return;
1491
1492 /* Flush any lazy unmaps that may reference this domain */
1493 if (!intel_iommu_strict)
1494 flush_unmaps_timeout(0);
1495
1496 domain_remove_dev_info(domain);
1497 /* destroy iovas */
1498 put_iova_domain(&domain->iovad);
1499
1500 /* clear ptes */
1501 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1502
1503 /* free page tables */
1504 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1505
1506 for_each_active_iommu(iommu, drhd)
1507 if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1508 iommu_detach_domain(domain, iommu);
1509
1510 free_domain_mem(domain);
1511}
1512
1513static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1514 u8 bus, u8 devfn, int translation)
1515{
1516 struct context_entry *context;
1517 unsigned long flags;
1518 struct intel_iommu *iommu;
1519 struct dma_pte *pgd;
1520 unsigned long num;
1521 unsigned long ndomains;
1522 int id;
1523 int agaw;
1524 struct device_domain_info *info = NULL;
1525
1526 pr_debug("Set context mapping for %02x:%02x.%d\n",
1527 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1528
1529 BUG_ON(!domain->pgd);
1530 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1531 translation != CONTEXT_TT_MULTI_LEVEL);
1532
1533 iommu = device_to_iommu(segment, bus, devfn);
1534 if (!iommu)
1535 return -ENODEV;
1536
1537 context = device_to_context_entry(iommu, bus, devfn);
1538 if (!context)
1539 return -ENOMEM;
1540 spin_lock_irqsave(&iommu->lock, flags);
1541 if (context_present(context)) {
1542 spin_unlock_irqrestore(&iommu->lock, flags);
1543 return 0;
1544 }
1545
1546 id = domain->id;
1547 pgd = domain->pgd;
1548
1549 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1550 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1551 int found = 0;
1552
1553 /* find an available domain id for this device in iommu */
1554 ndomains = cap_ndoms(iommu->cap);
1555 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1556 if (iommu->domains[num] == domain) {
1557 id = num;
1558 found = 1;
1559 break;
1560 }
1561 }
1562
1563 if (found == 0) {
1564 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1565 if (num >= ndomains) {
1566 spin_unlock_irqrestore(&iommu->lock, flags);
1567 printk(KERN_ERR "IOMMU: no free domain ids\n");
1568 return -EFAULT;
1569 }
1570
1571 set_bit(num, iommu->domain_ids);
1572 iommu->domains[num] = domain;
1573 id = num;
1574 }
1575
1576 /* Skip top levels of page tables for
1577 * iommus which have a smaller agaw than the default.
1578 * Unnecessary for PT mode.
1579 */
1580 if (translation != CONTEXT_TT_PASS_THROUGH) {
1581 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1582 pgd = phys_to_virt(dma_pte_addr(pgd));
1583 if (!dma_pte_present(pgd)) {
1584 spin_unlock_irqrestore(&iommu->lock, flags);
1585 return -ENOMEM;
1586 }
1587 }
1588 }
1589 }
1590
1591 context_set_domain_id(context, id);
1592
1593 if (translation != CONTEXT_TT_PASS_THROUGH) {
1594 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1595 translation = info ? CONTEXT_TT_DEV_IOTLB :
1596 CONTEXT_TT_MULTI_LEVEL;
1597 }
1598 /*
1599 * In pass through mode, AW must be programmed to indicate the largest
1600 * AGAW value supported by hardware. And ASR is ignored by hardware.
1601 */
1602 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1603 context_set_address_width(context, iommu->msagaw);
1604 else {
1605 context_set_address_root(context, virt_to_phys(pgd));
1606 context_set_address_width(context, iommu->agaw);
1607 }
1608
1609 context_set_translation_type(context, translation);
1610 context_set_fault_enable(context);
1611 context_set_present(context);
1612 domain_flush_cache(domain, context, sizeof(*context));
1613
1614 /*
1615 * It's a non-present to present mapping. If hardware doesn't cache
1616 * non-present entries we only need to flush the write-buffer. If it
1617 * _does_ cache non-present entries, then it does so in the special
1618 * domain #0, which we have to flush:
1619 */
1620 if (cap_caching_mode(iommu->cap)) {
1621 iommu->flush.flush_context(iommu, 0,
1622 (((u16)bus) << 8) | devfn,
1623 DMA_CCMD_MASK_NOBIT,
1624 DMA_CCMD_DEVICE_INVL);
1625 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1626 } else {
1627 iommu_flush_write_buffer(iommu);
1628 }
1629 iommu_enable_dev_iotlb(info);
1630 spin_unlock_irqrestore(&iommu->lock, flags);
1631
1632 spin_lock_irqsave(&domain->iommu_lock, flags);
1633 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1634 domain->iommu_count++;
1635 if (domain->iommu_count == 1)
1636 domain->nid = iommu->node;
1637 domain_update_iommu_cap(domain);
1638 }
1639 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1640 return 0;
1641}
1642
1643static int
1644domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1645 int translation)
1646{
1647 int ret;
1648 struct pci_dev *tmp, *parent;
1649
1650 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1651 pdev->bus->number, pdev->devfn,
1652 translation);
1653 if (ret)
1654 return ret;
1655
1656 /* dependent device mapping */
1657 tmp = pci_find_upstream_pcie_bridge(pdev);
1658 if (!tmp)
1659 return 0;
1660 /* Secondary interface's bus number and devfn 0 */
1661 parent = pdev->bus->self;
1662 while (parent != tmp) {
1663 ret = domain_context_mapping_one(domain,
1664 pci_domain_nr(parent->bus),
1665 parent->bus->number,
1666 parent->devfn, translation);
1667 if (ret)
1668 return ret;
1669 parent = parent->bus->self;
1670 }
1671 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1672 return domain_context_mapping_one(domain,
1673 pci_domain_nr(tmp->subordinate),
1674 tmp->subordinate->number, 0,
1675 translation);
1676 else /* this is a legacy PCI bridge */
1677 return domain_context_mapping_one(domain,
1678 pci_domain_nr(tmp->bus),
1679 tmp->bus->number,
1680 tmp->devfn,
1681 translation);
1682}
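
/*
 * Note: for a device behind a PCIe-to-PCI bridge, the context entries of
 * every bridge on the upstream path (and of the bridge's secondary bus,
 * devfn 0) are programmed as well, because DMA from conventional PCI
 * devices may arrive tagged with the bridge's requester ID.
 */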
1683
1684static int domain_context_mapped(struct pci_dev *pdev)
1685{
1686 int ret;
1687 struct pci_dev *tmp, *parent;
1688 struct intel_iommu *iommu;
1689
1690 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1691 pdev->devfn);
1692 if (!iommu)
1693 return -ENODEV;
1694
1695 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1696 if (!ret)
1697 return ret;
1698 /* dependent device mapping */
1699 tmp = pci_find_upstream_pcie_bridge(pdev);
1700 if (!tmp)
1701 return ret;
1702 /* Secondary interface's bus number and devfn 0 */
1703 parent = pdev->bus->self;
1704 while (parent != tmp) {
1705 ret = device_context_mapped(iommu, parent->bus->number,
1706 parent->devfn);
1707 if (!ret)
1708 return ret;
1709 parent = parent->bus->self;
1710 }
1711 if (pci_is_pcie(tmp))
1712 return device_context_mapped(iommu, tmp->subordinate->number,
1713 0);
1714 else
1715 return device_context_mapped(iommu, tmp->bus->number,
1716 tmp->devfn);
1717}
1718
1719/* Returns the number of VTD pages, but aligned to MM page size */
1720static inline unsigned long aligned_nrpages(unsigned long host_addr,
1721 size_t size)
1722{
1723 host_addr &= ~PAGE_MASK;
1724 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1725}
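
/*
 * Example (assuming 4KiB MM pages): a 0x1000-byte buffer starting at
 * offset 0x800 within a page straddles two pages, so aligned_nrpages()
 * returns 2.
 */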
1726
1727/* Return largest possible superpage level for a given mapping */
1728static inline int hardware_largepage_caps(struct dmar_domain *domain,
1729 unsigned long iov_pfn,
1730 unsigned long phy_pfn,
1731 unsigned long pages)
1732{
1733 int support, level = 1;
1734 unsigned long pfnmerge;
1735
1736 support = domain->iommu_superpage;
1737
1738 /* To use a large page, the virtual *and* physical addresses
1739 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1740 of them will mean we have to use smaller pages. So just
1741 merge them and check both at once. */
1742 pfnmerge = iov_pfn | phy_pfn;
1743
1744 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1745 pages >>= VTD_STRIDE_SHIFT;
1746 if (!pages)
1747 break;
1748 pfnmerge >>= VTD_STRIDE_SHIFT;
1749 level++;
1750 support--;
1751 }
1752 return level;
1753}
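
/*
 * Example (sketch, assuming the usual 9-bit VTD_STRIDE_SHIFT): with
 * domain->iommu_superpage == 1 (2MiB pages), a mapping whose iov_pfn and
 * phy_pfn are both 512-page aligned and which covers at least 512 pages
 * gets level 2; anything smaller or misaligned stays at level 1 (4KiB).
 */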
1754
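/*
 * Core mapping helper: callers pass either a scatterlist (sg != NULL) or
 * a contiguous range starting at phys_pfn, never both; see the
 * domain_sg_mapping() and domain_pfn_mapping() wrappers below.
 */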
1755static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1756 struct scatterlist *sg, unsigned long phys_pfn,
1757 unsigned long nr_pages, int prot)
1758{
1759 struct dma_pte *first_pte = NULL, *pte = NULL;
1760 phys_addr_t uninitialized_var(pteval);
1761 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1762 unsigned long sg_res;
1763 unsigned int largepage_lvl = 0;
1764 unsigned long lvl_pages = 0;
1765
1766 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1767
1768 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1769 return -EINVAL;
1770
1771 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1772
1773 if (sg)
1774 sg_res = 0;
1775 else {
1776 sg_res = nr_pages + 1;
1777 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1778 }
1779
1780 while (nr_pages > 0) {
1781 uint64_t tmp;
1782
1783 if (!sg_res) {
1784 sg_res = aligned_nrpages(sg->offset, sg->length);
1785 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1786 sg->dma_length = sg->length;
1787 pteval = page_to_phys(sg_page(sg)) | prot;
1788 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1789 }
1790
1791 if (!pte) {
1792 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1793
1794 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1795 if (!pte)
1796 return -ENOMEM;
1797 /* It is a large page */
1798 if (largepage_lvl > 1)
1799 pteval |= DMA_PTE_LARGE_PAGE;
1800 else
1801 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1802
1803 }
1804 /* We don't need lock here, nobody else
1805 * touches the iova range
1806 */
1807 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1808 if (tmp) {
1809 static int dumps = 5;
1810 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1811 iov_pfn, tmp, (unsigned long long)pteval);
1812 if (dumps) {
1813 dumps--;
1814 debug_dma_dump_mappings(NULL);
1815 }
1816 WARN_ON(1);
1817 }
1818
1819 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1820
1821 BUG_ON(nr_pages < lvl_pages);
1822 BUG_ON(sg_res < lvl_pages);
1823
1824 nr_pages -= lvl_pages;
1825 iov_pfn += lvl_pages;
1826 phys_pfn += lvl_pages;
1827 pteval += lvl_pages * VTD_PAGE_SIZE;
1828 sg_res -= lvl_pages;
1829
1830 /* If the next PTE would be the first in a new page, then we
1831 need to flush the cache on the entries we've just written.
1832 And then we'll need to recalculate 'pte', so clear it and
1833 let it get set again in the if (!pte) block above.
1834
1835 If we're done (!nr_pages) we need to flush the cache too.
1836
1837 Also if we've been setting superpages, we may need to
1838 recalculate 'pte' and switch back to smaller pages for the
1839 end of the mapping, if the trailing size is not enough to
1840 use another superpage (i.e. sg_res < lvl_pages). */
1841 pte++;
1842 if (!nr_pages || first_pte_in_page(pte) ||
1843 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1844 domain_flush_cache(domain, first_pte,
1845 (void *)pte - (void *)first_pte);
1846 pte = NULL;
1847 }
1848
1849 if (!sg_res && nr_pages)
1850 sg = sg_next(sg);
1851 }
1852 return 0;
1853}
1854
1855static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1856 struct scatterlist *sg, unsigned long nr_pages,
1857 int prot)
1858{
1859 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1860}
1861
1862static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1863 unsigned long phys_pfn, unsigned long nr_pages,
1864 int prot)
1865{
1866 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1867}
1868
1869static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1870{
1871 if (!iommu)
1872 return;
1873
1874 clear_context_table(iommu, bus, devfn);
1875 iommu->flush.flush_context(iommu, 0, 0, 0,
1876 DMA_CCMD_GLOBAL_INVL);
1877 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1878}
1879
1880static void domain_remove_dev_info(struct dmar_domain *domain)
1881{
1882 struct device_domain_info *info;
1883 unsigned long flags;
1884 struct intel_iommu *iommu;
1885
1886 spin_lock_irqsave(&device_domain_lock, flags);
1887 while (!list_empty(&domain->devices)) {
1888 info = list_entry(domain->devices.next,
1889 struct device_domain_info, link);
1890 list_del(&info->link);
1891 list_del(&info->global);
1892 if (info->dev)
1893 info->dev->dev.archdata.iommu = NULL;
1894 spin_unlock_irqrestore(&device_domain_lock, flags);
1895
1896 iommu_disable_dev_iotlb(info);
1897 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1898 iommu_detach_dev(iommu, info->bus, info->devfn);
1899 free_devinfo_mem(info);
1900
1901 spin_lock_irqsave(&device_domain_lock, flags);
1902 }
1903 spin_unlock_irqrestore(&device_domain_lock, flags);
1904}
1905
1906/*
1907 * find_domain
1908 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1909 */
1910static struct dmar_domain *
1911find_domain(struct pci_dev *pdev)
1912{
1913 struct device_domain_info *info;
1914
1915 /* No lock here, assumes no domain exit in normal case */
1916 info = pdev->dev.archdata.iommu;
1917 if (info)
1918 return info->domain;
1919 return NULL;
1920}
1921
1922/* domain is initialized */
1923static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1924{
1925 struct dmar_domain *domain, *found = NULL;
1926 struct intel_iommu *iommu;
1927 struct dmar_drhd_unit *drhd;
1928 struct device_domain_info *info, *tmp;
1929 struct pci_dev *dev_tmp;
1930 unsigned long flags;
1931 int bus = 0, devfn = 0;
1932 int segment;
1933 int ret;
1934
1935 domain = find_domain(pdev);
1936 if (domain)
1937 return domain;
1938
1939 segment = pci_domain_nr(pdev->bus);
1940
1941 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1942 if (dev_tmp) {
1943 if (pci_is_pcie(dev_tmp)) {
1944 bus = dev_tmp->subordinate->number;
1945 devfn = 0;
1946 } else {
1947 bus = dev_tmp->bus->number;
1948 devfn = dev_tmp->devfn;
1949 }
1950 spin_lock_irqsave(&device_domain_lock, flags);
1951 list_for_each_entry(info, &device_domain_list, global) {
1952 if (info->segment == segment &&
1953 info->bus == bus && info->devfn == devfn) {
1954 found = info->domain;
1955 break;
1956 }
1957 }
1958 spin_unlock_irqrestore(&device_domain_lock, flags);
1959 /* pcie-pci bridge already has a domain, use it */
1960 if (found) {
1961 domain = found;
1962 goto found_domain;
1963 }
1964 }
1965
1966 domain = alloc_domain();
1967 if (!domain)
1968 goto error;
1969
1970 /* Allocate new domain for the device */
1971 drhd = dmar_find_matched_drhd_unit(pdev);
1972 if (!drhd) {
1973 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1974 pci_name(pdev));
1975 return NULL;
1976 }
1977 iommu = drhd->iommu;
1978
1979 ret = iommu_attach_domain(domain, iommu);
1980 if (ret) {
1981 free_domain_mem(domain);
1982 goto error;
1983 }
1984
1985 if (domain_init(domain, gaw)) {
1986 domain_exit(domain);
1987 goto error;
1988 }
1989
1990 /* register pcie-to-pci device */
1991 if (dev_tmp) {
1992 info = alloc_devinfo_mem();
1993 if (!info) {
1994 domain_exit(domain);
1995 goto error;
1996 }
1997 info->segment = segment;
1998 info->bus = bus;
1999 info->devfn = devfn;
2000 info->dev = NULL;
2001 info->domain = domain;
2002 /* This domain is shared by devices under p2p bridge */
2003 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2004
2005 /* pcie-to-pci bridge already has a domain, use it */
2006 found = NULL;
2007 spin_lock_irqsave(&device_domain_lock, flags);
2008 list_for_each_entry(tmp, &device_domain_list, global) {
2009 if (tmp->segment == segment &&
2010 tmp->bus == bus && tmp->devfn == devfn) {
2011 found = tmp->domain;
2012 break;
2013 }
2014 }
2015 if (found) {
2016 spin_unlock_irqrestore(&device_domain_lock, flags);
2017 free_devinfo_mem(info);
2018 domain_exit(domain);
2019 domain = found;
2020 } else {
2021 list_add(&info->link, &domain->devices);
2022 list_add(&info->global, &device_domain_list);
2023 spin_unlock_irqrestore(&device_domain_lock, flags);
2024 }
2025 }
2026
2027found_domain:
2028 info = alloc_devinfo_mem();
2029 if (!info)
2030 goto error;
2031 info->segment = segment;
2032 info->bus = pdev->bus->number;
2033 info->devfn = pdev->devfn;
2034 info->dev = pdev;
2035 info->domain = domain;
2036 spin_lock_irqsave(&device_domain_lock, flags);
 2037 /* somebody else raced us and set it up already */
2038 found = find_domain(pdev);
2039 if (found != NULL) {
2040 spin_unlock_irqrestore(&device_domain_lock, flags);
2041 if (found != domain) {
2042 domain_exit(domain);
2043 domain = found;
2044 }
2045 free_devinfo_mem(info);
2046 return domain;
2047 }
2048 list_add(&info->link, &domain->devices);
2049 list_add(&info->global, &device_domain_list);
2050 pdev->dev.archdata.iommu = info;
2051 spin_unlock_irqrestore(&device_domain_lock, flags);
2052 return domain;
2053error:
 2054 /* recheck here: another thread may have set it up meanwhile */
2055 return find_domain(pdev);
2056}
2057
2058static int iommu_identity_mapping;
2059#define IDENTMAP_ALL 1
2060#define IDENTMAP_GFX 2
2061#define IDENTMAP_AZALIA 4
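/*
 * Illustrative note (not from the original source): these flags are ORed into
 * iommu_identity_mapping elsewhere in this file -- "iommu=pt" leads
 * init_dmars() to set IDENTMAP_ALL, CONFIG_DMAR_BROKEN_GFX_WA adds
 * IDENTMAP_GFX, and check_tylersburg_isoch() adds IDENTMAP_AZALIA -- and are
 * then tested individually in iommu_should_identity_map(), e.g.:
 *
 *	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
 *		return 1;
 */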
2062
2063static int iommu_domain_identity_map(struct dmar_domain *domain,
2064 unsigned long long start,
2065 unsigned long long end)
2066{
2067 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2068 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2069
2070 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2071 dma_to_mm_pfn(last_vpfn))) {
2072 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2073 return -ENOMEM;
2074 }
2075
2076 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2077 start, end, domain->id);
2078 /*
2079 * RMRR range might have overlap with physical memory range,
2080 * clear it first
2081 */
2082 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2083
2084 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2085 last_vpfn - first_vpfn + 1,
2086 DMA_PTE_READ|DMA_PTE_WRITE);
2087}
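/*
 * Worked example (illustrative, hypothetical addresses): for an RMRR covering
 * 0xbf800000..0xbfffffff, first_vpfn is 0xbf800 and last_vpfn is 0xbffff with
 * 4KiB VT-d pages, so iommu_domain_identity_map() above reserves that IOVA
 * range and installs 0x800 page table entries mapping each page 1:1
 * (IOVA == physical address).
 */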
2088
2089static int iommu_prepare_identity_map(struct pci_dev *pdev,
2090 unsigned long long start,
2091 unsigned long long end)
2092{
2093 struct dmar_domain *domain;
2094 int ret;
2095
2096 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2097 if (!domain)
2098 return -ENOMEM;
2099
 2100 /* For _hardware_ passthrough, don't bother. But for software
 2101 passthrough, we do it anyway -- it may indicate a memory
 2102 range which is reserved in E820 and therefore didn't get set
 2103 up in si_domain to start with */
2104 if (domain == si_domain && hw_pass_through) {
 2105 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2106 pci_name(pdev), start, end);
2107 return 0;
2108 }
2109
2110 printk(KERN_INFO
2111 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2112 pci_name(pdev), start, end);
2113
2114 if (end < start) {
2115 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2116 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2117 dmi_get_system_info(DMI_BIOS_VENDOR),
2118 dmi_get_system_info(DMI_BIOS_VERSION),
2119 dmi_get_system_info(DMI_PRODUCT_VERSION));
2120 ret = -EIO;
2121 goto error;
2122 }
2123
2124 if (end >> agaw_to_width(domain->agaw)) {
2125 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2126 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2127 agaw_to_width(domain->agaw),
2128 dmi_get_system_info(DMI_BIOS_VENDOR),
2129 dmi_get_system_info(DMI_BIOS_VERSION),
2130 dmi_get_system_info(DMI_PRODUCT_VERSION));
2131 ret = -EIO;
2132 goto error;
2133 }
2134
2135 ret = iommu_domain_identity_map(domain, start, end);
2136 if (ret)
2137 goto error;
2138
2139 /* context entry init */
2140 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2141 if (ret)
2142 goto error;
2143
2144 return 0;
2145
2146 error:
2147 domain_exit(domain);
2148 return ret;
2149}
2150
2151static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2152 struct pci_dev *pdev)
2153{
2154 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2155 return 0;
2156 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2157 rmrr->end_address);
2158}
2159
2160#ifdef CONFIG_DMAR_FLOPPY_WA
2161static inline void iommu_prepare_isa(void)
2162{
2163 struct pci_dev *pdev;
2164 int ret;
2165
2166 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2167 if (!pdev)
2168 return;
2169
2170 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2171 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2172
2173 if (ret)
2174 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2175 "floppy might not work\n");
2176
2177}
2178#else
2179static inline void iommu_prepare_isa(void)
2180{
2181 return;
2182}
 2183#endif /* !CONFIG_DMAR_FLOPPY_WA */
2184
2185static int md_domain_init(struct dmar_domain *domain, int guest_width);
2186
2187static int __init si_domain_work_fn(unsigned long start_pfn,
2188 unsigned long end_pfn, void *datax)
2189{
2190 int *ret = datax;
2191
2192 *ret = iommu_domain_identity_map(si_domain,
2193 (uint64_t)start_pfn << PAGE_SHIFT,
2194 (uint64_t)end_pfn << PAGE_SHIFT);
2195 return *ret;
2196
2197}
2198
2199static int __init si_domain_init(int hw)
2200{
2201 struct dmar_drhd_unit *drhd;
2202 struct intel_iommu *iommu;
2203 int nid, ret = 0;
2204
2205 si_domain = alloc_domain();
2206 if (!si_domain)
2207 return -EFAULT;
2208
2209 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2210
2211 for_each_active_iommu(iommu, drhd) {
2212 ret = iommu_attach_domain(si_domain, iommu);
2213 if (ret) {
2214 domain_exit(si_domain);
2215 return -EFAULT;
2216 }
2217 }
2218
2219 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2220 domain_exit(si_domain);
2221 return -EFAULT;
2222 }
2223
2224 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2225
2226 if (hw)
2227 return 0;
2228
2229 for_each_online_node(nid) {
2230 work_with_active_regions(nid, si_domain_work_fn, &ret);
2231 if (ret)
2232 return ret;
2233 }
2234
2235 return 0;
2236}
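/*
 * Summary note (editorial, derived from the function above): the static
 * identity domain is attached to every active IOMMU and, unless hardware
 * pass-through is in use (hw != 0), work_with_active_regions() walks every
 * usable RAM range on each online node and maps it 1:1 through
 * si_domain_work_fn(), so devices later placed in si_domain can DMA to any
 * physical address without per-buffer mappings.
 */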
2237
2238static void domain_remove_one_dev_info(struct dmar_domain *domain,
2239 struct pci_dev *pdev);
2240static int identity_mapping(struct pci_dev *pdev)
2241{
2242 struct device_domain_info *info;
2243
2244 if (likely(!iommu_identity_mapping))
2245 return 0;
2246
2247 info = pdev->dev.archdata.iommu;
2248 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2249 return (info->domain == si_domain);
2250
2251 return 0;
2252}
2253
2254static int domain_add_dev_info(struct dmar_domain *domain,
2255 struct pci_dev *pdev,
2256 int translation)
2257{
2258 struct device_domain_info *info;
2259 unsigned long flags;
2260 int ret;
2261
2262 info = alloc_devinfo_mem();
2263 if (!info)
2264 return -ENOMEM;
2265
2266 ret = domain_context_mapping(domain, pdev, translation);
2267 if (ret) {
2268 free_devinfo_mem(info);
2269 return ret;
2270 }
2271
2272 info->segment = pci_domain_nr(pdev->bus);
2273 info->bus = pdev->bus->number;
2274 info->devfn = pdev->devfn;
2275 info->dev = pdev;
2276 info->domain = domain;
2277
2278 spin_lock_irqsave(&device_domain_lock, flags);
2279 list_add(&info->link, &domain->devices);
2280 list_add(&info->global, &device_domain_list);
2281 pdev->dev.archdata.iommu = info;
2282 spin_unlock_irqrestore(&device_domain_lock, flags);
2283
2284 return 0;
2285}
2286
2287static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2288{
2289 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2290 return 1;
2291
2292 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2293 return 1;
2294
2295 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2296 return 0;
2297
2298 /*
2299 * We want to start off with all devices in the 1:1 domain, and
2300 * take them out later if we find they can't access all of memory.
2301 *
2302 * However, we can't do this for PCI devices behind bridges,
2303 * because all PCI devices behind the same bridge will end up
2304 * with the same source-id on their transactions.
2305 *
2306 * Practically speaking, we can't change things around for these
2307 * devices at run-time, because we can't be sure there'll be no
2308 * DMA transactions in flight for any of their siblings.
2309 *
2310 * So PCI devices (unless they're on the root bus) as well as
2311 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2312 * the 1:1 domain, just in _case_ one of their siblings turns out
2313 * not to be able to map all of memory.
2314 */
2315 if (!pci_is_pcie(pdev)) {
2316 if (!pci_is_root_bus(pdev->bus))
2317 return 0;
2318 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2319 return 0;
2320 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2321 return 0;
2322
2323 /*
2324 * At boot time, we don't yet know if devices will be 64-bit capable.
2325 * Assume that they will -- if they turn out not to be, then we can
2326 * take them out of the 1:1 domain later.
2327 */
2328 if (!startup) {
2329 /*
2330 * If the device's dma_mask is less than the system's memory
2331 * size then this is not a candidate for identity mapping.
2332 */
2333 u64 dma_mask = pdev->dma_mask;
2334
2335 if (pdev->dev.coherent_dma_mask &&
2336 pdev->dev.coherent_dma_mask < dma_mask)
2337 dma_mask = pdev->dev.coherent_dma_mask;
2338
2339 return dma_mask >= dma_get_required_mask(&pdev->dev);
2340 }
2341
2342 return 1;
2343}
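/*
 * Illustrative examples of the policy above (device names hypothetical): with
 * "iommu=pt" (IDENTMAP_ALL), a PCIe NIC on the root complex is identity
 * mapped, while a conventional PCI device behind a PCIe-to-PCI bridge is not,
 * because it shares its source-id with its siblings. At run time
 * (startup == 0) a device whose DMA mask cannot reach all of memory -- e.g. a
 * 32-bit-only device on a machine with more than 4GiB of RAM -- is also
 * rejected, and iommu_no_mapping() then drops it from si_domain:
 *
 *	if (!iommu_should_identity_map(pdev, 0))
 *		domain_remove_one_dev_info(si_domain, pdev);
 */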
2344
2345static int __init iommu_prepare_static_identity_mapping(int hw)
2346{
2347 struct pci_dev *pdev = NULL;
2348 int ret;
2349
2350 ret = si_domain_init(hw);
2351 if (ret)
2352 return -EFAULT;
2353
2354 for_each_pci_dev(pdev) {
2355 /* Skip Host/PCI Bridge devices */
2356 if (IS_BRIDGE_HOST_DEVICE(pdev))
2357 continue;
2358 if (iommu_should_identity_map(pdev, 1)) {
2359 printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2360 hw ? "hardware" : "software", pci_name(pdev));
2361
2362 ret = domain_add_dev_info(si_domain, pdev,
2363 hw ? CONTEXT_TT_PASS_THROUGH :
2364 CONTEXT_TT_MULTI_LEVEL);
2365 if (ret)
2366 return ret;
2367 }
2368 }
2369
2370 return 0;
2371}
2372
2373static int __init init_dmars(void)
2374{
2375 struct dmar_drhd_unit *drhd;
2376 struct dmar_rmrr_unit *rmrr;
2377 struct pci_dev *pdev;
2378 struct intel_iommu *iommu;
2379 int i, ret;
2380
2381 /*
2382 * for each drhd
2383 * allocate root
2384 * initialize and program root entry to not present
2385 * endfor
2386 */
2387 for_each_drhd_unit(drhd) {
2388 g_num_of_iommus++;
2389 /*
2390 * lock not needed as this is only incremented in the single
 2391 * threaded kernel __init code path; all other accesses are
 2392 * read only
2393 */
2394 }
2395
2396 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2397 GFP_KERNEL);
2398 if (!g_iommus) {
2399 printk(KERN_ERR "Allocating global iommu array failed\n");
2400 ret = -ENOMEM;
2401 goto error;
2402 }
2403
2404 deferred_flush = kzalloc(g_num_of_iommus *
2405 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2406 if (!deferred_flush) {
2407 ret = -ENOMEM;
2408 goto error;
2409 }
2410
2411 for_each_drhd_unit(drhd) {
2412 if (drhd->ignored)
2413 continue;
2414
2415 iommu = drhd->iommu;
2416 g_iommus[iommu->seq_id] = iommu;
2417
2418 ret = iommu_init_domains(iommu);
2419 if (ret)
2420 goto error;
2421
2422 /*
2423 * TBD:
2424 * we could share the same root & context tables
2425 * among all IOMMU's. Need to Split it later.
2426 */
2427 ret = iommu_alloc_root_entry(iommu);
2428 if (ret) {
2429 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2430 goto error;
2431 }
2432 if (!ecap_pass_through(iommu->ecap))
2433 hw_pass_through = 0;
2434 }
2435
2436 /*
 2437 * Start from a sane iommu hardware state.
2438 */
2439 for_each_drhd_unit(drhd) {
2440 if (drhd->ignored)
2441 continue;
2442
2443 iommu = drhd->iommu;
2444
2445 /*
2446 * If the queued invalidation is already initialized by us
2447 * (for example, while enabling interrupt-remapping) then
 2448 * things are already rolling from a sane state.
2449 */
2450 if (iommu->qi)
2451 continue;
2452
2453 /*
2454 * Clear any previous faults.
2455 */
2456 dmar_fault(-1, iommu);
2457 /*
2458 * Disable queued invalidation if supported and already enabled
2459 * before OS handover.
2460 */
2461 dmar_disable_qi(iommu);
2462 }
2463
2464 for_each_drhd_unit(drhd) {
2465 if (drhd->ignored)
2466 continue;
2467
2468 iommu = drhd->iommu;
2469
2470 if (dmar_enable_qi(iommu)) {
2471 /*
2472 * Queued Invalidate not enabled, use Register Based
2473 * Invalidate
2474 */
2475 iommu->flush.flush_context = __iommu_flush_context;
2476 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2477 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2478 "invalidation\n",
2479 iommu->seq_id,
2480 (unsigned long long)drhd->reg_base_addr);
2481 } else {
2482 iommu->flush.flush_context = qi_flush_context;
2483 iommu->flush.flush_iotlb = qi_flush_iotlb;
2484 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2485 "invalidation\n",
2486 iommu->seq_id,
2487 (unsigned long long)drhd->reg_base_addr);
2488 }
2489 }
2490
2491 if (iommu_pass_through)
2492 iommu_identity_mapping |= IDENTMAP_ALL;
2493
2494#ifdef CONFIG_DMAR_BROKEN_GFX_WA
2495 iommu_identity_mapping |= IDENTMAP_GFX;
2496#endif
2497
2498 check_tylersburg_isoch();
2499
2500 /*
 2501 * If identity mapping was requested (pass-through, or the gfx/Azalia
 2502 * workarounds), set up the static identity domain now. Context entries
 2503 * for RMRR ranges and the ISA workaround are set up further below.
2504 */
2505 if (iommu_identity_mapping) {
2506 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2507 if (ret) {
2508 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2509 goto error;
2510 }
2511 }
2512 /*
2513 * For each rmrr
2514 * for each dev attached to rmrr
2515 * do
2516 * locate drhd for dev, alloc domain for dev
2517 * allocate free domain
2518 * allocate page table entries for rmrr
2519 * if context not allocated for bus
2520 * allocate and init context
2521 * set present in root table for this bus
2522 * init context with domain, translation etc
2523 * endfor
2524 * endfor
2525 */
2526 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2527 for_each_rmrr_units(rmrr) {
2528 for (i = 0; i < rmrr->devices_cnt; i++) {
2529 pdev = rmrr->devices[i];
2530 /*
 2531 * some BIOSes list non-existent devices in the
 2532 * DMAR table.
2533 */
2534 if (!pdev)
2535 continue;
2536 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2537 if (ret)
2538 printk(KERN_ERR
2539 "IOMMU: mapping reserved region failed\n");
2540 }
2541 }
2542
2543 iommu_prepare_isa();
2544
2545 /*
2546 * for each drhd
2547 * enable fault log
2548 * global invalidate context cache
2549 * global invalidate iotlb
2550 * enable translation
2551 */
2552 for_each_drhd_unit(drhd) {
2553 if (drhd->ignored) {
2554 /*
2555 * we always have to disable PMRs or DMA may fail on
2556 * this device
2557 */
2558 if (force_on)
2559 iommu_disable_protect_mem_regions(drhd->iommu);
2560 continue;
2561 }
2562 iommu = drhd->iommu;
2563
2564 iommu_flush_write_buffer(iommu);
2565
2566 ret = dmar_set_interrupt(iommu);
2567 if (ret)
2568 goto error;
2569
2570 iommu_set_root_entry(iommu);
2571
2572 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2573 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2574
2575 ret = iommu_enable_translation(iommu);
2576 if (ret)
2577 goto error;
2578
2579 iommu_disable_protect_mem_regions(iommu);
2580 }
2581
2582 return 0;
2583error:
2584 for_each_drhd_unit(drhd) {
2585 if (drhd->ignored)
2586 continue;
2587 iommu = drhd->iommu;
2588 free_iommu(iommu);
2589 }
2590 kfree(g_iommus);
2591 return ret;
2592}
2593
2594/* This takes a number of _MM_ pages, not VTD pages */
2595static struct iova *intel_alloc_iova(struct device *dev,
2596 struct dmar_domain *domain,
2597 unsigned long nrpages, uint64_t dma_mask)
2598{
2599 struct pci_dev *pdev = to_pci_dev(dev);
2600 struct iova *iova = NULL;
2601
2602 /* Restrict dma_mask to the width that the iommu can handle */
2603 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2604
2605 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2606 /*
2607 * First try to allocate an io virtual address in
2608 * DMA_BIT_MASK(32) and if that fails then try allocating
2609 * from higher range
2610 */
2611 iova = alloc_iova(&domain->iovad, nrpages,
2612 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2613 if (iova)
2614 return iova;
2615 }
2616 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2617 if (unlikely(!iova)) {
2618 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2619 nrpages, pci_name(pdev));
2620 return NULL;
2621 }
2622
2623 return iova;
2624}
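/*
 * Sketch of the allocation policy above (editorial): for a device with a
 * 64-bit dma_mask and dmar_forcedac clear, the first alloc_iova() call tries
 * to place the range below 4GiB (IOVA_PFN(DMA_BIT_MASK(32))); only if that
 * space is exhausted does the second call allocate from the full mask, which
 * has already been clamped to DOMAIN_MAX_ADDR(domain->gaw). Booting with
 * "intel_iommu=forcedac" skips the low 4GiB attempt entirely.
 */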
2625
2626static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2627{
2628 struct dmar_domain *domain;
2629 int ret;
2630
2631 domain = get_domain_for_dev(pdev,
2632 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2633 if (!domain) {
2634 printk(KERN_ERR
2635 "Allocating domain for %s failed", pci_name(pdev));
2636 return NULL;
2637 }
2638
2639 /* make sure context mapping is ok */
2640 if (unlikely(!domain_context_mapped(pdev))) {
2641 ret = domain_context_mapping(domain, pdev,
2642 CONTEXT_TT_MULTI_LEVEL);
2643 if (ret) {
2644 printk(KERN_ERR
2645 "Domain context map for %s failed",
2646 pci_name(pdev));
2647 return NULL;
2648 }
2649 }
2650
2651 return domain;
2652}
2653
2654static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2655{
2656 struct device_domain_info *info;
2657
2658 /* No lock here, assumes no domain exit in normal case */
2659 info = dev->dev.archdata.iommu;
2660 if (likely(info))
2661 return info->domain;
2662
2663 return __get_valid_domain_for_dev(dev);
2664}
2665
2666static int iommu_dummy(struct pci_dev *pdev)
2667{
2668 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2669}
2670
2671/* Check if the pdev needs to go through non-identity map and unmap process.*/
2672static int iommu_no_mapping(struct device *dev)
2673{
2674 struct pci_dev *pdev;
2675 int found;
2676
2677 if (unlikely(dev->bus != &pci_bus_type))
2678 return 1;
2679
2680 pdev = to_pci_dev(dev);
2681 if (iommu_dummy(pdev))
2682 return 1;
2683
2684 if (!iommu_identity_mapping)
2685 return 0;
2686
2687 found = identity_mapping(pdev);
2688 if (found) {
2689 if (iommu_should_identity_map(pdev, 0))
2690 return 1;
2691 else {
2692 /*
 2693 * A 32 bit DMA device is removed from si_domain and falls
 2694 * back to non-identity mapping.
2695 */
2696 domain_remove_one_dev_info(si_domain, pdev);
2697 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2698 pci_name(pdev));
2699 return 0;
2700 }
2701 } else {
2702 /*
 2703 * A 64 bit DMA device detached from a VM domain is put back
 2704 * into si_domain for identity mapping.
2705 */
2706 if (iommu_should_identity_map(pdev, 0)) {
2707 int ret;
2708 ret = domain_add_dev_info(si_domain, pdev,
2709 hw_pass_through ?
2710 CONTEXT_TT_PASS_THROUGH :
2711 CONTEXT_TT_MULTI_LEVEL);
2712 if (!ret) {
2713 printk(KERN_INFO "64bit %s uses identity mapping\n",
2714 pci_name(pdev));
2715 return 1;
2716 }
2717 }
2718 }
2719
2720 return 0;
2721}
2722
2723static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2724 size_t size, int dir, u64 dma_mask)
2725{
2726 struct pci_dev *pdev = to_pci_dev(hwdev);
2727 struct dmar_domain *domain;
2728 phys_addr_t start_paddr;
2729 struct iova *iova;
2730 int prot = 0;
2731 int ret;
2732 struct intel_iommu *iommu;
2733 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2734
2735 BUG_ON(dir == DMA_NONE);
2736
2737 if (iommu_no_mapping(hwdev))
2738 return paddr;
2739
2740 domain = get_valid_domain_for_dev(pdev);
2741 if (!domain)
2742 return 0;
2743
2744 iommu = domain_get_iommu(domain);
2745 size = aligned_nrpages(paddr, size);
2746
2747 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2748 if (!iova)
2749 goto error;
2750
2751 /*
2752 * Check if DMAR supports zero-length reads on write only
2753 * mappings..
2754 */
2755 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2756 !cap_zlr(iommu->cap))
2757 prot |= DMA_PTE_READ;
2758 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2759 prot |= DMA_PTE_WRITE;
2760 /*
 2761 * paddr..(paddr + size) might span a partial page, so map the whole
 2762 * page. Note: if two parts of one page are mapped separately, we
 2763 * might have two guest addresses mapping to the same host paddr, but
 2764 * this is not a big problem
2765 */
2766 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2767 mm_to_dma_pfn(paddr_pfn), size, prot);
2768 if (ret)
2769 goto error;
2770
2771 /* it's a non-present to present mapping. Only flush if caching mode */
2772 if (cap_caching_mode(iommu->cap))
2773 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2774 else
2775 iommu_flush_write_buffer(iommu);
2776
2777 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2778 start_paddr += paddr & ~PAGE_MASK;
2779 return start_paddr;
2780
2781error:
2782 if (iova)
2783 __free_iova(&domain->iovad, iova);
2784 printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
2785 pci_name(pdev), size, (unsigned long long)paddr, dir);
2786 return 0;
2787}
2788
2789static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2790 unsigned long offset, size_t size,
2791 enum dma_data_direction dir,
2792 struct dma_attrs *attrs)
2793{
2794 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2795 dir, to_pci_dev(dev)->dma_mask);
2796}
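/*
 * Illustrative driver-side usage (buffer and length are hypothetical): a PCI
 * driver never calls intel_map_page() directly; it uses the generic DMA API,
 * which dispatches through intel_dma_ops once intel_iommu_init() has
 * installed it:
 *
 *	dma_addr_t handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, handle))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
 */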
2797
2798static void flush_unmaps(void)
2799{
2800 int i, j;
2801
2802 timer_on = 0;
2803
2804 /* just flush them all */
2805 for (i = 0; i < g_num_of_iommus; i++) {
2806 struct intel_iommu *iommu = g_iommus[i];
2807 if (!iommu)
2808 continue;
2809
2810 if (!deferred_flush[i].next)
2811 continue;
2812
 2813 /* In caching mode, global flushes make emulation expensive */
2814 if (!cap_caching_mode(iommu->cap))
2815 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2816 DMA_TLB_GLOBAL_FLUSH);
2817 for (j = 0; j < deferred_flush[i].next; j++) {
2818 unsigned long mask;
2819 struct iova *iova = deferred_flush[i].iova[j];
2820 struct dmar_domain *domain = deferred_flush[i].domain[j];
2821
2822 /* On real hardware multiple invalidations are expensive */
2823 if (cap_caching_mode(iommu->cap))
2824 iommu_flush_iotlb_psi(iommu, domain->id,
2825 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2826 else {
2827 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2828 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2829 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2830 }
2831 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2832 }
2833 deferred_flush[i].next = 0;
2834 }
2835
2836 list_size = 0;
2837}
2838
2839static void flush_unmaps_timeout(unsigned long data)
2840{
2841 unsigned long flags;
2842
2843 spin_lock_irqsave(&async_umap_flush_lock, flags);
2844 flush_unmaps();
2845 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2846}
2847
2848static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2849{
2850 unsigned long flags;
2851 int next, iommu_id;
2852 struct intel_iommu *iommu;
2853
2854 spin_lock_irqsave(&async_umap_flush_lock, flags);
2855 if (list_size == HIGH_WATER_MARK)
2856 flush_unmaps();
2857
2858 iommu = domain_get_iommu(dom);
2859 iommu_id = iommu->seq_id;
2860
2861 next = deferred_flush[iommu_id].next;
2862 deferred_flush[iommu_id].domain[next] = dom;
2863 deferred_flush[iommu_id].iova[next] = iova;
2864 deferred_flush[iommu_id].next++;
2865
2866 if (!timer_on) {
2867 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2868 timer_on = 1;
2869 }
2870 list_size++;
2871 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2872}
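/*
 * Design note (editorial summary of the two functions above): unless
 * intel_iommu_strict is set, unmaps are not flushed synchronously. Each freed
 * IOVA is queued per-IOMMU in deferred_flush[], and the queue is drained
 * either when list_size reaches HIGH_WATER_MARK or when the 10ms unmap_timer
 * fires, so a single IOTLB flush covers many unmapped ranges at the cost of a
 * short window in which stale translations remain usable.
 */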
2873
2874static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2875 size_t size, enum dma_data_direction dir,
2876 struct dma_attrs *attrs)
2877{
2878 struct pci_dev *pdev = to_pci_dev(dev);
2879 struct dmar_domain *domain;
2880 unsigned long start_pfn, last_pfn;
2881 struct iova *iova;
2882 struct intel_iommu *iommu;
2883
2884 if (iommu_no_mapping(dev))
2885 return;
2886
2887 domain = find_domain(pdev);
2888 BUG_ON(!domain);
2889
2890 iommu = domain_get_iommu(domain);
2891
2892 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2893 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2894 (unsigned long long)dev_addr))
2895 return;
2896
2897 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2898 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2899
2900 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2901 pci_name(pdev), start_pfn, last_pfn);
2902
2903 /* clear the whole page */
2904 dma_pte_clear_range(domain, start_pfn, last_pfn);
2905
2906 /* free page tables */
2907 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2908
2909 if (intel_iommu_strict) {
2910 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2911 last_pfn - start_pfn + 1, 0);
2912 /* free iova */
2913 __free_iova(&domain->iovad, iova);
2914 } else {
2915 add_unmap(domain, iova);
2916 /*
 2917 * queue up the release of the mapping to save the roughly 1/6 of
 2918 * the cpu time otherwise spent on the iotlb flush operation...
2919 */
2920 }
2921}
2922
2923static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2924 dma_addr_t *dma_handle, gfp_t flags)
2925{
2926 void *vaddr;
2927 int order;
2928
2929 size = PAGE_ALIGN(size);
2930 order = get_order(size);
2931
2932 if (!iommu_no_mapping(hwdev))
2933 flags &= ~(GFP_DMA | GFP_DMA32);
2934 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2935 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2936 flags |= GFP_DMA;
2937 else
2938 flags |= GFP_DMA32;
2939 }
2940
2941 vaddr = (void *)__get_free_pages(flags, order);
2942 if (!vaddr)
2943 return NULL;
2944 memset(vaddr, 0, size);
2945
2946 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2947 DMA_BIDIRECTIONAL,
2948 hwdev->coherent_dma_mask);
2949 if (*dma_handle)
2950 return vaddr;
2951 free_pages((unsigned long)vaddr, order);
2952 return NULL;
2953}
2954
2955static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2956 dma_addr_t dma_handle)
2957{
2958 int order;
2959
2960 size = PAGE_ALIGN(size);
2961 order = get_order(size);
2962
2963 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2964 free_pages((unsigned long)vaddr, order);
2965}
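/*
 * Illustrative driver-side usage (names and sizes hypothetical): coherent
 * allocations also go through the generic DMA API and end up in the two
 * functions above:
 *
 *	dma_addr_t dma;
 *	void *desc = dma_alloc_coherent(&pdev->dev, ring_size, &dma, GFP_KERNEL);
 *	if (!desc)
 *		return -ENOMEM;
 *	...
 *	dma_free_coherent(&pdev->dev, ring_size, desc, dma);
 */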
2966
2967static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2968 int nelems, enum dma_data_direction dir,
2969 struct dma_attrs *attrs)
2970{
2971 struct pci_dev *pdev = to_pci_dev(hwdev);
2972 struct dmar_domain *domain;
2973 unsigned long start_pfn, last_pfn;
2974 struct iova *iova;
2975 struct intel_iommu *iommu;
2976
2977 if (iommu_no_mapping(hwdev))
2978 return;
2979
2980 domain = find_domain(pdev);
2981 BUG_ON(!domain);
2982
2983 iommu = domain_get_iommu(domain);
2984
2985 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2986 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2987 (unsigned long long)sglist[0].dma_address))
2988 return;
2989
2990 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2991 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2992
2993 /* clear the whole page */
2994 dma_pte_clear_range(domain, start_pfn, last_pfn);
2995
2996 /* free page tables */
2997 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2998
2999 if (intel_iommu_strict) {
3000 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3001 last_pfn - start_pfn + 1, 0);
3002 /* free iova */
3003 __free_iova(&domain->iovad, iova);
3004 } else {
3005 add_unmap(domain, iova);
3006 /*
 3007 * queue up the release of the mapping to save the roughly 1/6 of
 3008 * the cpu time otherwise spent on the iotlb flush operation...
3009 */
3010 }
3011}
3012
3013static int intel_nontranslate_map_sg(struct device *hddev,
3014 struct scatterlist *sglist, int nelems, int dir)
3015{
3016 int i;
3017 struct scatterlist *sg;
3018
3019 for_each_sg(sglist, sg, nelems, i) {
3020 BUG_ON(!sg_page(sg));
3021 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3022 sg->dma_length = sg->length;
3023 }
3024 return nelems;
3025}
3026
3027static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3028 enum dma_data_direction dir, struct dma_attrs *attrs)
3029{
3030 int i;
3031 struct pci_dev *pdev = to_pci_dev(hwdev);
3032 struct dmar_domain *domain;
3033 size_t size = 0;
3034 int prot = 0;
3035 struct iova *iova = NULL;
3036 int ret;
3037 struct scatterlist *sg;
3038 unsigned long start_vpfn;
3039 struct intel_iommu *iommu;
3040
3041 BUG_ON(dir == DMA_NONE);
3042 if (iommu_no_mapping(hwdev))
3043 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3044
3045 domain = get_valid_domain_for_dev(pdev);
3046 if (!domain)
3047 return 0;
3048
3049 iommu = domain_get_iommu(domain);
3050
3051 for_each_sg(sglist, sg, nelems, i)
3052 size += aligned_nrpages(sg->offset, sg->length);
3053
3054 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3055 pdev->dma_mask);
3056 if (!iova) {
3057 sglist->dma_length = 0;
3058 return 0;
3059 }
3060
3061 /*
3062 * Check if DMAR supports zero-length reads on write only
3063 * mappings..
3064 */
3065 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3066 !cap_zlr(iommu->cap))
3067 prot |= DMA_PTE_READ;
3068 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3069 prot |= DMA_PTE_WRITE;
3070
3071 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3072
3073 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3074 if (unlikely(ret)) {
3075 /* clear the page */
3076 dma_pte_clear_range(domain, start_vpfn,
3077 start_vpfn + size - 1);
3078 /* free page tables */
3079 dma_pte_free_pagetable(domain, start_vpfn,
3080 start_vpfn + size - 1);
3081 /* free iova */
3082 __free_iova(&domain->iovad, iova);
3083 return 0;
3084 }
3085
3086 /* it's a non-present to present mapping. Only flush if caching mode */
3087 if (cap_caching_mode(iommu->cap))
3088 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3089 else
3090 iommu_flush_write_buffer(iommu);
3091
3092 return nelems;
3093}
3094
3095static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3096{
3097 return !dma_addr;
3098}
3099
3100struct dma_map_ops intel_dma_ops = {
3101 .alloc_coherent = intel_alloc_coherent,
3102 .free_coherent = intel_free_coherent,
3103 .map_sg = intel_map_sg,
3104 .unmap_sg = intel_unmap_sg,
3105 .map_page = intel_map_page,
3106 .unmap_page = intel_unmap_page,
3107 .mapping_error = intel_mapping_error,
3108};
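/*
 * Editorial note: these ops become the system-wide dma_map_ops when
 * intel_iommu_init() below assigns "dma_ops = &intel_dma_ops", so every
 * dma_map_page()/dma_map_sg()/dma_alloc_coherent() call from an ordinary
 * driver is routed through the functions above once VT-d translation is
 * enabled.
 */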
3109
3110static inline int iommu_domain_cache_init(void)
3111{
3112 int ret = 0;
3113
3114 iommu_domain_cache = kmem_cache_create("iommu_domain",
3115 sizeof(struct dmar_domain),
3116 0,
3117 SLAB_HWCACHE_ALIGN,
3118
3119 NULL);
3120 if (!iommu_domain_cache) {
3121 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3122 ret = -ENOMEM;
3123 }
3124
3125 return ret;
3126}
3127
3128static inline int iommu_devinfo_cache_init(void)
3129{
3130 int ret = 0;
3131
3132 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3133 sizeof(struct device_domain_info),
3134 0,
3135 SLAB_HWCACHE_ALIGN,
3136 NULL);
3137 if (!iommu_devinfo_cache) {
3138 printk(KERN_ERR "Couldn't create devinfo cache\n");
3139 ret = -ENOMEM;
3140 }
3141
3142 return ret;
3143}
3144
3145static inline int iommu_iova_cache_init(void)
3146{
3147 int ret = 0;
3148
3149 iommu_iova_cache = kmem_cache_create("iommu_iova",
3150 sizeof(struct iova),
3151 0,
3152 SLAB_HWCACHE_ALIGN,
3153 NULL);
3154 if (!iommu_iova_cache) {
3155 printk(KERN_ERR "Couldn't create iova cache\n");
3156 ret = -ENOMEM;
3157 }
3158
3159 return ret;
3160}
3161
3162static int __init iommu_init_mempool(void)
3163{
3164 int ret;
3165 ret = iommu_iova_cache_init();
3166 if (ret)
3167 return ret;
3168
3169 ret = iommu_domain_cache_init();
3170 if (ret)
3171 goto domain_error;
3172
3173 ret = iommu_devinfo_cache_init();
3174 if (!ret)
3175 return ret;
3176
3177 kmem_cache_destroy(iommu_domain_cache);
3178domain_error:
3179 kmem_cache_destroy(iommu_iova_cache);
3180
3181 return -ENOMEM;
3182}
3183
3184static void __init iommu_exit_mempool(void)
3185{
3186 kmem_cache_destroy(iommu_devinfo_cache);
3187 kmem_cache_destroy(iommu_domain_cache);
3188 kmem_cache_destroy(iommu_iova_cache);
3189
3190}
3191
3192static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3193{
3194 struct dmar_drhd_unit *drhd;
3195 u32 vtbar;
3196 int rc;
3197
3198 /* We know that this device on this chipset has its own IOMMU.
3199 * If we find it under a different IOMMU, then the BIOS is lying
3200 * to us. Hope that the IOMMU for this device is actually
3201 * disabled, and it needs no translation...
3202 */
3203 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3204 if (rc) {
3205 /* "can't" happen */
3206 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3207 return;
3208 }
3209 vtbar &= 0xffff0000;
3210
 3211 /* we know that this iommu should be at offset 0xa000 from vtbar */
3212 drhd = dmar_find_matched_drhd_unit(pdev);
3213 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3214 TAINT_FIRMWARE_WORKAROUND,
3215 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3216 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3217}
3218DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3219
3220static void __init init_no_remapping_devices(void)
3221{
3222 struct dmar_drhd_unit *drhd;
3223
3224 for_each_drhd_unit(drhd) {
3225 if (!drhd->include_all) {
3226 int i;
3227 for (i = 0; i < drhd->devices_cnt; i++)
3228 if (drhd->devices[i] != NULL)
3229 break;
3230 /* ignore DMAR unit if no pci devices exist */
3231 if (i == drhd->devices_cnt)
3232 drhd->ignored = 1;
3233 }
3234 }
3235
3236 for_each_drhd_unit(drhd) {
3237 int i;
3238 if (drhd->ignored || drhd->include_all)
3239 continue;
3240
3241 for (i = 0; i < drhd->devices_cnt; i++)
3242 if (drhd->devices[i] &&
3243 !IS_GFX_DEVICE(drhd->devices[i]))
3244 break;
3245
3246 if (i < drhd->devices_cnt)
3247 continue;
3248
3249 /* This IOMMU has *only* gfx devices. Either bypass it or
3250 set the gfx_mapped flag, as appropriate */
3251 if (dmar_map_gfx) {
3252 intel_iommu_gfx_mapped = 1;
3253 } else {
3254 drhd->ignored = 1;
3255 for (i = 0; i < drhd->devices_cnt; i++) {
3256 if (!drhd->devices[i])
3257 continue;
3258 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3259 }
3260 }
3261 }
3262}
3263
3264#ifdef CONFIG_SUSPEND
3265static int init_iommu_hw(void)
3266{
3267 struct dmar_drhd_unit *drhd;
3268 struct intel_iommu *iommu = NULL;
3269
3270 for_each_active_iommu(iommu, drhd)
3271 if (iommu->qi)
3272 dmar_reenable_qi(iommu);
3273
3274 for_each_iommu(iommu, drhd) {
3275 if (drhd->ignored) {
3276 /*
3277 * we always have to disable PMRs or DMA may fail on
3278 * this device
3279 */
3280 if (force_on)
3281 iommu_disable_protect_mem_regions(iommu);
3282 continue;
3283 }
3284
3285 iommu_flush_write_buffer(iommu);
3286
3287 iommu_set_root_entry(iommu);
3288
3289 iommu->flush.flush_context(iommu, 0, 0, 0,
3290 DMA_CCMD_GLOBAL_INVL);
3291 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3292 DMA_TLB_GLOBAL_FLUSH);
3293 if (iommu_enable_translation(iommu))
3294 return 1;
3295 iommu_disable_protect_mem_regions(iommu);
3296 }
3297
3298 return 0;
3299}
3300
3301static void iommu_flush_all(void)
3302{
3303 struct dmar_drhd_unit *drhd;
3304 struct intel_iommu *iommu;
3305
3306 for_each_active_iommu(iommu, drhd) {
3307 iommu->flush.flush_context(iommu, 0, 0, 0,
3308 DMA_CCMD_GLOBAL_INVL);
3309 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3310 DMA_TLB_GLOBAL_FLUSH);
3311 }
3312}
3313
3314static int iommu_suspend(void)
3315{
3316 struct dmar_drhd_unit *drhd;
3317 struct intel_iommu *iommu = NULL;
3318 unsigned long flag;
3319
3320 for_each_active_iommu(iommu, drhd) {
3321 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3322 GFP_ATOMIC);
3323 if (!iommu->iommu_state)
3324 goto nomem;
3325 }
3326
3327 iommu_flush_all();
3328
3329 for_each_active_iommu(iommu, drhd) {
3330 iommu_disable_translation(iommu);
3331
3332 spin_lock_irqsave(&iommu->register_lock, flag);
3333
3334 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3335 readl(iommu->reg + DMAR_FECTL_REG);
3336 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3337 readl(iommu->reg + DMAR_FEDATA_REG);
3338 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3339 readl(iommu->reg + DMAR_FEADDR_REG);
3340 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3341 readl(iommu->reg + DMAR_FEUADDR_REG);
3342
3343 spin_unlock_irqrestore(&iommu->register_lock, flag);
3344 }
3345 return 0;
3346
3347nomem:
3348 for_each_active_iommu(iommu, drhd)
3349 kfree(iommu->iommu_state);
3350
3351 return -ENOMEM;
3352}
3353
3354static void iommu_resume(void)
3355{
3356 struct dmar_drhd_unit *drhd;
3357 struct intel_iommu *iommu = NULL;
3358 unsigned long flag;
3359
3360 if (init_iommu_hw()) {
3361 if (force_on)
3362 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3363 else
3364 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3365 return;
3366 }
3367
3368 for_each_active_iommu(iommu, drhd) {
3369
3370 spin_lock_irqsave(&iommu->register_lock, flag);
3371
3372 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3373 iommu->reg + DMAR_FECTL_REG);
3374 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3375 iommu->reg + DMAR_FEDATA_REG);
3376 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3377 iommu->reg + DMAR_FEADDR_REG);
3378 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3379 iommu->reg + DMAR_FEUADDR_REG);
3380
3381 spin_unlock_irqrestore(&iommu->register_lock, flag);
3382 }
3383
3384 for_each_active_iommu(iommu, drhd)
3385 kfree(iommu->iommu_state);
3386}
3387
3388static struct syscore_ops iommu_syscore_ops = {
3389 .resume = iommu_resume,
3390 .suspend = iommu_suspend,
3391};
3392
3393static void __init init_iommu_pm_ops(void)
3394{
3395 register_syscore_ops(&iommu_syscore_ops);
3396}
3397
3398#else
3399static inline void init_iommu_pm_ops(void) {}
 3400#endif /* CONFIG_SUSPEND */
3401
3402/*
 3403 * Here we only respond to the driver-unbind action.
 3404 *
 3405 * A newly added device is not attached to its DMAR domain here yet; that
 3406 * happens when the device is first mapped to an iova.
3407 */
3408static int device_notifier(struct notifier_block *nb,
3409 unsigned long action, void *data)
3410{
3411 struct device *dev = data;
3412 struct pci_dev *pdev = to_pci_dev(dev);
3413 struct dmar_domain *domain;
3414
3415 if (iommu_no_mapping(dev))
3416 return 0;
3417
3418 domain = find_domain(pdev);
3419 if (!domain)
3420 return 0;
3421
3422 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3423 domain_remove_one_dev_info(domain, pdev);
3424
3425 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3426 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3427 list_empty(&domain->devices))
3428 domain_exit(domain);
3429 }
3430
3431 return 0;
3432}
3433
3434static struct notifier_block device_nb = {
3435 .notifier_call = device_notifier,
3436};
3437
3438int __init intel_iommu_init(void)
3439{
3440 int ret = 0;
3441
3442 /* VT-d is required for a TXT/tboot launch, so enforce that */
3443 force_on = tboot_force_iommu();
3444
3445 if (dmar_table_init()) {
3446 if (force_on)
3447 panic("tboot: Failed to initialize DMAR table\n");
3448 return -ENODEV;
3449 }
3450
3451 if (dmar_dev_scope_init()) {
3452 if (force_on)
3453 panic("tboot: Failed to initialize DMAR device scope\n");
3454 return -ENODEV;
3455 }
3456
3457 /*
3458 * Check the need for DMA-remapping initialization now.
3459 * Above initialization will also be used by Interrupt-remapping.
3460 */
3461 if (no_iommu || dmar_disabled)
3462 return -ENODEV;
3463
3464 if (iommu_init_mempool()) {
3465 if (force_on)
3466 panic("tboot: Failed to initialize iommu memory\n");
3467 return -ENODEV;
3468 }
3469
3470 if (dmar_init_reserved_ranges()) {
3471 if (force_on)
3472 panic("tboot: Failed to reserve iommu ranges\n");
3473 return -ENODEV;
3474 }
3475
3476 init_no_remapping_devices();
3477
3478 ret = init_dmars();
3479 if (ret) {
3480 if (force_on)
3481 panic("tboot: Failed to initialize DMARs\n");
3482 printk(KERN_ERR "IOMMU: dmar init failed\n");
3483 put_iova_domain(&reserved_iova_list);
3484 iommu_exit_mempool();
3485 return ret;
3486 }
3487 printk(KERN_INFO
3488 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3489
3490 init_timer(&unmap_timer);
3491#ifdef CONFIG_SWIOTLB
3492 swiotlb = 0;
3493#endif
3494 dma_ops = &intel_dma_ops;
3495
3496 init_iommu_pm_ops();
3497
3498 register_iommu(&intel_iommu_ops);
3499
3500 bus_register_notifier(&pci_bus_type, &device_nb);
3501
3502 return 0;
3503}
3504
3505static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3506 struct pci_dev *pdev)
3507{
3508 struct pci_dev *tmp, *parent;
3509
3510 if (!iommu || !pdev)
3511 return;
3512
3513 /* dependent device detach */
3514 tmp = pci_find_upstream_pcie_bridge(pdev);
3515 /* Secondary interface's bus number and devfn 0 */
3516 if (tmp) {
3517 parent = pdev->bus->self;
3518 while (parent != tmp) {
3519 iommu_detach_dev(iommu, parent->bus->number,
3520 parent->devfn);
3521 parent = parent->bus->self;
3522 }
3523 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3524 iommu_detach_dev(iommu,
3525 tmp->subordinate->number, 0);
3526 else /* this is a legacy PCI bridge */
3527 iommu_detach_dev(iommu, tmp->bus->number,
3528 tmp->devfn);
3529 }
3530}
3531
3532static void domain_remove_one_dev_info(struct dmar_domain *domain,
3533 struct pci_dev *pdev)
3534{
3535 struct device_domain_info *info;
3536 struct intel_iommu *iommu;
3537 unsigned long flags;
3538 int found = 0;
3539 struct list_head *entry, *tmp;
3540
3541 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3542 pdev->devfn);
3543 if (!iommu)
3544 return;
3545
3546 spin_lock_irqsave(&device_domain_lock, flags);
3547 list_for_each_safe(entry, tmp, &domain->devices) {
3548 info = list_entry(entry, struct device_domain_info, link);
3549 if (info->segment == pci_domain_nr(pdev->bus) &&
3550 info->bus == pdev->bus->number &&
3551 info->devfn == pdev->devfn) {
3552 list_del(&info->link);
3553 list_del(&info->global);
3554 if (info->dev)
3555 info->dev->dev.archdata.iommu = NULL;
3556 spin_unlock_irqrestore(&device_domain_lock, flags);
3557
3558 iommu_disable_dev_iotlb(info);
3559 iommu_detach_dev(iommu, info->bus, info->devfn);
3560 iommu_detach_dependent_devices(iommu, pdev);
3561 free_devinfo_mem(info);
3562
3563 spin_lock_irqsave(&device_domain_lock, flags);
3564
3565 if (found)
3566 break;
3567 else
3568 continue;
3569 }
3570
 3571 /* if there are no other devices under the same iommu
 3572 * owned by this domain, clear this iommu in iommu_bmp,
 3573 * then update the iommu count and coherency
3574 */
3575 if (iommu == device_to_iommu(info->segment, info->bus,
3576 info->devfn))
3577 found = 1;
3578 }
3579
3580 spin_unlock_irqrestore(&device_domain_lock, flags);
3581
3582 if (found == 0) {
3583 unsigned long tmp_flags;
3584 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3585 clear_bit(iommu->seq_id, &domain->iommu_bmp);
3586 domain->iommu_count--;
3587 domain_update_iommu_cap(domain);
3588 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3589
3590 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3591 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
3592 spin_lock_irqsave(&iommu->lock, tmp_flags);
3593 clear_bit(domain->id, iommu->domain_ids);
3594 iommu->domains[domain->id] = NULL;
3595 spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3596 }
3597 }
3598}
3599
3600static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3601{
3602 struct device_domain_info *info;
3603 struct intel_iommu *iommu;
3604 unsigned long flags1, flags2;
3605
3606 spin_lock_irqsave(&device_domain_lock, flags1);
3607 while (!list_empty(&domain->devices)) {
3608 info = list_entry(domain->devices.next,
3609 struct device_domain_info, link);
3610 list_del(&info->link);
3611 list_del(&info->global);
3612 if (info->dev)
3613 info->dev->dev.archdata.iommu = NULL;
3614
3615 spin_unlock_irqrestore(&device_domain_lock, flags1);
3616
3617 iommu_disable_dev_iotlb(info);
3618 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3619 iommu_detach_dev(iommu, info->bus, info->devfn);
3620 iommu_detach_dependent_devices(iommu, info->dev);
3621
3622 /* clear this iommu in iommu_bmp, update iommu count
3623 * and capabilities
3624 */
3625 spin_lock_irqsave(&domain->iommu_lock, flags2);
3626 if (test_and_clear_bit(iommu->seq_id,
3627 &domain->iommu_bmp)) {
3628 domain->iommu_count--;
3629 domain_update_iommu_cap(domain);
3630 }
3631 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3632
3633 free_devinfo_mem(info);
3634 spin_lock_irqsave(&device_domain_lock, flags1);
3635 }
3636 spin_unlock_irqrestore(&device_domain_lock, flags1);
3637}
3638
 3639/* domain id for a virtual machine; it won't be set in the context entry */
3640static unsigned long vm_domid;
3641
3642static struct dmar_domain *iommu_alloc_vm_domain(void)
3643{
3644 struct dmar_domain *domain;
3645
3646 domain = alloc_domain_mem();
3647 if (!domain)
3648 return NULL;
3649
3650 domain->id = vm_domid++;
3651 domain->nid = -1;
3652 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3653 domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3654
3655 return domain;
3656}
3657
3658static int md_domain_init(struct dmar_domain *domain, int guest_width)
3659{
3660 int adjust_width;
3661
3662 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3663 spin_lock_init(&domain->iommu_lock);
3664
3665 domain_reserve_special_ranges(domain);
3666
3667 /* calculate AGAW */
3668 domain->gaw = guest_width;
3669 adjust_width = guestwidth_to_adjustwidth(guest_width);
3670 domain->agaw = width_to_agaw(adjust_width);
3671
3672 INIT_LIST_HEAD(&domain->devices);
3673
3674 domain->iommu_count = 0;
3675 domain->iommu_coherency = 0;
3676 domain->iommu_snooping = 0;
3677 domain->iommu_superpage = 0;
3678 domain->max_addr = 0;
3679 domain->nid = -1;
3680
3681 /* always allocate the top pgd */
3682 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3683 if (!domain->pgd)
3684 return -ENOMEM;
3685 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3686 return 0;
3687}
3688
3689static void iommu_free_vm_domain(struct dmar_domain *domain)
3690{
3691 unsigned long flags;
3692 struct dmar_drhd_unit *drhd;
3693 struct intel_iommu *iommu;
3694 unsigned long i;
3695 unsigned long ndomains;
3696
3697 for_each_drhd_unit(drhd) {
3698 if (drhd->ignored)
3699 continue;
3700 iommu = drhd->iommu;
3701
3702 ndomains = cap_ndoms(iommu->cap);
3703 for_each_set_bit(i, iommu->domain_ids, ndomains) {
3704 if (iommu->domains[i] == domain) {
3705 spin_lock_irqsave(&iommu->lock, flags);
3706 clear_bit(i, iommu->domain_ids);
3707 iommu->domains[i] = NULL;
3708 spin_unlock_irqrestore(&iommu->lock, flags);
3709 break;
3710 }
3711 }
3712 }
3713}
3714
3715static void vm_domain_exit(struct dmar_domain *domain)
3716{
 3717 /* Domain 0 is reserved, so don't process it */
3718 if (!domain)
3719 return;
3720
3721 vm_domain_remove_all_dev_info(domain);
3722 /* destroy iovas */
3723 put_iova_domain(&domain->iovad);
3724
3725 /* clear ptes */
3726 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3727
3728 /* free page tables */
3729 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3730
3731 iommu_free_vm_domain(domain);
3732 free_domain_mem(domain);
3733}
3734
3735static int intel_iommu_domain_init(struct iommu_domain *domain)
3736{
3737 struct dmar_domain *dmar_domain;
3738
3739 dmar_domain = iommu_alloc_vm_domain();
3740 if (!dmar_domain) {
3741 printk(KERN_ERR
3742 "intel_iommu_domain_init: dmar_domain == NULL\n");
3743 return -ENOMEM;
3744 }
3745 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3746 printk(KERN_ERR
3747 "intel_iommu_domain_init() failed\n");
3748 vm_domain_exit(dmar_domain);
3749 return -ENOMEM;
3750 }
3751 domain_update_iommu_cap(dmar_domain);
3752 domain->priv = dmar_domain;
3753
3754 return 0;
3755}
3756
3757static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3758{
3759 struct dmar_domain *dmar_domain = domain->priv;
3760
3761 domain->priv = NULL;
3762 vm_domain_exit(dmar_domain);
3763}
3764
3765static int intel_iommu_attach_device(struct iommu_domain *domain,
3766 struct device *dev)
3767{
3768 struct dmar_domain *dmar_domain = domain->priv;
3769 struct pci_dev *pdev = to_pci_dev(dev);
3770 struct intel_iommu *iommu;
3771 int addr_width;
3772
3773 /* normally pdev is not mapped */
3774 if (unlikely(domain_context_mapped(pdev))) {
3775 struct dmar_domain *old_domain;
3776
3777 old_domain = find_domain(pdev);
3778 if (old_domain) {
3779 if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
3780 dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
3781 domain_remove_one_dev_info(old_domain, pdev);
3782 else
3783 domain_remove_dev_info(old_domain);
3784 }
3785 }
3786
3787 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3788 pdev->devfn);
3789 if (!iommu)
3790 return -ENODEV;
3791
3792 /* check if this iommu agaw is sufficient for max mapped address */
3793 addr_width = agaw_to_width(iommu->agaw);
3794 if (addr_width > cap_mgaw(iommu->cap))
3795 addr_width = cap_mgaw(iommu->cap);
3796
3797 if (dmar_domain->max_addr > (1LL << addr_width)) {
3798 printk(KERN_ERR "%s: iommu width (%d) is not "
3799 "sufficient for the mapped address (%llx)\n",
3800 __func__, addr_width, dmar_domain->max_addr);
3801 return -EFAULT;
3802 }
3803 dmar_domain->gaw = addr_width;
3804
3805 /*
3806 * Knock out extra levels of page tables if necessary
3807 */
3808 while (iommu->agaw < dmar_domain->agaw) {
3809 struct dma_pte *pte;
3810
3811 pte = dmar_domain->pgd;
3812 if (dma_pte_present(pte)) {
3813 dmar_domain->pgd = (struct dma_pte *)
3814 phys_to_virt(dma_pte_addr(pte));
3815 free_pgtable_page(pte);
3816 }
3817 dmar_domain->agaw--;
3818 }
3819
3820 return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
3821}
3822
3823static void intel_iommu_detach_device(struct iommu_domain *domain,
3824 struct device *dev)
3825{
3826 struct dmar_domain *dmar_domain = domain->priv;
3827 struct pci_dev *pdev = to_pci_dev(dev);
3828
3829 domain_remove_one_dev_info(dmar_domain, pdev);
3830}
3831
3832static int intel_iommu_map(struct iommu_domain *domain,
3833 unsigned long iova, phys_addr_t hpa,
3834 int gfp_order, int iommu_prot)
3835{
3836 struct dmar_domain *dmar_domain = domain->priv;
3837 u64 max_addr;
3838 int prot = 0;
3839 size_t size;
3840 int ret;
3841
3842 if (iommu_prot & IOMMU_READ)
3843 prot |= DMA_PTE_READ;
3844 if (iommu_prot & IOMMU_WRITE)
3845 prot |= DMA_PTE_WRITE;
3846 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
3847 prot |= DMA_PTE_SNP;
3848
3849 size = PAGE_SIZE << gfp_order;
3850 max_addr = iova + size;
3851 if (dmar_domain->max_addr < max_addr) {
3852 u64 end;
3853
3854 /* check if minimum agaw is sufficient for mapped address */
3855 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3856 if (end < max_addr) {
3857 printk(KERN_ERR "%s: iommu width (%d) is not "
3858 "sufficient for the mapped address (%llx)\n",
3859 __func__, dmar_domain->gaw, max_addr);
3860 return -EFAULT;
3861 }
3862 dmar_domain->max_addr = max_addr;
3863 }
3864 /* Round up size to next multiple of PAGE_SIZE, if it and
3865 the low bits of hpa would take us onto the next page */
3866 size = aligned_nrpages(hpa, size);
3867 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3868 hpa >> VTD_PAGE_SHIFT, size, prot);
3869 return ret;
3870}
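/*
 * Worked example (illustrative): a caller mapping with gfp_order = 2 requests
 * size = PAGE_SIZE << 2 = 16KiB with 4KiB pages. aligned_nrpages() converts
 * that to a count of VT-d pages (including any offset carried in the low bits
 * of hpa), and domain_pfn_mapping() installs that many PTEs with the
 * requested DMA_PTE_READ/WRITE/SNP protection bits.
 */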
3871
3872static int intel_iommu_unmap(struct iommu_domain *domain,
3873 unsigned long iova, int gfp_order)
3874{
3875 struct dmar_domain *dmar_domain = domain->priv;
3876 size_t size = PAGE_SIZE << gfp_order;
3877 int order;
3878
3879 order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
3880 (iova + size - 1) >> VTD_PAGE_SHIFT);
3881
3882 if (dmar_domain->max_addr == iova + size)
3883 dmar_domain->max_addr = iova;
3884
3885 return order;
3886}
3887
3888static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3889 unsigned long iova)
3890{
3891 struct dmar_domain *dmar_domain = domain->priv;
3892 struct dma_pte *pte;
3893 u64 phys = 0;
3894
3895 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
3896 if (pte)
3897 phys = dma_pte_addr(pte);
3898
3899 return phys;
3900}
3901
3902static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
3903 unsigned long cap)
3904{
3905 struct dmar_domain *dmar_domain = domain->priv;
3906
3907 if (cap == IOMMU_CAP_CACHE_COHERENCY)
3908 return dmar_domain->iommu_snooping;
3909 if (cap == IOMMU_CAP_INTR_REMAP)
3910 return intr_remapping_enabled;
3911
3912 return 0;
3913}
3914
3915static struct iommu_ops intel_iommu_ops = {
3916 .domain_init = intel_iommu_domain_init,
3917 .domain_destroy = intel_iommu_domain_destroy,
3918 .attach_dev = intel_iommu_attach_device,
3919 .detach_dev = intel_iommu_detach_device,
3920 .map = intel_iommu_map,
3921 .unmap = intel_iommu_unmap,
3922 .iova_to_phys = intel_iommu_iova_to_phys,
3923 .domain_has_cap = intel_iommu_domain_has_cap,
3924};
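/*
 * Illustrative consumer-side usage (hypothetical domain and device; the
 * generic IOMMU API of this same kernel generation is assumed): device
 * assignment code such as KVM reaches the callbacks above through the iommu
 * layer registered by register_iommu() in intel_iommu_init():
 *
 *	struct iommu_domain *dom = iommu_domain_alloc();
 *	if (!dom)
 *		return -ENOMEM;
 *	ret = iommu_attach_device(dom, &pdev->dev);
 *	if (!ret)
 *		ret = iommu_map(dom, iova, page_to_phys(page), 0,
 *				IOMMU_READ | IOMMU_WRITE);
 */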
3925
3926static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3927{
3928 /*
3929 * Mobile 4 Series Chipset neglects to set RWBF capability,
3930 * but needs it:
3931 */
3932 printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3933 rwbf_quirk = 1;
3934
3935 /* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
3936 if (dev->revision == 0x07) {
3937 printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
3938 dmar_map_gfx = 0;
3939 }
3940}
3941
3942DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
3943
3944#define GGC 0x52
3945#define GGC_MEMORY_SIZE_MASK (0xf << 8)
3946#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
3947#define GGC_MEMORY_SIZE_1M (0x1 << 8)
3948#define GGC_MEMORY_SIZE_2M (0x3 << 8)
3949#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
3950#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
3951#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
3952#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
3953
3954static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
3955{
3956 unsigned short ggc;
3957
3958 if (pci_read_config_word(dev, GGC, &ggc))
3959 return;
3960
3961 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
3962 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
3963 dmar_map_gfx = 0;
3964 } else if (dmar_map_gfx) {
3965 /* we have to ensure the gfx device is idle before we flush */
3966 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
3967 intel_iommu_strict = 1;
3968 }
3969}
3970DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
3971DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
3972DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
3973DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
3974
3975/* On Tylersburg chipsets, some BIOSes have been known to enable the
3976 ISOCH DMAR unit for the Azalia sound device, but not give it any
3977 TLB entries, which causes it to deadlock. Check for that. We do
3978 this in a function called from init_dmars(), instead of in a PCI
3979 quirk, because we don't want to print the obnoxious "BIOS broken"
3980 message if VT-d is actually disabled.
3981*/
3982static void __init check_tylersburg_isoch(void)
3983{
3984 struct pci_dev *pdev;
3985 uint32_t vtisochctrl;
3986
3987 /* If there's no Azalia in the system anyway, forget it. */
3988 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
3989 if (!pdev)
3990 return;
3991 pci_dev_put(pdev);
3992
3993 /* System Management Registers. Might be hidden, in which case
3994 we can't do the sanity check. But that's OK, because the
3995 known-broken BIOSes _don't_ actually hide it, so far. */
3996 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
3997 if (!pdev)
3998 return;
3999
4000 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4001 pci_dev_put(pdev);
4002 return;
4003 }
4004
4005 pci_dev_put(pdev);
4006
4007 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4008 if (vtisochctrl & 1)
4009 return;
4010
4011 /* Drop all bits other than the number of TLB entries */
4012 vtisochctrl &= 0x1c;
4013
4014 /* If we have the recommended number of TLB entries (16), fine. */
4015 if (vtisochctrl == 0x10)
4016 return;
4017
4018 /* Zero TLB entries? You get to ride the short bus to school. */
4019 if (!vtisochctrl) {
4020 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4021 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4022 dmi_get_system_info(DMI_BIOS_VENDOR),
4023 dmi_get_system_info(DMI_BIOS_VERSION),
4024 dmi_get_system_info(DMI_PRODUCT_VERSION));
4025 iommu_identity_mapping |= IDENTMAP_AZALIA;
4026 return;
4027 }
4028
4029 printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4030 vtisochctrl);
4031}
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/init.h>
17#include <linux/bitmap.h>
18#include <linux/debugfs.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/irq.h>
22#include <linux/interrupt.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25#include <linux/dmar.h>
26#include <linux/dma-mapping.h>
27#include <linux/mempool.h>
28#include <linux/memory.h>
29#include <linux/cpu.h>
30#include <linux/timer.h>
31#include <linux/io.h>
32#include <linux/iova.h>
33#include <linux/iommu.h>
34#include <linux/intel-iommu.h>
35#include <linux/syscore_ops.h>
36#include <linux/tboot.h>
37#include <linux/dmi.h>
38#include <linux/pci-ats.h>
39#include <linux/memblock.h>
40#include <linux/dma-contiguous.h>
41#include <linux/dma-direct.h>
42#include <linux/crash_dump.h>
43#include <linux/numa.h>
44#include <linux/swiotlb.h>
45#include <asm/irq_remapping.h>
46#include <asm/cacheflush.h>
47#include <asm/iommu.h>
48#include <trace/events/intel_iommu.h>
49
50#include "irq_remapping.h"
51#include "intel-pasid.h"
52
53#define ROOT_SIZE VTD_PAGE_SIZE
54#define CONTEXT_SIZE VTD_PAGE_SIZE
55
56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61#define IOAPIC_RANGE_START (0xfee00000)
62#define IOAPIC_RANGE_END (0xfeefffff)
63#define IOVA_START_ADDR (0x1000)
64
65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67#define MAX_AGAW_WIDTH 64
68#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79/* IO virtual address start page frame number */
80#define IOVA_START_PFN (1)
81
82#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84/* page table handling */
85#define LEVEL_STRIDE (9)
86#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
88/*
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
93 *
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
97 *
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
100 *
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
103 */
104#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
105
106static inline int agaw_to_level(int agaw)
107{
108 return agaw + 2;
109}
110
111static inline int agaw_to_width(int agaw)
112{
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114}
115
116static inline int width_to_agaw(int width)
117{
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119}
120
121static inline unsigned int level_to_offset_bits(int level)
122{
123 return (level - 1) * LEVEL_STRIDE;
124}
125
126static inline int pfn_level_offset(unsigned long pfn, int level)
127{
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129}
130
131static inline unsigned long level_mask(int level)
132{
133 return -1UL << level_to_offset_bits(level);
134}
135
136static inline unsigned long level_size(int level)
137{
138 return 1UL << level_to_offset_bits(level);
139}
140
141static inline unsigned long align_to_level(unsigned long pfn, int level)
142{
143 return (pfn + level_size(level) - 1) & level_mask(level);
144}
145
146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147{
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149}
150
151/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154{
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156}
157
158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159{
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164 return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168 return page_to_dma_pfn(virt_to_page(p));
169}
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
177/*
178 * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
179 * (used when kernel is launched w/ TXT)
180 */
181static int force_on = 0;
182int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209}
210
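/*
 * The helpers below manipulate individual fields of the 128-bit context
 * entry (its 'lo' and 'hi' 64-bit halves); see the VT-d specification
 * for the authoritative bit layout.
 */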
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213 context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218 return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223 context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228 return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233 return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245 context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250 context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255{
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262{
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269{
270 context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275{
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
281 return((c->hi >> 8) & 0xffff);
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286 context->lo = 0;
287 context->hi = 0;
288}
289
290/*
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
295 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299/* si_domain contains multiple devices */
300#define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301
302/*
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
307 */
308#define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
309
310#define for_each_domain_iommu(idx, domain) \
311 for (idx = 0; idx < g_num_of_iommus; idx++) \
312 if (domain->iommu_refcnt[idx])
313
314struct dmar_rmrr_unit {
315 struct list_head list; /* list of rmrr units */
316 struct acpi_dmar_header *hdr; /* ACPI header */
317 u64 base_address; /* reserved base address*/
318 u64 end_address; /* reserved end address */
319 struct dmar_dev_scope *devices; /* target devices */
320 int devices_cnt; /* target device count */
321};
322
323struct dmar_atsr_unit {
324 struct list_head list; /* list of ATSR units */
325 struct acpi_dmar_header *hdr; /* ACPI header */
326 struct dmar_dev_scope *devices; /* target devices */
327 int devices_cnt; /* target device count */
328 u8 include_all:1; /* include all ports */
329};
330
331static LIST_HEAD(dmar_atsr_units);
332static LIST_HEAD(dmar_rmrr_units);
333
334#define for_each_rmrr_units(rmrr) \
335 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337/* number of iommus in the system; used to size and index g_iommus */
338static int g_num_of_iommus;
339
340static void domain_exit(struct dmar_domain *domain);
341static void domain_remove_dev_info(struct dmar_domain *domain);
342static void dmar_remove_one_dev_info(struct device *dev);
343static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344static void domain_context_clear(struct intel_iommu *iommu,
345 struct device *dev);
346static int domain_detach_iommu(struct dmar_domain *domain,
347 struct intel_iommu *iommu);
348static bool device_is_rmrr_locked(struct device *dev);
349static int intel_iommu_attach_device(struct iommu_domain *domain,
350 struct device *dev);
351static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352 dma_addr_t iova);
353
354#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355int dmar_disabled = 0;
356#else
357int dmar_disabled = 1;
358#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360int intel_iommu_sm;
361int intel_iommu_enabled = 0;
362EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363
364static int dmar_map_gfx = 1;
365static int dmar_forcedac;
366static int intel_iommu_strict;
367static int intel_iommu_superpage = 1;
368static int iommu_identity_mapping;
369static int intel_no_bounce;
370
371#define IDENTMAP_ALL 1
372#define IDENTMAP_GFX 2
373#define IDENTMAP_AZALIA 4
374
375int intel_iommu_gfx_mapped;
376EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
377
378#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380static DEFINE_SPINLOCK(device_domain_lock);
381static LIST_HEAD(device_domain_list);
382
383#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
384 to_pci_dev(d)->untrusted)
385
386/*
387 * Iterate over elements in device_domain_list and call the specified
388 * callback @fn against each element.
389 */
390int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391 void *data), void *data)
392{
393 int ret = 0;
394 unsigned long flags;
395 struct device_domain_info *info;
396
397 spin_lock_irqsave(&device_domain_lock, flags);
398 list_for_each_entry(info, &device_domain_list, global) {
399 ret = fn(info, data);
400 if (ret) {
401 spin_unlock_irqrestore(&device_domain_lock, flags);
402 return ret;
403 }
404 }
405 spin_unlock_irqrestore(&device_domain_lock, flags);
406
407 return 0;
408}
409
410const struct iommu_ops intel_iommu_ops;
411
412static bool translation_pre_enabled(struct intel_iommu *iommu)
413{
414 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415}
416
417static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418{
419 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420}
421
422static void init_translation_status(struct intel_iommu *iommu)
423{
424 u32 gsts;
425
426 gsts = readl(iommu->reg + DMAR_GSTS_REG);
427 if (gsts & DMA_GSTS_TES)
428 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429}
430
431/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
432static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433{
434 return container_of(dom, struct dmar_domain, domain);
435}
436
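/* Parse the comma-separated options given on the intel_iommu= command line. */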
437static int __init intel_iommu_setup(char *str)
438{
439 if (!str)
440 return -EINVAL;
441 while (*str) {
442 if (!strncmp(str, "on", 2)) {
443 dmar_disabled = 0;
444 pr_info("IOMMU enabled\n");
445 } else if (!strncmp(str, "off", 3)) {
446 dmar_disabled = 1;
447 no_platform_optin = 1;
448 pr_info("IOMMU disabled\n");
449 } else if (!strncmp(str, "igfx_off", 8)) {
450 dmar_map_gfx = 0;
451 pr_info("Disable GFX device mapping\n");
452 } else if (!strncmp(str, "forcedac", 8)) {
453 pr_info("Forcing DAC for PCI devices\n");
454 dmar_forcedac = 1;
455 } else if (!strncmp(str, "strict", 6)) {
456 pr_info("Disable batched IOTLB flush\n");
457 intel_iommu_strict = 1;
458 } else if (!strncmp(str, "sp_off", 6)) {
459 pr_info("Disable supported super page\n");
460 intel_iommu_superpage = 0;
461 } else if (!strncmp(str, "sm_on", 5)) {
462 pr_info("Intel-IOMMU: scalable mode supported\n");
463 intel_iommu_sm = 1;
464 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 printk(KERN_INFO
466 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 } else if (!strncmp(str, "nobounce", 8)) {
469 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470 intel_no_bounce = 1;
471 }
472
473 str += strcspn(str, ",");
474 while (*str == ',')
475 str++;
476 }
477 return 0;
478}
479__setup("intel_iommu=", intel_iommu_setup);
480
481static struct kmem_cache *iommu_domain_cache;
482static struct kmem_cache *iommu_devinfo_cache;
483
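/*
 * Per-iommu domains are kept in a two-level table: the top level is
 * indexed by the upper 8 bits of the domain id, and each second-level
 * page (allocated on demand) holds 256 dmar_domain pointers indexed by
 * the low 8 bits.
 */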
484static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
485{
486 struct dmar_domain **domains;
487 int idx = did >> 8;
488
489 domains = iommu->domains[idx];
490 if (!domains)
491 return NULL;
492
493 return domains[did & 0xff];
494}
495
496static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497 struct dmar_domain *domain)
498{
499 struct dmar_domain **domains;
500 int idx = did >> 8;
501
502 if (!iommu->domains[idx]) {
503 size_t size = 256 * sizeof(struct dmar_domain *);
504 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
505 }
506
507 domains = iommu->domains[idx];
508 if (WARN_ON(!domains))
509 return;
510 else
511 domains[did & 0xff] = domain;
512}
513
514void *alloc_pgtable_page(int node)
515{
516 struct page *page;
517 void *vaddr = NULL;
518
519 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520 if (page)
521 vaddr = page_address(page);
522 return vaddr;
523}
524
525void free_pgtable_page(void *vaddr)
526{
527 free_page((unsigned long)vaddr);
528}
529
530static inline void *alloc_domain_mem(void)
531{
532 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
533}
534
535static void free_domain_mem(void *vaddr)
536{
537 kmem_cache_free(iommu_domain_cache, vaddr);
538}
539
540static inline void * alloc_devinfo_mem(void)
541{
542 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
543}
544
545static inline void free_devinfo_mem(void *vaddr)
546{
547 kmem_cache_free(iommu_devinfo_cache, vaddr);
548}
549
550static inline int domain_type_is_si(struct dmar_domain *domain)
551{
552 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553}
554
555static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
557{
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561}
562
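/*
 * Find the largest supported AGAW that does not exceed the AGAW needed
 * for @max_gaw, scanning the SAGAW capability bits downwards. Returns
 * -1 if none of the required widths is supported.
 */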
563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564{
565 unsigned long sagaw;
566 int agaw = -1;
567
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
573 }
574
575 return agaw;
576}
577
578/*
579 * Calculate max SAGAW for each iommu.
580 */
581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582{
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584}
585
586/*
587 * Calculate the agaw for each iommu.
588 * "SAGAW" may differ across iommus: use a default agaw, and fall back
589 * to a smaller supported agaw for iommus that don't support the default.
590 */
591int iommu_calculate_agaw(struct intel_iommu *iommu)
592{
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594}
595
596/* This function only returns a single iommu in a domain */
597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598{
599 int iommu_id;
600
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
604
605 for_each_domain_iommu(iommu_id, domain)
606 break;
607
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
610
611 return g_iommus[iommu_id];
612}
613
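/*
 * Recompute domain->iommu_coherency: page-table walks are only treated
 * as coherent if every iommu attached to the domain (or, when none is
 * attached yet, every active iommu) advertises coherency.
 */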
614static void domain_update_iommu_coherency(struct dmar_domain *domain)
615{
616 struct dmar_drhd_unit *drhd;
617 struct intel_iommu *iommu;
618 bool found = false;
619 int i;
620
621 domain->iommu_coherency = 1;
622
623 for_each_domain_iommu(i, domain) {
624 found = true;
625 if (!ecap_coherent(g_iommus[i]->ecap)) {
626 domain->iommu_coherency = 0;
627 break;
628 }
629 }
630 if (found)
631 return;
632
633 /* No hardware attached; use lowest common denominator */
634 rcu_read_lock();
635 for_each_active_iommu(iommu, drhd) {
636 if (!ecap_coherent(iommu->ecap)) {
637 domain->iommu_coherency = 0;
638 break;
639 }
640 }
641 rcu_read_unlock();
642}
643
644static int domain_update_iommu_snooping(struct intel_iommu *skip)
645{
646 struct dmar_drhd_unit *drhd;
647 struct intel_iommu *iommu;
648 int ret = 1;
649
650 rcu_read_lock();
651 for_each_active_iommu(iommu, drhd) {
652 if (iommu != skip) {
653 if (!ecap_sc_support(iommu->ecap)) {
654 ret = 0;
655 break;
656 }
657 }
658 }
659 rcu_read_unlock();
660
661 return ret;
662}
663
664static int domain_update_iommu_superpage(struct intel_iommu *skip)
665{
666 struct dmar_drhd_unit *drhd;
667 struct intel_iommu *iommu;
668 int mask = 0xf;
669
670 if (!intel_iommu_superpage) {
671 return 0;
672 }
673
674 /* set iommu_superpage to the smallest common denominator */
675 rcu_read_lock();
676 for_each_active_iommu(iommu, drhd) {
677 if (iommu != skip) {
678 mask &= cap_super_page_val(iommu->cap);
679 if (!mask)
680 break;
681 }
682 }
683 rcu_read_unlock();
684
685 return fls(mask);
686}
687
688/* Some capabilities may be different across iommus */
689static void domain_update_iommu_cap(struct dmar_domain *domain)
690{
691 domain_update_iommu_coherency(domain);
692 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
694}
695
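/*
 * Return the context entry for (@bus, @devfn), allocating the context
 * table page if @alloc is set. In scalable mode a root entry covers two
 * context tables (root->lo for devfn 0x00-0x7f, root->hi for 0x80-0xff)
 * and each device occupies two consecutive 128-bit entries.
 */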
696struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697 u8 devfn, int alloc)
698{
699 struct root_entry *root = &iommu->root_entry[bus];
700 struct context_entry *context;
701 u64 *entry;
702
703 entry = &root->lo;
704 if (sm_supported(iommu)) {
705 if (devfn >= 0x80) {
706 devfn -= 0x80;
707 entry = &root->hi;
708 }
709 devfn *= 2;
710 }
711 if (*entry & 1)
712 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713 else {
714 unsigned long phy_addr;
715 if (!alloc)
716 return NULL;
717
718 context = alloc_pgtable_page(iommu->node);
719 if (!context)
720 return NULL;
721
722 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723 phy_addr = virt_to_phys((void *)context);
724 *entry = phy_addr | 1;
725 __iommu_flush_cache(iommu, entry, sizeof(*entry));
726 }
727 return &context[devfn];
728}
729
730static int iommu_dummy(struct device *dev)
731{
732 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
733}
734
735/**
736 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737 * sub-hierarchy of a candidate PCI-PCI bridge
738 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739 * @bridge: the candidate PCI-PCI bridge
740 *
741 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
742 */
743static bool
744is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
745{
746 struct pci_dev *pdev, *pbridge;
747
748 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749 return false;
750
751 pdev = to_pci_dev(dev);
752 pbridge = to_pci_dev(bridge);
753
754 if (pbridge->subordinate &&
755 pbridge->subordinate->number <= pdev->bus->number &&
756 pbridge->subordinate->busn_res.end >= pdev->bus->number)
757 return true;
758
759 return false;
760}
761
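/*
 * Find the DMAR unit (iommu) that covers @dev and report the bus/devfn
 * to use when programming its context entry. PCI VFs are matched via
 * their PF, ACPI devices via their companion device.
 */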
762static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
763{
764 struct dmar_drhd_unit *drhd = NULL;
765 struct intel_iommu *iommu;
766 struct device *tmp;
767 struct pci_dev *pdev = NULL;
768 u16 segment = 0;
769 int i;
770
771 if (iommu_dummy(dev))
772 return NULL;
773
774 if (dev_is_pci(dev)) {
775 struct pci_dev *pf_pdev;
776
777 pdev = to_pci_dev(dev);
778
779#ifdef CONFIG_X86
780 /* VMD child devices currently cannot be handled individually */
781 if (is_vmd(pdev->bus))
782 return NULL;
783#endif
784
785 /* VFs aren't listed in scope tables; we need to look up
786 * the PF instead to find the IOMMU. */
787 pf_pdev = pci_physfn(pdev);
788 dev = &pf_pdev->dev;
789 segment = pci_domain_nr(pdev->bus);
790 } else if (has_acpi_companion(dev))
791 dev = &ACPI_COMPANION(dev)->dev;
792
793 rcu_read_lock();
794 for_each_active_iommu(iommu, drhd) {
795 if (pdev && segment != drhd->segment)
796 continue;
797
798 for_each_active_dev_scope(drhd->devices,
799 drhd->devices_cnt, i, tmp) {
800 if (tmp == dev) {
801 /* For a VF use its original BDF# not that of the PF
802 * which we used for the IOMMU lookup. Strictly speaking
803 * we could do this for all PCI devices; we only need to
804 * get the BDF# from the scope table for ACPI matches. */
805 if (pdev && pdev->is_virtfn)
806 goto got_pdev;
807
808 *bus = drhd->devices[i].bus;
809 *devfn = drhd->devices[i].devfn;
810 goto out;
811 }
812
813 if (is_downstream_to_pci_bridge(dev, tmp))
814 goto got_pdev;
815 }
816
817 if (pdev && drhd->include_all) {
818 got_pdev:
819 *bus = pdev->bus->number;
820 *devfn = pdev->devfn;
821 goto out;
822 }
823 }
824 iommu = NULL;
825 out:
826 rcu_read_unlock();
827
828 return iommu;
829}
830
831static void domain_flush_cache(struct dmar_domain *domain,
832 void *addr, int size)
833{
834 if (!domain->iommu_coherency)
835 clflush_cache_range(addr, size);
836}
837
838static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
839{
840 struct context_entry *context;
841 int ret = 0;
842 unsigned long flags;
843
844 spin_lock_irqsave(&iommu->lock, flags);
845 context = iommu_context_addr(iommu, bus, devfn, 0);
846 if (context)
847 ret = context_present(context);
848 spin_unlock_irqrestore(&iommu->lock, flags);
849 return ret;
850}
851
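/*
 * Free every context-table page referenced from the root table (both
 * halves when scalable mode is supported), then the root table itself.
 */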
852static void free_context_table(struct intel_iommu *iommu)
853{
854 int i;
855 unsigned long flags;
856 struct context_entry *context;
857
858 spin_lock_irqsave(&iommu->lock, flags);
859 if (!iommu->root_entry) {
860 goto out;
861 }
862 for (i = 0; i < ROOT_ENTRY_NR; i++) {
863 context = iommu_context_addr(iommu, i, 0, 0);
864 if (context)
865 free_pgtable_page(context);
866
867 if (!sm_supported(iommu))
868 continue;
869
870 context = iommu_context_addr(iommu, i, 0x80, 0);
871 if (context)
872 free_pgtable_page(context);
873
874 }
875 free_pgtable_page(iommu->root_entry);
876 iommu->root_entry = NULL;
877out:
878 spin_unlock_irqrestore(&iommu->lock, flags);
879}
880
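/*
 * Walk (and build, if necessary) the page table down to *target_level
 * for @pfn and return the PTE found there. With *target_level == 0 the
 * walk stops at the first superpage or non-present entry, and the level
 * actually reached is passed back through *target_level.
 */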
881static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
882 unsigned long pfn, int *target_level)
883{
884 struct dma_pte *parent, *pte;
885 int level = agaw_to_level(domain->agaw);
886 int offset;
887
888 BUG_ON(!domain->pgd);
889
890 if (!domain_pfn_supported(domain, pfn))
891 /* Address beyond IOMMU's addressing capabilities. */
892 return NULL;
893
894 parent = domain->pgd;
895
896 while (1) {
897 void *tmp_page;
898
899 offset = pfn_level_offset(pfn, level);
900 pte = &parent[offset];
901 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
902 break;
903 if (level == *target_level)
904 break;
905
906 if (!dma_pte_present(pte)) {
907 uint64_t pteval;
908
909 tmp_page = alloc_pgtable_page(domain->nid);
910
911 if (!tmp_page)
912 return NULL;
913
914 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
915 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
916 if (cmpxchg64(&pte->val, 0ULL, pteval))
917 /* Someone else set it while we were thinking; use theirs. */
918 free_pgtable_page(tmp_page);
919 else
920 domain_flush_cache(domain, pte, sizeof(*pte));
921 }
922 if (level == 1)
923 break;
924
925 parent = phys_to_virt(dma_pte_addr(pte));
926 level--;
927 }
928
929 if (!*target_level)
930 *target_level = level;
931
932 return pte;
933}
934
935/* return address's pte at specific level */
936static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
937 unsigned long pfn,
938 int level, int *large_page)
939{
940 struct dma_pte *parent, *pte;
941 int total = agaw_to_level(domain->agaw);
942 int offset;
943
944 parent = domain->pgd;
945 while (level <= total) {
946 offset = pfn_level_offset(pfn, total);
947 pte = &parent[offset];
948 if (level == total)
949 return pte;
950
951 if (!dma_pte_present(pte)) {
952 *large_page = total;
953 break;
954 }
955
956 if (dma_pte_superpage(pte)) {
957 *large_page = total;
958 return pte;
959 }
960
961 parent = phys_to_virt(dma_pte_addr(pte));
962 total--;
963 }
964 return NULL;
965}
966
967/* clear last level pte, a tlb flush should be followed */
968static void dma_pte_clear_range(struct dmar_domain *domain,
969 unsigned long start_pfn,
970 unsigned long last_pfn)
971{
972 unsigned int large_page;
973 struct dma_pte *first_pte, *pte;
974
975 BUG_ON(!domain_pfn_supported(domain, start_pfn));
976 BUG_ON(!domain_pfn_supported(domain, last_pfn));
977 BUG_ON(start_pfn > last_pfn);
978
979 /* we don't need lock here; nobody else touches the iova range */
980 do {
981 large_page = 1;
982 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
983 if (!pte) {
984 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
985 continue;
986 }
987 do {
988 dma_clear_pte(pte);
989 start_pfn += lvl_to_nr_pages(large_page);
990 pte++;
991 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
992
993 domain_flush_cache(domain, first_pte,
994 (void *)pte - (void *)first_pte);
995
996 } while (start_pfn && start_pfn <= last_pfn);
997}
998
999static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000 int retain_level, struct dma_pte *pte,
1001 unsigned long pfn, unsigned long start_pfn,
1002 unsigned long last_pfn)
1003{
1004 pfn = max(start_pfn, pfn);
1005 pte = &pte[pfn_level_offset(pfn, level)];
1006
1007 do {
1008 unsigned long level_pfn;
1009 struct dma_pte *level_pte;
1010
1011 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1012 goto next;
1013
1014 level_pfn = pfn & level_mask(level);
1015 level_pte = phys_to_virt(dma_pte_addr(pte));
1016
1017 if (level > 2) {
1018 dma_pte_free_level(domain, level - 1, retain_level,
1019 level_pte, level_pfn, start_pfn,
1020 last_pfn);
1021 }
1022
1023 /*
1024 * Free the page table if we're below the level we want to
1025 * retain and the range covers the entire table.
1026 */
1027 if (level < retain_level && !(start_pfn > level_pfn ||
1028 last_pfn < level_pfn + level_size(level) - 1)) {
1029 dma_clear_pte(pte);
1030 domain_flush_cache(domain, pte, sizeof(*pte));
1031 free_pgtable_page(level_pte);
1032 }
1033next:
1034 pfn += level_size(level);
1035 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1036}
1037
1038/*
1039 * clear last level (leaf) ptes and free page table pages below the
1040 * level we wish to keep intact.
1041 */
1042static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043 unsigned long start_pfn,
1044 unsigned long last_pfn,
1045 int retain_level)
1046{
1047 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049 BUG_ON(start_pfn > last_pfn);
1050
1051 dma_pte_clear_range(domain, start_pfn, last_pfn);
1052
1053 /* We don't need lock here; nobody else touches the iova range */
1054 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055 domain->pgd, 0, start_pfn, last_pfn);
1056
1057 /* free pgd */
1058 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059 free_pgtable_page(domain->pgd);
1060 domain->pgd = NULL;
1061 }
1062}
1063
1064/* When a page at a given level is being unlinked from its parent, we don't
1065 need to *modify* it at all. All we need to do is make a list of all the
1066 pages which can be freed just as soon as we've flushed the IOTLB and we
1067 know the hardware page-walk will no longer touch them.
1068 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1069 be freed. */
1070static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071 int level, struct dma_pte *pte,
1072 struct page *freelist)
1073{
1074 struct page *pg;
1075
1076 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077 pg->freelist = freelist;
1078 freelist = pg;
1079
1080 if (level == 1)
1081 return freelist;
1082
1083 pte = page_address(pg);
1084 do {
1085 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086 freelist = dma_pte_list_pagetables(domain, level - 1,
1087 pte, freelist);
1088 pte++;
1089 } while (!first_pte_in_page(pte));
1090
1091 return freelist;
1092}
1093
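/*
 * Recursively clear the PTEs covering [@start_pfn, @last_pfn] at @level.
 * Page-table pages that become entirely unused are chained onto
 * @freelist so they can be freed once the IOTLB has been flushed.
 */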
1094static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095 struct dma_pte *pte, unsigned long pfn,
1096 unsigned long start_pfn,
1097 unsigned long last_pfn,
1098 struct page *freelist)
1099{
1100 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1101
1102 pfn = max(start_pfn, pfn);
1103 pte = &pte[pfn_level_offset(pfn, level)];
1104
1105 do {
1106 unsigned long level_pfn;
1107
1108 if (!dma_pte_present(pte))
1109 goto next;
1110
1111 level_pfn = pfn & level_mask(level);
1112
1113 /* If range covers entire pagetable, free it */
1114 if (start_pfn <= level_pfn &&
1115 last_pfn >= level_pfn + level_size(level) - 1) {
1116 /* These subordinate page tables are going away entirely. Don't
1117 bother to clear them; we're just going to *free* them. */
1118 if (level > 1 && !dma_pte_superpage(pte))
1119 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1120
1121 dma_clear_pte(pte);
1122 if (!first_pte)
1123 first_pte = pte;
1124 last_pte = pte;
1125 } else if (level > 1) {
1126 /* Recurse down into a level that isn't *entirely* obsolete */
1127 freelist = dma_pte_clear_level(domain, level - 1,
1128 phys_to_virt(dma_pte_addr(pte)),
1129 level_pfn, start_pfn, last_pfn,
1130 freelist);
1131 }
1132next:
1133 pfn += level_size(level);
1134 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1135
1136 if (first_pte)
1137 domain_flush_cache(domain, first_pte,
1138 (void *)++last_pte - (void *)first_pte);
1139
1140 return freelist;
1141}
1142
1143/* We can't just free the pages because the IOMMU may still be walking
1144 the page tables, and may have cached the intermediate levels. The
1145 pages can only be freed after the IOTLB flush has been done. */
1146static struct page *domain_unmap(struct dmar_domain *domain,
1147 unsigned long start_pfn,
1148 unsigned long last_pfn)
1149{
1150 struct page *freelist;
1151
1152 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154 BUG_ON(start_pfn > last_pfn);
1155
1156 /* we don't need lock here; nobody else touches the iova range */
1157 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158 domain->pgd, 0, start_pfn, last_pfn, NULL);
1159
1160 /* free pgd */
1161 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162 struct page *pgd_page = virt_to_page(domain->pgd);
1163 pgd_page->freelist = freelist;
1164 freelist = pgd_page;
1165
1166 domain->pgd = NULL;
1167 }
1168
1169 return freelist;
1170}
1171
1172static void dma_free_pagelist(struct page *freelist)
1173{
1174 struct page *pg;
1175
1176 while ((pg = freelist)) {
1177 freelist = pg->freelist;
1178 free_pgtable_page(page_address(pg));
1179 }
1180}
1181
1182static void iova_entry_free(unsigned long data)
1183{
1184 struct page *freelist = (struct page *)data;
1185
1186 dma_free_pagelist(freelist);
1187}
1188
1189/* iommu handling */
1190static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191{
1192 struct root_entry *root;
1193 unsigned long flags;
1194
1195 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1196 if (!root) {
1197 pr_err("Allocating root entry for %s failed\n",
1198 iommu->name);
1199 return -ENOMEM;
1200 }
1201
1202 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1203
1204 spin_lock_irqsave(&iommu->lock, flags);
1205 iommu->root_entry = root;
1206 spin_unlock_irqrestore(&iommu->lock, flags);
1207
1208 return 0;
1209}
1210
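/*
 * Program the root table address (tagged for scalable mode if supported)
 * into DMAR_RTADDR_REG and issue a Set Root Table Pointer command,
 * waiting for the hardware to acknowledge it.
 */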
1211static void iommu_set_root_entry(struct intel_iommu *iommu)
1212{
1213 u64 addr;
1214 u32 sts;
1215 unsigned long flag;
1216
1217 addr = virt_to_phys(iommu->root_entry);
1218 if (sm_supported(iommu))
1219 addr |= DMA_RTADDR_SMT;
1220
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1223
1224 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1225
1226 /* Make sure the hardware completes it */
1227 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228 readl, (sts & DMA_GSTS_RTPS), sts);
1229
1230 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231}
1232
1233void iommu_flush_write_buffer(struct intel_iommu *iommu)
1234{
1235 u32 val;
1236 unsigned long flag;
1237
1238 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1239 return;
1240
1241 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1243
1244 /* Make sure the hardware completes it */
1245 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246 readl, (!(val & DMA_GSTS_WBFS)), val);
1247
1248 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1249}
1250
1251/* Invalidate entries in the context-cache at the requested granularity */
1252static void __iommu_flush_context(struct intel_iommu *iommu,
1253 u16 did, u16 source_id, u8 function_mask,
1254 u64 type)
1255{
1256 u64 val = 0;
1257 unsigned long flag;
1258
1259 switch (type) {
1260 case DMA_CCMD_GLOBAL_INVL:
1261 val = DMA_CCMD_GLOBAL_INVL;
1262 break;
1263 case DMA_CCMD_DOMAIN_INVL:
1264 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1265 break;
1266 case DMA_CCMD_DEVICE_INVL:
1267 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1269 break;
1270 default:
1271 BUG();
1272 }
1273 val |= DMA_CCMD_ICC;
1274
1275 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1277
1278 /* Make sure the hardware completes it */
1279 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1281
1282 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283}
1284
1285/* Invalidate IOTLB entries at the requested granularity */
1286static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287 u64 addr, unsigned int size_order, u64 type)
1288{
1289 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290 u64 val = 0, val_iva = 0;
1291 unsigned long flag;
1292
1293 switch (type) {
1294 case DMA_TLB_GLOBAL_FLUSH:
1295 /* a global flush doesn't need to set IVA_REG */
1296 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1297 break;
1298 case DMA_TLB_DSI_FLUSH:
1299 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300 break;
1301 case DMA_TLB_PSI_FLUSH:
1302 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 /* IH bit is passed in as part of address */
1304 val_iva = size_order | addr;
1305 break;
1306 default:
1307 BUG();
1308 }
1309 /* Note: set drain read/write */
1310#if 0
1311 /*
1312 * This is probably here for extra safety. It looks like we can
1313 * ignore it without any impact.
1314 */
1315 if (cap_read_drain(iommu->cap))
1316 val |= DMA_TLB_READ_DRAIN;
1317#endif
1318 if (cap_write_drain(iommu->cap))
1319 val |= DMA_TLB_WRITE_DRAIN;
1320
1321 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322 /* Note: Only uses first TLB reg currently */
1323 if (val_iva)
1324 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1326
1327 /* Make sure the hardware completes it */
1328 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1330
1331 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332
1333 /* check IOTLB invalidation granularity */
1334 if (DMA_TLB_IAIG(val) == 0)
1335 pr_err("Flush IOTLB failed\n");
1336 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337 pr_debug("TLB flush request %Lx, actual %Lx\n",
1338 (unsigned long long)DMA_TLB_IIRG(type),
1339 (unsigned long long)DMA_TLB_IAIG(val));
1340}
1341
1342static struct device_domain_info *
1343iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1344 u8 bus, u8 devfn)
1345{
1346 struct device_domain_info *info;
1347
1348 assert_spin_locked(&device_domain_lock);
1349
1350 if (!iommu->qi)
1351 return NULL;
1352
1353 list_for_each_entry(info, &domain->devices, link)
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 if (info->ats_supported && info->dev)
1357 return info;
1358 break;
1359 }
1360
1361 return NULL;
1362}
1363
1364static void domain_update_iotlb(struct dmar_domain *domain)
1365{
1366 struct device_domain_info *info;
1367 bool has_iotlb_device = false;
1368
1369 assert_spin_locked(&device_domain_lock);
1370
1371 list_for_each_entry(info, &domain->devices, link) {
1372 struct pci_dev *pdev;
1373
1374 if (!info->dev || !dev_is_pci(info->dev))
1375 continue;
1376
1377 pdev = to_pci_dev(info->dev);
1378 if (pdev->ats_enabled) {
1379 has_iotlb_device = true;
1380 break;
1381 }
1382 }
1383
1384 domain->has_iotlb_device = has_iotlb_device;
1385}
1386
1387static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1388{
1389 struct pci_dev *pdev;
1390
1391 assert_spin_locked(&device_domain_lock);
1392
1393 if (!info || !dev_is_pci(info->dev))
1394 return;
1395
1396 pdev = to_pci_dev(info->dev);
1397 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1398 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400 * reserved, which should be set to 0.
1401 */
1402 if (!ecap_dit(info->iommu->ecap))
1403 info->pfsid = 0;
1404 else {
1405 struct pci_dev *pf_pdev;
1406
1407 /* pdev will be returned if device is not a vf */
1408 pf_pdev = pci_physfn(pdev);
1409 info->pfsid = pci_dev_id(pf_pdev);
1410 }
1411
1412#ifdef CONFIG_INTEL_IOMMU_SVM
1413 /* The PCIe spec, in its wisdom, declares that the behaviour of
1414 the device if you enable PASID support after ATS support is
1415 undefined. So always enable PASID support on devices which
1416 have it, even if we can't yet know if we're ever going to
1417 use it. */
1418 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419 info->pasid_enabled = 1;
1420
1421 if (info->pri_supported &&
1422 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1423 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424 info->pri_enabled = 1;
1425#endif
1426 if (!pdev->untrusted && info->ats_supported &&
1427 pci_ats_page_aligned(pdev) &&
1428 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429 info->ats_enabled = 1;
1430 domain_update_iotlb(info->domain);
1431 info->ats_qdep = pci_ats_queue_depth(pdev);
1432 }
1433}
1434
1435static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1436{
1437 struct pci_dev *pdev;
1438
1439 assert_spin_locked(&device_domain_lock);
1440
1441 if (!dev_is_pci(info->dev))
1442 return;
1443
1444 pdev = to_pci_dev(info->dev);
1445
1446 if (info->ats_enabled) {
1447 pci_disable_ats(pdev);
1448 info->ats_enabled = 0;
1449 domain_update_iotlb(info->domain);
1450 }
1451#ifdef CONFIG_INTEL_IOMMU_SVM
1452 if (info->pri_enabled) {
1453 pci_disable_pri(pdev);
1454 info->pri_enabled = 0;
1455 }
1456 if (info->pasid_enabled) {
1457 pci_disable_pasid(pdev);
1458 info->pasid_enabled = 0;
1459 }
1460#endif
1461}
1462
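/*
 * Send a device-IOTLB (ATS) invalidation covering @addr/@mask to every
 * device in the domain that currently has ATS enabled.
 */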
1463static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464 u64 addr, unsigned mask)
1465{
1466 u16 sid, qdep;
1467 unsigned long flags;
1468 struct device_domain_info *info;
1469
1470 if (!domain->has_iotlb_device)
1471 return;
1472
1473 spin_lock_irqsave(&device_domain_lock, flags);
1474 list_for_each_entry(info, &domain->devices, link) {
1475 if (!info->ats_enabled)
1476 continue;
1477
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1482 }
1483 spin_unlock_irqrestore(&device_domain_lock, flags);
1484}
1485
1486static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487 struct dmar_domain *domain,
1488 unsigned long pfn, unsigned int pages,
1489 int ih, int map)
1490{
1491 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493 u16 did = domain->iommu_did[iommu->seq_id];
1494
1495 BUG_ON(pages == 0);
1496
1497 if (ih)
1498 ih = 1 << 6;
1499 /*
1500 * Fall back to a domain-selective flush if there is no PSI support or
1501 * the size is too big.
1502 * PSI requires the page size to be 2 ^ x, and the base address to be
1503 * naturally aligned to the size.
1504 */
1505 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507 DMA_TLB_DSI_FLUSH);
1508 else
1509 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510 DMA_TLB_PSI_FLUSH);
1511
1512 /*
1513 * In caching mode, changes of pages from non-present to present require
1514 * flush. However, device IOTLB doesn't need to be flushed in this case.
1515 */
1516 if (!cap_caching_mode(iommu->cap) || !map)
1517 iommu_flush_dev_iotlb(domain, addr, mask);
1518}
1519
1520/* Notification for newly created mappings */
1521static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522 struct dmar_domain *domain,
1523 unsigned long pfn, unsigned int pages)
1524{
1525 /* It's a non-present to present mapping. Only flush if caching mode */
1526 if (cap_caching_mode(iommu->cap))
1527 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1528 else
1529 iommu_flush_write_buffer(iommu);
1530}
1531
1532static void iommu_flush_iova(struct iova_domain *iovad)
1533{
1534 struct dmar_domain *domain;
1535 int idx;
1536
1537 domain = container_of(iovad, struct dmar_domain, iovad);
1538
1539 for_each_domain_iommu(idx, domain) {
1540 struct intel_iommu *iommu = g_iommus[idx];
1541 u16 did = domain->iommu_did[iommu->seq_id];
1542
1543 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1544
1545 if (!cap_caching_mode(iommu->cap))
1546 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547 0, MAX_AGAW_PFN_WIDTH);
1548 }
1549}
1550
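/*
 * Clear the Enable Protected Memory bit (DMA_PMEN_EPM) and wait for the
 * protected region status bit to clear.
 */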
1551static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1552{
1553 u32 pmen;
1554 unsigned long flags;
1555
1556 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1557 return;
1558
1559 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561 pmen &= ~DMA_PMEN_EPM;
1562 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1563
1564 /* wait for the protected region status bit to clear */
1565 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566 readl, !(pmen & DMA_PMEN_PRS), pmen);
1567
1568 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569}
1570
1571static void iommu_enable_translation(struct intel_iommu *iommu)
1572{
1573 u32 sts;
1574 unsigned long flags;
1575
1576 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577 iommu->gcmd |= DMA_GCMD_TE;
1578 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1579
1580 /* Make sure the hardware completes it */
1581 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582 readl, (sts & DMA_GSTS_TES), sts);
1583
1584 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585}
1586
1587static void iommu_disable_translation(struct intel_iommu *iommu)
1588{
1589 u32 sts;
1590 unsigned long flag;
1591
1592 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593 iommu->gcmd &= ~DMA_GCMD_TE;
1594 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595
1596 /* Make sure the hardware completes it */
1597 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598 readl, (!(sts & DMA_GSTS_TES)), sts);
1599
1600 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1601}
1602
1603static int iommu_init_domains(struct intel_iommu *iommu)
1604{
1605 u32 ndomains, nlongs;
1606 size_t size;
1607
1608 ndomains = cap_ndoms(iommu->cap);
1609 pr_debug("%s: Number of Domains supported <%d>\n",
1610 iommu->name, ndomains);
1611 nlongs = BITS_TO_LONGS(ndomains);
1612
1613 spin_lock_init(&iommu->lock);
1614
1615 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616 if (!iommu->domain_ids) {
1617 pr_err("%s: Allocating domain id array failed\n",
1618 iommu->name);
1619 return -ENOMEM;
1620 }
1621
1622 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623 iommu->domains = kzalloc(size, GFP_KERNEL);
1624
1625 if (iommu->domains) {
1626 size = 256 * sizeof(struct dmar_domain *);
1627 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1628 }
1629
1630 if (!iommu->domains || !iommu->domains[0]) {
1631 pr_err("%s: Allocating domain array failed\n",
1632 iommu->name);
1633 kfree(iommu->domain_ids);
1634 kfree(iommu->domains);
1635 iommu->domain_ids = NULL;
1636 iommu->domains = NULL;
1637 return -ENOMEM;
1638 }
1639
1640 /*
1641 * If Caching mode is set, then invalid translations are tagged
1642 * with domain-id 0, hence we need to pre-allocate it. We also
1643 * use domain-id 0 as a marker for non-allocated domain-id, so
1644 * make sure it is not used for a real domain.
1645 */
1646 set_bit(0, iommu->domain_ids);
1647
1648 /*
1649 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1650 * entry for first-level or pass-through translation modes should
1651 * be programmed with a domain id different from those used for
1652 * second-level or nested translation. We reserve a domain id for
1653 * this purpose.
1654 */
1655 if (sm_supported(iommu))
1656 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1657
1658 return 0;
1659}
1660
1661static void disable_dmar_iommu(struct intel_iommu *iommu)
1662{
1663 struct device_domain_info *info, *tmp;
1664 unsigned long flags;
1665
1666 if (!iommu->domains || !iommu->domain_ids)
1667 return;
1668
1669 spin_lock_irqsave(&device_domain_lock, flags);
1670 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671 if (info->iommu != iommu)
1672 continue;
1673
1674 if (!info->dev || !info->domain)
1675 continue;
1676
1677 __dmar_remove_one_dev_info(info);
1678 }
1679 spin_unlock_irqrestore(&device_domain_lock, flags);
1680
1681 if (iommu->gcmd & DMA_GCMD_TE)
1682 iommu_disable_translation(iommu);
1683}
1684
1685static void free_dmar_iommu(struct intel_iommu *iommu)
1686{
1687 if ((iommu->domains) && (iommu->domain_ids)) {
1688 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1689 int i;
1690
1691 for (i = 0; i < elems; i++)
1692 kfree(iommu->domains[i]);
1693 kfree(iommu->domains);
1694 kfree(iommu->domain_ids);
1695 iommu->domains = NULL;
1696 iommu->domain_ids = NULL;
1697 }
1698
1699 g_iommus[iommu->seq_id] = NULL;
1700
1701 /* free context mapping */
1702 free_context_table(iommu);
1703
1704#ifdef CONFIG_INTEL_IOMMU_SVM
1705 if (pasid_supported(iommu)) {
1706 if (ecap_prs(iommu->ecap))
1707 intel_svm_finish_prq(iommu);
1708 }
1709#endif
1710}
1711
1712static struct dmar_domain *alloc_domain(int flags)
1713{
1714 struct dmar_domain *domain;
1715
1716 domain = alloc_domain_mem();
1717 if (!domain)
1718 return NULL;
1719
1720 memset(domain, 0, sizeof(*domain));
1721 domain->nid = NUMA_NO_NODE;
1722 domain->flags = flags;
1723 domain->has_iotlb_device = false;
1724 INIT_LIST_HEAD(&domain->devices);
1725
1726 return domain;
1727}
1728
1729/* Must be called with device_domain_lock and iommu->lock held */
1730static int domain_attach_iommu(struct dmar_domain *domain,
1731 struct intel_iommu *iommu)
1732{
1733 unsigned long ndomains;
1734 int num;
1735
1736 assert_spin_locked(&device_domain_lock);
1737 assert_spin_locked(&iommu->lock);
1738
1739 domain->iommu_refcnt[iommu->seq_id] += 1;
1740 domain->iommu_count += 1;
1741 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742 ndomains = cap_ndoms(iommu->cap);
1743 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1744
1745 if (num >= ndomains) {
1746 pr_err("%s: No free domain ids\n", iommu->name);
1747 domain->iommu_refcnt[iommu->seq_id] -= 1;
1748 domain->iommu_count -= 1;
1749 return -ENOSPC;
1750 }
1751
1752 set_bit(num, iommu->domain_ids);
1753 set_iommu_domain(iommu, num, domain);
1754
1755 domain->iommu_did[iommu->seq_id] = num;
1756 domain->nid = iommu->node;
1757
1758 domain_update_iommu_cap(domain);
1759 }
1760
1761 return 0;
1762}
1763
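/*
 * Drop the domain's reference on @iommu; when the last reference for
 * this iommu goes away, release the domain id allocated on it. Caller
 * must hold device_domain_lock and iommu->lock.
 */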
1764static int domain_detach_iommu(struct dmar_domain *domain,
1765 struct intel_iommu *iommu)
1766{
1767 int num, count;
1768
1769 assert_spin_locked(&device_domain_lock);
1770 assert_spin_locked(&iommu->lock);
1771
1772 domain->iommu_refcnt[iommu->seq_id] -= 1;
1773 count = --domain->iommu_count;
1774 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775 num = domain->iommu_did[iommu->seq_id];
1776 clear_bit(num, iommu->domain_ids);
1777 set_iommu_domain(iommu, num, NULL);
1778
1779 domain_update_iommu_cap(domain);
1780 domain->iommu_did[iommu->seq_id] = 0;
1781 }
1782
1783 return count;
1784}
1785
1786static struct iova_domain reserved_iova_list;
1787static struct lock_class_key reserved_rbtree_key;
1788
1789static int dmar_init_reserved_ranges(void)
1790{
1791 struct pci_dev *pdev = NULL;
1792 struct iova *iova;
1793 int i;
1794
1795 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796
1797 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798 &reserved_rbtree_key);
1799
1800 /* IOAPIC ranges shouldn't be accessed by DMA */
1801 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802 IOVA_PFN(IOAPIC_RANGE_END));
1803 if (!iova) {
1804 pr_err("Reserve IOAPIC range failed\n");
1805 return -ENODEV;
1806 }
1807
1808 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809 for_each_pci_dev(pdev) {
1810 struct resource *r;
1811
1812 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813 r = &pdev->resource[i];
1814 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815 continue;
1816 iova = reserve_iova(&reserved_iova_list,
1817 IOVA_PFN(r->start),
1818 IOVA_PFN(r->end));
1819 if (!iova) {
1820 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1821 return -ENODEV;
1822 }
1823 }
1824 }
1825 return 0;
1826}
1827
1828static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829{
1830 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1831}
1832
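/*
 * Round a guest address width up to the nearest width a page-table walk
 * can express (12 + 9 * n bits), capped at 64 bits.
 */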
1833static inline int guestwidth_to_adjustwidth(int gaw)
1834{
1835 int agaw;
1836 int r = (gaw - 12) % 9;
1837
1838 if (r == 0)
1839 agaw = gaw;
1840 else
1841 agaw = gaw + 9 - r;
1842 if (agaw > 64)
1843 agaw = 64;
1844 return agaw;
1845}
1846
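/*
 * Initialize a newly allocated DMA domain for @iommu: set up its IOVA
 * space and flush queue, pick an AGAW the hardware supports for the
 * requested guest width, and allocate the top-level page table.
 */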
1847static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1848 int guest_width)
1849{
1850 int adjust_width, agaw;
1851 unsigned long sagaw;
1852 int err;
1853
1854 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855
1856 err = init_iova_flush_queue(&domain->iovad,
1857 iommu_flush_iova, iova_entry_free);
1858 if (err)
1859 return err;
1860
1861 domain_reserve_special_ranges(domain);
1862
1863 /* calculate AGAW */
1864 if (guest_width > cap_mgaw(iommu->cap))
1865 guest_width = cap_mgaw(iommu->cap);
1866 domain->gaw = guest_width;
1867 adjust_width = guestwidth_to_adjustwidth(guest_width);
1868 agaw = width_to_agaw(adjust_width);
1869 sagaw = cap_sagaw(iommu->cap);
1870 if (!test_bit(agaw, &sagaw)) {
1871 /* hardware doesn't support it, choose a bigger one */
1872 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873 agaw = find_next_bit(&sagaw, 5, agaw);
1874 if (agaw >= 5)
1875 return -ENODEV;
1876 }
1877 domain->agaw = agaw;
1878
1879 if (ecap_coherent(iommu->ecap))
1880 domain->iommu_coherency = 1;
1881 else
1882 domain->iommu_coherency = 0;
1883
1884 if (ecap_sc_support(iommu->ecap))
1885 domain->iommu_snooping = 1;
1886 else
1887 domain->iommu_snooping = 0;
1888
1889 if (intel_iommu_superpage)
1890 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891 else
1892 domain->iommu_superpage = 0;
1893
1894 domain->nid = iommu->node;
1895
1896 /* always allocate the top pgd */
1897 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1898 if (!domain->pgd)
1899 return -ENOMEM;
1900 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1901 return 0;
1902}
1903
1904static void domain_exit(struct dmar_domain *domain)
1905{
1906
1907 /* Remove associated devices and clear attached or cached domains */
1908 domain_remove_dev_info(domain);
1909
1910 /* destroy iovas */
1911 put_iova_domain(&domain->iovad);
1912
1913 if (domain->pgd) {
1914 struct page *freelist;
1915
1916 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917 dma_free_pagelist(freelist);
1918 }
1919
1920 free_domain_mem(domain);
1921}
1922
1923/*
1924 * Get the PASID directory size for scalable mode context entry.
1925 * Value of X in the PDTS field of a scalable mode context entry
1926 * indicates PASID directory with 2^(X + 7) entries.
1927 */
1928static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1929{
1930 int pds, max_pde;
1931
1932 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1934 if (pds < 7)
1935 return 0;
1936
1937 return pds - 7;
1938}
1939
1940/*
1941 * Set the RID_PASID field of a scalable mode context entry. The
1942 * IOMMU hardware will use the PASID value set in this field for
1943 * DMA translations of DMA requests without PASID.
1944 */
1945static inline void
1946context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1947{
1948 context->hi |= pasid & ((1 << 20) - 1);
1949 context->hi |= (1 << 20);
1950}
1951
1952/*
1953 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1954 * entry.
1955 */
1956static inline void context_set_sm_dte(struct context_entry *context)
1957{
1958 context->lo |= (1 << 2);
1959}
1960
1961/*
1962 * Set the PRE(Page Request Enable) field of a scalable mode context
1963 * entry.
1964 */
1965static inline void context_set_sm_pre(struct context_entry *context)
1966{
1967 context->lo |= (1 << 4);
1968}
1969
1970/* Convert value to context PASID directory size field coding. */
1971#define context_pdts(pds) (((pds) & 0x7) << 9)
1972
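/*
 * Install the context entry for (@bus, @devfn) on @iommu, pointing it at
 * the domain's page table (legacy mode) or at the PASID directory in
 * @table (scalable mode), then flush the context and IOTLB caches as
 * required by the caching mode.
 */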
1973static int domain_context_mapping_one(struct dmar_domain *domain,
1974 struct intel_iommu *iommu,
1975 struct pasid_table *table,
1976 u8 bus, u8 devfn)
1977{
1978 u16 did = domain->iommu_did[iommu->seq_id];
1979 int translation = CONTEXT_TT_MULTI_LEVEL;
1980 struct device_domain_info *info = NULL;
1981 struct context_entry *context;
1982 unsigned long flags;
1983 int ret;
1984
1985 WARN_ON(did == 0);
1986
1987 if (hw_pass_through && domain_type_is_si(domain))
1988 translation = CONTEXT_TT_PASS_THROUGH;
1989
1990 pr_debug("Set context mapping for %02x:%02x.%d\n",
1991 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1992
1993 BUG_ON(!domain->pgd);
1994
1995 spin_lock_irqsave(&device_domain_lock, flags);
1996 spin_lock(&iommu->lock);
1997
1998 ret = -ENOMEM;
1999 context = iommu_context_addr(iommu, bus, devfn, 1);
2000 if (!context)
2001 goto out_unlock;
2002
2003 ret = 0;
2004 if (context_present(context))
2005 goto out_unlock;
2006
2007 /*
2008 * For kdump cases, old valid entries may be cached due to the
2009 * in-flight DMA and copied pgtable, but there is no unmapping
2010 * behaviour for them, thus we need an explicit cache flush for
2011 * the newly-mapped device. For kdump, at this point, the device
2012	 * is supposed to have finished reset at its driver probe stage, so no
2013	 * in-flight DMA will exist, and we don't need to worry about it
2014	 * hereafter.
2015 */
2016 if (context_copied(context)) {
2017 u16 did_old = context_domain_id(context);
2018
2019 if (did_old < cap_ndoms(iommu->cap)) {
2020 iommu->flush.flush_context(iommu, did_old,
2021 (((u16)bus) << 8) | devfn,
2022 DMA_CCMD_MASK_NOBIT,
2023 DMA_CCMD_DEVICE_INVL);
2024 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2025 DMA_TLB_DSI_FLUSH);
2026 }
2027 }
2028
2029 context_clear_entry(context);
2030
2031 if (sm_supported(iommu)) {
2032 unsigned long pds;
2033
2034 WARN_ON(!table);
2035
2036 /* Setup the PASID DIR pointer: */
2037 pds = context_get_sm_pds(table);
2038 context->lo = (u64)virt_to_phys(table->table) |
2039 context_pdts(pds);
2040
2041 /* Setup the RID_PASID field: */
2042 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2043
2044 /*
2045 * Setup the Device-TLB enable bit and Page request
2046 * Enable bit:
2047 */
2048 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049 if (info && info->ats_supported)
2050 context_set_sm_dte(context);
2051 if (info && info->pri_supported)
2052 context_set_sm_pre(context);
2053 } else {
2054 struct dma_pte *pgd = domain->pgd;
2055 int agaw;
2056
2057 context_set_domain_id(context, did);
2058
2059 if (translation != CONTEXT_TT_PASS_THROUGH) {
2060 /*
2061 * Skip top levels of page tables for iommu which has
2062 * less agaw than default. Unnecessary for PT mode.
2063 */
2064 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2065 ret = -ENOMEM;
2066 pgd = phys_to_virt(dma_pte_addr(pgd));
2067 if (!dma_pte_present(pgd))
2068 goto out_unlock;
2069 }
2070
2071 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072 if (info && info->ats_supported)
2073 translation = CONTEXT_TT_DEV_IOTLB;
2074 else
2075 translation = CONTEXT_TT_MULTI_LEVEL;
2076
2077 context_set_address_root(context, virt_to_phys(pgd));
2078 context_set_address_width(context, agaw);
2079 } else {
2080 /*
2081 * In pass through mode, AW must be programmed to
2082 * indicate the largest AGAW value supported by
2083 * hardware. And ASR is ignored by hardware.
2084 */
2085 context_set_address_width(context, iommu->msagaw);
2086 }
2087
2088 context_set_translation_type(context, translation);
2089 }
2090
2091 context_set_fault_enable(context);
2092 context_set_present(context);
2093 domain_flush_cache(domain, context, sizeof(*context));
2094
2095 /*
2096 * It's a non-present to present mapping. If hardware doesn't cache
2097	 * non-present entries we only need to flush the write-buffer. If it
2098	 * _does_ cache non-present entries, then it does so in the special
2099 * domain #0, which we have to flush:
2100 */
2101 if (cap_caching_mode(iommu->cap)) {
2102 iommu->flush.flush_context(iommu, 0,
2103 (((u16)bus) << 8) | devfn,
2104 DMA_CCMD_MASK_NOBIT,
2105 DMA_CCMD_DEVICE_INVL);
2106 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2107 } else {
2108 iommu_flush_write_buffer(iommu);
2109 }
2110 iommu_enable_dev_iotlb(info);
2111
2112 ret = 0;
2113
2114out_unlock:
2115 spin_unlock(&iommu->lock);
2116 spin_unlock_irqrestore(&device_domain_lock, flags);
2117
2118 return ret;
2119}
2120
2121struct domain_context_mapping_data {
2122 struct dmar_domain *domain;
2123 struct intel_iommu *iommu;
2124 struct pasid_table *table;
2125};
2126
2127static int domain_context_mapping_cb(struct pci_dev *pdev,
2128 u16 alias, void *opaque)
2129{
2130 struct domain_context_mapping_data *data = opaque;
2131
2132 return domain_context_mapping_one(data->domain, data->iommu,
2133 data->table, PCI_BUS_NUM(alias),
2134 alias & 0xff);
2135}
2136
2137static int
2138domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2139{
2140 struct domain_context_mapping_data data;
2141 struct pasid_table *table;
2142 struct intel_iommu *iommu;
2143 u8 bus, devfn;
2144
2145 iommu = device_to_iommu(dev, &bus, &devfn);
2146 if (!iommu)
2147 return -ENODEV;
2148
2149 table = intel_pasid_get_table(dev);
2150
2151 if (!dev_is_pci(dev))
2152 return domain_context_mapping_one(domain, iommu, table,
2153 bus, devfn);
2154
2155 data.domain = domain;
2156 data.iommu = iommu;
2157 data.table = table;
2158
2159 return pci_for_each_dma_alias(to_pci_dev(dev),
2160 &domain_context_mapping_cb, &data);
2161}
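
/*
 * Illustrative note (not part of the driver): for a PCI device,
 * pci_for_each_dma_alias() invokes the callback for the device's own
 * bus/devfn and additionally for every DMA alias the PCI core reports on
 * the path to the root (for example the requester ID a legacy PCI bridge
 * uses on behalf of the devices behind it), so every aliased context
 * entry gets programmed as well:
 *
 *	domain_context_mapping_one(domain, iommu, table, bus, devfn);
 *	// ...repeated for each (alias_bus, alias_devfn) reported
 */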
2162
2163static int domain_context_mapped_cb(struct pci_dev *pdev,
2164 u16 alias, void *opaque)
2165{
2166 struct intel_iommu *iommu = opaque;
2167
2168 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2169}
2170
2171static int domain_context_mapped(struct device *dev)
2172{
2173 struct intel_iommu *iommu;
2174 u8 bus, devfn;
2175
2176 iommu = device_to_iommu(dev, &bus, &devfn);
2177 if (!iommu)
2178 return -ENODEV;
2179
2180 if (!dev_is_pci(dev))
2181 return device_context_mapped(iommu, bus, devfn);
2182
2183 return !pci_for_each_dma_alias(to_pci_dev(dev),
2184 domain_context_mapped_cb, iommu);
2185}
2186
2187/* Returns a number of VTD pages, but aligned to MM page size */
2188static inline unsigned long aligned_nrpages(unsigned long host_addr,
2189 size_t size)
2190{
2191 host_addr &= ~PAGE_MASK;
2192 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2193}
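
/*
 * Worked example (illustrative only, assuming 4KiB pages so MM pages and
 * VT-d pages are the same size): host_addr = 0x12345678, size = 0x2000.
 *
 *	host_addr &= ~PAGE_MASK;			// 0x678
 *	PAGE_ALIGN(0x678 + 0x2000);			// 0x3000
 *	return 0x3000 >> VTD_PAGE_SHIFT;		// 3 pages
 *
 * Three pages are needed even though size itself is only two pages long,
 * because the buffer starts in the middle of a page.
 */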
2194
2195/* Return largest possible superpage level for a given mapping */
2196static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197 unsigned long iov_pfn,
2198 unsigned long phy_pfn,
2199 unsigned long pages)
2200{
2201 int support, level = 1;
2202 unsigned long pfnmerge;
2203
2204 support = domain->iommu_superpage;
2205
2206 /* To use a large page, the virtual *and* physical addresses
2207 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208 of them will mean we have to use smaller pages. So just
2209 merge them and check both at once. */
2210 pfnmerge = iov_pfn | phy_pfn;
2211
2212 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213 pages >>= VTD_STRIDE_SHIFT;
2214 if (!pages)
2215 break;
2216 pfnmerge >>= VTD_STRIDE_SHIFT;
2217 level++;
2218 support--;
2219 }
2220 return level;
2221}
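
/*
 * Worked example (illustrative only): with domain->iommu_superpage == 1
 * (2MiB pages supported), iov_pfn = 0x800, phy_pfn = 0x1a00 and
 * pages = 0x200:
 *
 *	pfnmerge = 0x800 | 0x1a00 = 0x1a00;	// low 9 bits clear
 *	// first loop pass: pages >>= 9 -> 1, level becomes 2, support hits 0
 *	return 2;				// a 2MiB superpage can be used
 *
 * If either pfn had any of its low 9 bits set, the loop would never be
 * entered and level 1 (4KiB pages) would be returned.
 */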
2222
2223static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224 struct scatterlist *sg, unsigned long phys_pfn,
2225 unsigned long nr_pages, int prot)
2226{
2227 struct dma_pte *first_pte = NULL, *pte = NULL;
2228 phys_addr_t uninitialized_var(pteval);
2229 unsigned long sg_res = 0;
2230 unsigned int largepage_lvl = 0;
2231 unsigned long lvl_pages = 0;
2232
2233 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2234
2235 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2236 return -EINVAL;
2237
2238 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2239
2240 if (!sg) {
2241 sg_res = nr_pages;
2242 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2243 }
2244
2245 while (nr_pages > 0) {
2246 uint64_t tmp;
2247
2248 if (!sg_res) {
2249 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2250
2251 sg_res = aligned_nrpages(sg->offset, sg->length);
2252 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253 sg->dma_length = sg->length;
2254 pteval = (sg_phys(sg) - pgoff) | prot;
2255 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2256 }
2257
2258 if (!pte) {
2259 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2260
2261 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2262 if (!pte)
2263 return -ENOMEM;
2264			/* It is a large page */
2265 if (largepage_lvl > 1) {
2266 unsigned long nr_superpages, end_pfn;
2267
2268 pteval |= DMA_PTE_LARGE_PAGE;
2269 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2270
2271 nr_superpages = sg_res / lvl_pages;
2272 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2273
2274 /*
2275 * Ensure that old small page tables are
2276 * removed to make room for superpage(s).
2277 * We're adding new large pages, so make sure
2278 * we don't remove their parent tables.
2279 */
2280 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2281 largepage_lvl + 1);
2282 } else {
2283 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2284 }
2285
2286 }
2287		/* We don't need a lock here; nobody else
2288		 * touches the iova range.
2289		 */
2290 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2291 if (tmp) {
2292 static int dumps = 5;
2293 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294 iov_pfn, tmp, (unsigned long long)pteval);
2295 if (dumps) {
2296 dumps--;
2297 debug_dma_dump_mappings(NULL);
2298 }
2299 WARN_ON(1);
2300 }
2301
2302 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2303
2304 BUG_ON(nr_pages < lvl_pages);
2305 BUG_ON(sg_res < lvl_pages);
2306
2307 nr_pages -= lvl_pages;
2308 iov_pfn += lvl_pages;
2309 phys_pfn += lvl_pages;
2310 pteval += lvl_pages * VTD_PAGE_SIZE;
2311 sg_res -= lvl_pages;
2312
2313 /* If the next PTE would be the first in a new page, then we
2314 need to flush the cache on the entries we've just written.
2315 And then we'll need to recalculate 'pte', so clear it and
2316 let it get set again in the if (!pte) block above.
2317
2318 If we're done (!nr_pages) we need to flush the cache too.
2319
2320 Also if we've been setting superpages, we may need to
2321 recalculate 'pte' and switch back to smaller pages for the
2322 end of the mapping, if the trailing size is not enough to
2323 use another superpage (i.e. sg_res < lvl_pages). */
2324 pte++;
2325 if (!nr_pages || first_pte_in_page(pte) ||
2326 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327 domain_flush_cache(domain, first_pte,
2328 (void *)pte - (void *)first_pte);
2329 pte = NULL;
2330 }
2331
2332 if (!sg_res && nr_pages)
2333 sg = sg_next(sg);
2334 }
2335 return 0;
2336}
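
/*
 * Illustrative sketch (not part of the driver): mapping 513 contiguous
 * 4KiB pages with 2MiB-aligned iov_pfn and phys_pfn on hardware that
 * supports 2MiB superpages, e.g.
 *
 *	__domain_mapping(domain, iov_pfn, NULL, phys_pfn, 513,
 *			 DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * The first loop iteration selects largepage_lvl == 2 and writes a single
 * 2MiB PTE covering 512 pages; the remaining page no longer fills a
 * superpage (sg_res < lvl_pages), so the PTE pointer is recalculated and
 * the trailing page is mapped with an ordinary 4KiB PTE.
 */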
2337
2338static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339 struct scatterlist *sg, unsigned long phys_pfn,
2340 unsigned long nr_pages, int prot)
2341{
2342 int iommu_id, ret;
2343 struct intel_iommu *iommu;
2344
2345 /* Do the real mapping first */
2346 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2347 if (ret)
2348 return ret;
2349
2350 for_each_domain_iommu(iommu_id, domain) {
2351 iommu = g_iommus[iommu_id];
2352 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2353 }
2354
2355 return 0;
2356}
2357
2358static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359 struct scatterlist *sg, unsigned long nr_pages,
2360 int prot)
2361{
2362 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2363}
2364
2365static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366 unsigned long phys_pfn, unsigned long nr_pages,
2367 int prot)
2368{
2369 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2370}
2371
2372static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2373{
2374 unsigned long flags;
2375 struct context_entry *context;
2376 u16 did_old;
2377
2378 if (!iommu)
2379 return;
2380
2381 spin_lock_irqsave(&iommu->lock, flags);
2382 context = iommu_context_addr(iommu, bus, devfn, 0);
2383 if (!context) {
2384 spin_unlock_irqrestore(&iommu->lock, flags);
2385 return;
2386 }
2387 did_old = context_domain_id(context);
2388 context_clear_entry(context);
2389 __iommu_flush_cache(iommu, context, sizeof(*context));
2390 spin_unlock_irqrestore(&iommu->lock, flags);
2391 iommu->flush.flush_context(iommu,
2392 did_old,
2393 (((u16)bus) << 8) | devfn,
2394 DMA_CCMD_MASK_NOBIT,
2395 DMA_CCMD_DEVICE_INVL);
2396 iommu->flush.flush_iotlb(iommu,
2397 did_old,
2398 0,
2399 0,
2400 DMA_TLB_DSI_FLUSH);
2401}
2402
2403static inline void unlink_domain_info(struct device_domain_info *info)
2404{
2405 assert_spin_locked(&device_domain_lock);
2406 list_del(&info->link);
2407 list_del(&info->global);
2408 if (info->dev)
2409 info->dev->archdata.iommu = NULL;
2410}
2411
2412static void domain_remove_dev_info(struct dmar_domain *domain)
2413{
2414 struct device_domain_info *info, *tmp;
2415 unsigned long flags;
2416
2417 spin_lock_irqsave(&device_domain_lock, flags);
2418 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419 __dmar_remove_one_dev_info(info);
2420 spin_unlock_irqrestore(&device_domain_lock, flags);
2421}
2422
2423/*
2424 * find_domain
2425 * Note: we use struct device->archdata.iommu to store the info
2426 */
2427static struct dmar_domain *find_domain(struct device *dev)
2428{
2429 struct device_domain_info *info;
2430
2431 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2432 struct iommu_domain *domain;
2433
2434 dev->archdata.iommu = NULL;
2435 domain = iommu_get_domain_for_dev(dev);
2436 if (domain)
2437 intel_iommu_attach_device(domain, dev);
2438 }
2439
2440 /* No lock here, assumes no domain exit in normal case */
2441 info = dev->archdata.iommu;
2442
2443 if (likely(info))
2444 return info->domain;
2445 return NULL;
2446}
2447
2448static inline struct device_domain_info *
2449dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2450{
2451 struct device_domain_info *info;
2452
2453 list_for_each_entry(info, &device_domain_list, global)
2454 if (info->iommu->segment == segment && info->bus == bus &&
2455 info->devfn == devfn)
2456 return info;
2457
2458 return NULL;
2459}
2460
2461static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2462 int bus, int devfn,
2463 struct device *dev,
2464 struct dmar_domain *domain)
2465{
2466 struct dmar_domain *found = NULL;
2467 struct device_domain_info *info;
2468 unsigned long flags;
2469 int ret;
2470
2471 info = alloc_devinfo_mem();
2472 if (!info)
2473 return NULL;
2474
2475 info->bus = bus;
2476 info->devfn = devfn;
2477 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2479 info->ats_qdep = 0;
2480 info->dev = dev;
2481 info->domain = domain;
2482 info->iommu = iommu;
2483 info->pasid_table = NULL;
2484 info->auxd_enabled = 0;
2485 INIT_LIST_HEAD(&info->auxiliary_domains);
2486
2487 if (dev && dev_is_pci(dev)) {
2488 struct pci_dev *pdev = to_pci_dev(info->dev);
2489
2490 if (!pdev->untrusted &&
2491 !pci_ats_disabled() &&
2492 ecap_dev_iotlb_support(iommu->ecap) &&
2493 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494 dmar_find_matched_atsr_unit(pdev))
2495 info->ats_supported = 1;
2496
2497 if (sm_supported(iommu)) {
2498 if (pasid_supported(iommu)) {
2499 int features = pci_pasid_features(pdev);
2500 if (features >= 0)
2501 info->pasid_supported = features | 1;
2502 }
2503
2504 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506 info->pri_supported = 1;
2507 }
2508 }
2509
2510 spin_lock_irqsave(&device_domain_lock, flags);
2511 if (dev)
2512 found = find_domain(dev);
2513
2514 if (!found) {
2515 struct device_domain_info *info2;
2516 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2517 if (info2) {
2518 found = info2->domain;
2519 info2->dev = dev;
2520 }
2521 }
2522
2523 if (found) {
2524 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 free_devinfo_mem(info);
2526 /* Caller must free the original domain */
2527 return found;
2528 }
2529
2530 spin_lock(&iommu->lock);
2531 ret = domain_attach_iommu(domain, iommu);
2532 spin_unlock(&iommu->lock);
2533
2534 if (ret) {
2535 spin_unlock_irqrestore(&device_domain_lock, flags);
2536 free_devinfo_mem(info);
2537 return NULL;
2538 }
2539
2540 list_add(&info->link, &domain->devices);
2541 list_add(&info->global, &device_domain_list);
2542 if (dev)
2543 dev->archdata.iommu = info;
2544 spin_unlock_irqrestore(&device_domain_lock, flags);
2545
2546 /* PASID table is mandatory for a PCI device in scalable mode. */
2547 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2548 ret = intel_pasid_alloc_table(dev);
2549 if (ret) {
2550 dev_err(dev, "PASID table allocation failed\n");
2551 dmar_remove_one_dev_info(dev);
2552 return NULL;
2553 }
2554
2555 /* Setup the PASID entry for requests without PASID: */
2556 spin_lock(&iommu->lock);
2557 if (hw_pass_through && domain_type_is_si(domain))
2558 ret = intel_pasid_setup_pass_through(iommu, domain,
2559 dev, PASID_RID2PASID);
2560 else
2561 ret = intel_pasid_setup_second_level(iommu, domain,
2562 dev, PASID_RID2PASID);
2563 spin_unlock(&iommu->lock);
2564 if (ret) {
2565 dev_err(dev, "Setup RID2PASID failed\n");
2566 dmar_remove_one_dev_info(dev);
2567 return NULL;
2568 }
2569 }
2570
2571 if (dev && domain_context_mapping(domain, dev)) {
2572 dev_err(dev, "Domain context map failed\n");
2573 dmar_remove_one_dev_info(dev);
2574 return NULL;
2575 }
2576
2577 return domain;
2578}
2579
2580static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2581{
2582 *(u16 *)opaque = alias;
2583 return 0;
2584}
2585
2586static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2587{
2588 struct device_domain_info *info;
2589 struct dmar_domain *domain = NULL;
2590 struct intel_iommu *iommu;
2591 u16 dma_alias;
2592 unsigned long flags;
2593 u8 bus, devfn;
2594
2595 iommu = device_to_iommu(dev, &bus, &devfn);
2596 if (!iommu)
2597 return NULL;
2598
2599 if (dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(dev);
2601
2602 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2603
2604 spin_lock_irqsave(&device_domain_lock, flags);
2605 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2606 PCI_BUS_NUM(dma_alias),
2607 dma_alias & 0xff);
2608 if (info) {
2609 iommu = info->iommu;
2610 domain = info->domain;
2611 }
2612 spin_unlock_irqrestore(&device_domain_lock, flags);
2613
2614 /* DMA alias already has a domain, use it */
2615 if (info)
2616 goto out;
2617 }
2618
2619 /* Allocate and initialize new domain for the device */
2620 domain = alloc_domain(0);
2621 if (!domain)
2622 return NULL;
2623 if (domain_init(domain, iommu, gaw)) {
2624 domain_exit(domain);
2625 return NULL;
2626 }
2627
2628out:
2629 return domain;
2630}
2631
2632static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633 struct dmar_domain *domain)
2634{
2635 struct intel_iommu *iommu;
2636 struct dmar_domain *tmp;
2637 u16 req_id, dma_alias;
2638 u8 bus, devfn;
2639
2640 iommu = device_to_iommu(dev, &bus, &devfn);
2641 if (!iommu)
2642 return NULL;
2643
2644 req_id = ((u16)bus << 8) | devfn;
2645
2646 if (dev_is_pci(dev)) {
2647 struct pci_dev *pdev = to_pci_dev(dev);
2648
2649 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650
2651 /* register PCI DMA alias device */
2652 if (req_id != dma_alias) {
2653 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654 dma_alias & 0xff, NULL, domain);
2655
2656 if (!tmp || tmp != domain)
2657 return tmp;
2658 }
2659 }
2660
2661 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662 if (!tmp || tmp != domain)
2663 return tmp;
2664
2665 return domain;
2666}
2667
2668static int iommu_domain_identity_map(struct dmar_domain *domain,
2669 unsigned long long start,
2670 unsigned long long end)
2671{
2672 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2673 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2674
2675 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2676 dma_to_mm_pfn(last_vpfn))) {
2677 pr_err("Reserving iova failed\n");
2678 return -ENOMEM;
2679 }
2680
2681 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2682 /*
2683 * RMRR range might have overlap with physical memory range,
2684 * clear it first
2685 */
2686 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2687
2688 return __domain_mapping(domain, first_vpfn, NULL,
2689 first_vpfn, last_vpfn - first_vpfn + 1,
2690 DMA_PTE_READ|DMA_PTE_WRITE);
2691}
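
/*
 * Worked example (illustrative only): identity-mapping an RMRR such as
 * [0xdf000000, 0xdf03ffff] for a USB controller would amount to:
 *
 *	iommu_domain_identity_map(domain, 0xdf000000, 0xdf03ffff);
 *	// first_vpfn = 0xdf000, last_vpfn = 0xdf03f
 *	// reserves those IOVAs, clears any stale PTEs and maps the
 *	// 0x40 pages 1:1 (IOVA == physical address) with read/write
 *	// permission.
 */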
2692
2693static int domain_prepare_identity_map(struct device *dev,
2694 struct dmar_domain *domain,
2695 unsigned long long start,
2696 unsigned long long end)
2697{
2698	/* For _hardware_ passthrough, don't bother. But for software
2699	   passthrough, we do it anyway -- it may indicate a memory
2700	   range which is reserved in E820, and so didn't get set
2701	   up in si_domain to start with. */
2702 if (domain == si_domain && hw_pass_through) {
2703 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2704 start, end);
2705 return 0;
2706 }
2707
2708 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2709
2710 if (end < start) {
2711 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2712 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2713 dmi_get_system_info(DMI_BIOS_VENDOR),
2714 dmi_get_system_info(DMI_BIOS_VERSION),
2715 dmi_get_system_info(DMI_PRODUCT_VERSION));
2716 return -EIO;
2717 }
2718
2719 if (end >> agaw_to_width(domain->agaw)) {
2720 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2721 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2722 agaw_to_width(domain->agaw),
2723 dmi_get_system_info(DMI_BIOS_VENDOR),
2724 dmi_get_system_info(DMI_BIOS_VERSION),
2725 dmi_get_system_info(DMI_PRODUCT_VERSION));
2726 return -EIO;
2727 }
2728
2729 return iommu_domain_identity_map(domain, start, end);
2730}
2731
2732static int md_domain_init(struct dmar_domain *domain, int guest_width);
2733
2734static int __init si_domain_init(int hw)
2735{
2736 struct dmar_rmrr_unit *rmrr;
2737 struct device *dev;
2738 int i, nid, ret;
2739
2740 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2741 if (!si_domain)
2742 return -EFAULT;
2743
2744 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2745 domain_exit(si_domain);
2746 return -EFAULT;
2747 }
2748
2749 if (hw)
2750 return 0;
2751
2752 for_each_online_node(nid) {
2753 unsigned long start_pfn, end_pfn;
2754 int i;
2755
2756 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2757 ret = iommu_domain_identity_map(si_domain,
2758 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2759 if (ret)
2760 return ret;
2761 }
2762 }
2763
2764 /*
2765 * Normally we use DMA domains for devices which have RMRRs. But we
2766 * relax this requirement for graphics and USB devices. Identity-map
2767 * the RMRRs for graphics and USB devices so that they can use the
2768 * si_domain.
2769 */
2770 for_each_rmrr_units(rmrr) {
2771 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2772 i, dev) {
2773 unsigned long long start = rmrr->base_address;
2774 unsigned long long end = rmrr->end_address;
2775
2776 if (device_is_rmrr_locked(dev))
2777 continue;
2778
2779 if (WARN_ON(end < start ||
2780 end >> agaw_to_width(si_domain->agaw)))
2781 continue;
2782
2783 ret = iommu_domain_identity_map(si_domain, start, end);
2784 if (ret)
2785 return ret;
2786 }
2787 }
2788
2789 return 0;
2790}
2791
2792static int identity_mapping(struct device *dev)
2793{
2794 struct device_domain_info *info;
2795
2796 info = dev->archdata.iommu;
2797 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2798 return (info->domain == si_domain);
2799
2800 return 0;
2801}
2802
2803static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2804{
2805 struct dmar_domain *ndomain;
2806 struct intel_iommu *iommu;
2807 u8 bus, devfn;
2808
2809 iommu = device_to_iommu(dev, &bus, &devfn);
2810 if (!iommu)
2811 return -ENODEV;
2812
2813 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2814 if (ndomain != domain)
2815 return -EBUSY;
2816
2817 return 0;
2818}
2819
2820static bool device_has_rmrr(struct device *dev)
2821{
2822 struct dmar_rmrr_unit *rmrr;
2823 struct device *tmp;
2824 int i;
2825
2826 rcu_read_lock();
2827 for_each_rmrr_units(rmrr) {
2828 /*
2829 * Return TRUE if this RMRR contains the device that
2830 * is passed in.
2831 */
2832 for_each_active_dev_scope(rmrr->devices,
2833 rmrr->devices_cnt, i, tmp)
2834 if (tmp == dev ||
2835 is_downstream_to_pci_bridge(dev, tmp)) {
2836 rcu_read_unlock();
2837 return true;
2838 }
2839 }
2840 rcu_read_unlock();
2841 return false;
2842}
2843
2844/**
2845 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2846 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2847 * @dev: device handle
2848 *
2849 * We assume that PCI USB devices with RMRRs have them largely
2850 * for historical reasons and that the RMRR space is not actively used post
2851 * boot. This exclusion may change if vendors begin to abuse it.
2852 *
2853 * The same exception is made for graphics devices, with the requirement that
2854 * any use of the RMRR regions will be torn down before assigning the device
2855 * to a guest.
2856 *
2857 * Return: true if the RMRR is relaxable, false otherwise
2858 */
2859static bool device_rmrr_is_relaxable(struct device *dev)
2860{
2861 struct pci_dev *pdev;
2862
2863 if (!dev_is_pci(dev))
2864 return false;
2865
2866 pdev = to_pci_dev(dev);
2867 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2868 return true;
2869 else
2870 return false;
2871}
2872
2873/*
2874 * There are a couple cases where we need to restrict the functionality of
2875 * devices associated with RMRRs. The first is when evaluating a device for
2876 * identity mapping because problems exist when devices are moved in and out
2877 * of domains and their respective RMRR information is lost. This means that
2878 * a device with associated RMRRs will never be in a "passthrough" domain.
2879 * The second is use of the device through the IOMMU API. This interface
2880 * expects to have full control of the IOVA space for the device. We cannot
2881 * satisfy both the requirement that RMRR access is maintained and have an
2882 * unencumbered IOVA space. We also have no ability to quiesce the device's
2883 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2884 * We therefore prevent devices associated with an RMRR from participating in
2885 * the IOMMU API, which eliminates them from device assignment.
2886 *
2887 * In both cases, devices which have relaxable RMRRs are not concerned by this
2888 * restriction. See device_rmrr_is_relaxable comment.
2889 */
2890static bool device_is_rmrr_locked(struct device *dev)
2891{
2892 if (!device_has_rmrr(dev))
2893 return false;
2894
2895 if (device_rmrr_is_relaxable(dev))
2896 return false;
2897
2898 return true;
2899}
2900
2901/*
2902 * Return the required default domain type for a specific device.
2903 *
2904 * @dev: the device in question
2906 *
2907 * Returns:
2908 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2909 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2910 * - 0: both identity and dynamic domains work for this device
2911 */
2912static int device_def_domain_type(struct device *dev)
2913{
2914 if (dev_is_pci(dev)) {
2915 struct pci_dev *pdev = to_pci_dev(dev);
2916
2917 if (device_is_rmrr_locked(dev))
2918 return IOMMU_DOMAIN_DMA;
2919
2920 /*
2921 * Prevent any device marked as untrusted from getting
2922		 * placed into the static identity mapping domain.
2923 */
2924 if (pdev->untrusted)
2925 return IOMMU_DOMAIN_DMA;
2926
2927 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2928 return IOMMU_DOMAIN_IDENTITY;
2929
2930 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2931 return IOMMU_DOMAIN_IDENTITY;
2932
2933 /*
2934 * We want to start off with all devices in the 1:1 domain, and
2935 * take them out later if we find they can't access all of memory.
2936 *
2937 * However, we can't do this for PCI devices behind bridges,
2938 * because all PCI devices behind the same bridge will end up
2939 * with the same source-id on their transactions.
2940 *
2941 * Practically speaking, we can't change things around for these
2942 * devices at run-time, because we can't be sure there'll be no
2943 * DMA transactions in flight for any of their siblings.
2944 *
2945 * So PCI devices (unless they're on the root bus) as well as
2946 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2947 * the 1:1 domain, just in _case_ one of their siblings turns out
2948 * not to be able to map all of memory.
2949 */
2950 if (!pci_is_pcie(pdev)) {
2951 if (!pci_is_root_bus(pdev->bus))
2952 return IOMMU_DOMAIN_DMA;
2953 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2954 return IOMMU_DOMAIN_DMA;
2955 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2956 return IOMMU_DOMAIN_DMA;
2957 } else {
2958 if (device_has_rmrr(dev))
2959 return IOMMU_DOMAIN_DMA;
2960 }
2961
2962 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2963 IOMMU_DOMAIN_IDENTITY : 0;
2964}
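
/*
 * Illustrative examples (not part of the driver) of how the policy above
 * plays out:
 *
 *	- an integrated graphics device with IDENTMAP_GFX set in
 *	  iommu_identity_mapping gets IOMMU_DOMAIN_IDENTITY;
 *	- a conventional PCI device behind a PCI-PCI bridge gets
 *	  IOMMU_DOMAIN_DMA, since it shares a source-id with its siblings;
 *	- an ordinary trusted PCIe endpoint without RMRRs gets
 *	  IOMMU_DOMAIN_IDENTITY only if IDENTMAP_ALL is set, otherwise 0
 *	  and the global default decides.
 */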
2965
2966static void intel_iommu_init_qi(struct intel_iommu *iommu)
2967{
2968 /*
2969	 * Start from a sane iommu hardware state.
2970	 * If queued invalidation was already initialized by us
2971	 * (for example, while enabling interrupt-remapping), then
2972	 * things are already rolling from a sane state.
2973 */
2974 if (!iommu->qi) {
2975 /*
2976 * Clear any previous faults.
2977 */
2978 dmar_fault(-1, iommu);
2979 /*
2980 * Disable queued invalidation if supported and already enabled
2981 * before OS handover.
2982 */
2983 dmar_disable_qi(iommu);
2984 }
2985
2986 if (dmar_enable_qi(iommu)) {
2987 /*
2988 * Queued Invalidate not enabled, use Register Based Invalidate
2989 */
2990 iommu->flush.flush_context = __iommu_flush_context;
2991 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2992 pr_info("%s: Using Register based invalidation\n",
2993 iommu->name);
2994 } else {
2995 iommu->flush.flush_context = qi_flush_context;
2996 iommu->flush.flush_iotlb = qi_flush_iotlb;
2997 pr_info("%s: Using Queued invalidation\n", iommu->name);
2998 }
2999}
3000
3001static int copy_context_table(struct intel_iommu *iommu,
3002 struct root_entry *old_re,
3003 struct context_entry **tbl,
3004 int bus, bool ext)
3005{
3006 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3007 struct context_entry *new_ce = NULL, ce;
3008 struct context_entry *old_ce = NULL;
3009 struct root_entry re;
3010 phys_addr_t old_ce_phys;
3011
3012 tbl_idx = ext ? bus * 2 : bus;
3013 memcpy(&re, old_re, sizeof(re));
3014
3015 for (devfn = 0; devfn < 256; devfn++) {
3016 /* First calculate the correct index */
3017 idx = (ext ? devfn * 2 : devfn) % 256;
3018
3019 if (idx == 0) {
3020 /* First save what we may have and clean up */
3021 if (new_ce) {
3022 tbl[tbl_idx] = new_ce;
3023 __iommu_flush_cache(iommu, new_ce,
3024 VTD_PAGE_SIZE);
3025 pos = 1;
3026 }
3027
3028 if (old_ce)
3029 memunmap(old_ce);
3030
3031 ret = 0;
3032 if (devfn < 0x80)
3033 old_ce_phys = root_entry_lctp(&re);
3034 else
3035 old_ce_phys = root_entry_uctp(&re);
3036
3037 if (!old_ce_phys) {
3038 if (ext && devfn == 0) {
3039 /* No LCTP, try UCTP */
3040 devfn = 0x7f;
3041 continue;
3042 } else {
3043 goto out;
3044 }
3045 }
3046
3047 ret = -ENOMEM;
3048 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3049 MEMREMAP_WB);
3050 if (!old_ce)
3051 goto out;
3052
3053 new_ce = alloc_pgtable_page(iommu->node);
3054 if (!new_ce)
3055 goto out_unmap;
3056
3057 ret = 0;
3058 }
3059
3060 /* Now copy the context entry */
3061 memcpy(&ce, old_ce + idx, sizeof(ce));
3062
3063 if (!__context_present(&ce))
3064 continue;
3065
3066 did = context_domain_id(&ce);
3067 if (did >= 0 && did < cap_ndoms(iommu->cap))
3068 set_bit(did, iommu->domain_ids);
3069
3070 /*
3071 * We need a marker for copied context entries. This
3072 * marker needs to work for the old format as well as
3073 * for extended context entries.
3074 *
3075 * Bit 67 of the context entry is used. In the old
3076 * format this bit is available to software, in the
3077 * extended format it is the PGE bit, but PGE is ignored
3078 * by HW if PASIDs are disabled (and thus still
3079 * available).
3080 *
3081 * So disable PASIDs first and then mark the entry
3082 * copied. This means that we don't copy PASID
3083 * translations from the old kernel, but this is fine as
3084 * faults there are not fatal.
3085 */
3086 context_clear_pasid_enable(&ce);
3087 context_set_copied(&ce);
3088
3089 new_ce[idx] = ce;
3090 }
3091
3092 tbl[tbl_idx + pos] = new_ce;
3093
3094 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3095
3096out_unmap:
3097 memunmap(old_ce);
3098
3099out:
3100 return ret;
3101}
3102
3103static int copy_translation_tables(struct intel_iommu *iommu)
3104{
3105 struct context_entry **ctxt_tbls;
3106 struct root_entry *old_rt;
3107 phys_addr_t old_rt_phys;
3108 int ctxt_table_entries;
3109 unsigned long flags;
3110 u64 rtaddr_reg;
3111 int bus, ret;
3112 bool new_ext, ext;
3113
3114 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3115 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3116 new_ext = !!ecap_ecs(iommu->ecap);
3117
3118 /*
3119 * The RTT bit can only be changed when translation is disabled,
3120 * but disabling translation means to open a window for data
3121 * corruption. So bail out and don't copy anything if we would
3122 * have to change the bit.
3123 */
3124 if (new_ext != ext)
3125 return -EINVAL;
3126
3127 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3128 if (!old_rt_phys)
3129 return -EINVAL;
3130
3131 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3132 if (!old_rt)
3133 return -ENOMEM;
3134
3135 /* This is too big for the stack - allocate it from slab */
3136 ctxt_table_entries = ext ? 512 : 256;
3137 ret = -ENOMEM;
3138 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3139 if (!ctxt_tbls)
3140 goto out_unmap;
3141
3142 for (bus = 0; bus < 256; bus++) {
3143 ret = copy_context_table(iommu, &old_rt[bus],
3144 ctxt_tbls, bus, ext);
3145 if (ret) {
3146 pr_err("%s: Failed to copy context table for bus %d\n",
3147 iommu->name, bus);
3148 continue;
3149 }
3150 }
3151
3152 spin_lock_irqsave(&iommu->lock, flags);
3153
3154 /* Context tables are copied, now write them to the root_entry table */
3155 for (bus = 0; bus < 256; bus++) {
3156 int idx = ext ? bus * 2 : bus;
3157 u64 val;
3158
3159 if (ctxt_tbls[idx]) {
3160 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3161 iommu->root_entry[bus].lo = val;
3162 }
3163
3164 if (!ext || !ctxt_tbls[idx + 1])
3165 continue;
3166
3167 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3168 iommu->root_entry[bus].hi = val;
3169 }
3170
3171 spin_unlock_irqrestore(&iommu->lock, flags);
3172
3173 kfree(ctxt_tbls);
3174
3175 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3176
3177 ret = 0;
3178
3179out_unmap:
3180 memunmap(old_rt);
3181
3182 return ret;
3183}
3184
3185static int __init init_dmars(void)
3186{
3187 struct dmar_drhd_unit *drhd;
3188 struct intel_iommu *iommu;
3189 int ret;
3190
3191 /*
3192 * for each drhd
3193 * allocate root
3194 * initialize and program root entry to not present
3195 * endfor
3196 */
3197 for_each_drhd_unit(drhd) {
3198 /*
3199		 * A lock is not needed as this is only incremented in the
3200		 * single-threaded kernel __init code path; all other accesses
3201		 * are read-only.
3202 */
3203 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3204 g_num_of_iommus++;
3205 continue;
3206 }
3207 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3208 }
3209
3210 /* Preallocate enough resources for IOMMU hot-addition */
3211 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3212 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3213
3214 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3215 GFP_KERNEL);
3216 if (!g_iommus) {
3217 pr_err("Allocating global iommu array failed\n");
3218 ret = -ENOMEM;
3219 goto error;
3220 }
3221
3222 for_each_iommu(iommu, drhd) {
3223 if (drhd->ignored) {
3224 iommu_disable_translation(iommu);
3225 continue;
3226 }
3227
3228 /*
3229		 * Find the max PASID size of all IOMMUs in the system.
3230		 * We need to ensure the system PASID table is no bigger
3231		 * than the smallest supported size.
3232 */
3233 if (pasid_supported(iommu)) {
3234 u32 temp = 2 << ecap_pss(iommu->ecap);
3235
3236 intel_pasid_max_id = min_t(u32, temp,
3237 intel_pasid_max_id);
3238 }
3239
3240 g_iommus[iommu->seq_id] = iommu;
3241
3242 intel_iommu_init_qi(iommu);
3243
3244 ret = iommu_init_domains(iommu);
3245 if (ret)
3246 goto free_iommu;
3247
3248 init_translation_status(iommu);
3249
3250 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3251 iommu_disable_translation(iommu);
3252 clear_translation_pre_enabled(iommu);
3253 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3254 iommu->name);
3255 }
3256
3257 /*
3258 * TBD:
3259 * we could share the same root & context tables
3260		 * among all IOMMUs. Needs to be split out later.
3261 */
3262 ret = iommu_alloc_root_entry(iommu);
3263 if (ret)
3264 goto free_iommu;
3265
3266 if (translation_pre_enabled(iommu)) {
3267 pr_info("Translation already enabled - trying to copy translation structures\n");
3268
3269 ret = copy_translation_tables(iommu);
3270 if (ret) {
3271 /*
3272 * We found the IOMMU with translation
3273 * enabled - but failed to copy over the
3274 * old root-entry table. Try to proceed
3275 * by disabling translation now and
3276 * allocating a clean root-entry table.
3277 * This might cause DMAR faults, but
3278 * probably the dump will still succeed.
3279 */
3280 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3281 iommu->name);
3282 iommu_disable_translation(iommu);
3283 clear_translation_pre_enabled(iommu);
3284 } else {
3285 pr_info("Copied translation tables from previous kernel for %s\n",
3286 iommu->name);
3287 }
3288 }
3289
3290 if (!ecap_pass_through(iommu->ecap))
3291 hw_pass_through = 0;
3292#ifdef CONFIG_INTEL_IOMMU_SVM
3293 if (pasid_supported(iommu))
3294 intel_svm_init(iommu);
3295#endif
3296 }
3297
3298 /*
3299 * Now that qi is enabled on all iommus, set the root entry and flush
3300 * caches. This is required on some Intel X58 chipsets, otherwise the
3301 * flush_context function will loop forever and the boot hangs.
3302 */
3303 for_each_active_iommu(iommu, drhd) {
3304 iommu_flush_write_buffer(iommu);
3305 iommu_set_root_entry(iommu);
3306 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3307 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3308 }
3309
3310 if (iommu_default_passthrough())
3311 iommu_identity_mapping |= IDENTMAP_ALL;
3312
3313#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3314 dmar_map_gfx = 0;
3315#endif
3316
3317 if (!dmar_map_gfx)
3318 iommu_identity_mapping |= IDENTMAP_GFX;
3319
3320 check_tylersburg_isoch();
3321
3322 ret = si_domain_init(hw_pass_through);
3323 if (ret)
3324 goto free_iommu;
3325
3326 /*
3327 * for each drhd
3328 * enable fault log
3329 * global invalidate context cache
3330 * global invalidate iotlb
3331 * enable translation
3332 */
3333 for_each_iommu(iommu, drhd) {
3334 if (drhd->ignored) {
3335 /*
3336 * we always have to disable PMRs or DMA may fail on
3337 * this device
3338 */
3339 if (force_on)
3340 iommu_disable_protect_mem_regions(iommu);
3341 continue;
3342 }
3343
3344 iommu_flush_write_buffer(iommu);
3345
3346#ifdef CONFIG_INTEL_IOMMU_SVM
3347 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3348 /*
3349			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3350			 * could cause a lock race condition, so drop the lock here.
3351 */
3352 up_write(&dmar_global_lock);
3353 ret = intel_svm_enable_prq(iommu);
3354 down_write(&dmar_global_lock);
3355 if (ret)
3356 goto free_iommu;
3357 }
3358#endif
3359 ret = dmar_set_interrupt(iommu);
3360 if (ret)
3361 goto free_iommu;
3362 }
3363
3364 return 0;
3365
3366free_iommu:
3367 for_each_active_iommu(iommu, drhd) {
3368 disable_dmar_iommu(iommu);
3369 free_dmar_iommu(iommu);
3370 }
3371
3372 kfree(g_iommus);
3373
3374error:
3375 return ret;
3376}
3377
3378/* This takes a number of _MM_ pages, not VTD pages */
3379static unsigned long intel_alloc_iova(struct device *dev,
3380 struct dmar_domain *domain,
3381 unsigned long nrpages, uint64_t dma_mask)
3382{
3383 unsigned long iova_pfn;
3384
3385 /* Restrict dma_mask to the width that the iommu can handle */
3386 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3387 /* Ensure we reserve the whole size-aligned region */
3388 nrpages = __roundup_pow_of_two(nrpages);
3389
3390 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3391 /*
3392		 * First try to allocate an io virtual address below
3393		 * DMA_BIT_MASK(32); if that fails, then try allocating
3394		 * from the higher range.
3395 */
3396 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3397 IOVA_PFN(DMA_BIT_MASK(32)), false);
3398 if (iova_pfn)
3399 return iova_pfn;
3400 }
3401 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3402 IOVA_PFN(dma_mask), true);
3403 if (unlikely(!iova_pfn)) {
3404 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3405 return 0;
3406 }
3407
3408 return iova_pfn;
3409}
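
/*
 * Worked example (illustrative only): a request for 5 MM pages from a
 * device with a 64-bit DMA mask,
 *
 *	iova_pfn = intel_alloc_iova(dev, domain, 5, DMA_BIT_MASK(64));
 *
 * first rounds nrpages up to 8 so that the whole size-aligned region is
 * reserved, then (unless dmar_forcedac is set) tries to allocate below
 * 4GiB and only falls back to the full 64-bit range if that fails.
 */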
3410
3411static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3412{
3413 struct dmar_domain *domain, *tmp;
3414 struct dmar_rmrr_unit *rmrr;
3415 struct device *i_dev;
3416 int i, ret;
3417
3418	/* The device shouldn't be attached to any domain yet. */
3419 domain = find_domain(dev);
3420 if (domain)
3421 return NULL;
3422
3423 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3424 if (!domain)
3425 goto out;
3426
3427 /* We have a new domain - setup possible RMRRs for the device */
3428 rcu_read_lock();
3429 for_each_rmrr_units(rmrr) {
3430 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3431 i, i_dev) {
3432 if (i_dev != dev)
3433 continue;
3434
3435 ret = domain_prepare_identity_map(dev, domain,
3436 rmrr->base_address,
3437 rmrr->end_address);
3438 if (ret)
3439 dev_err(dev, "Mapping reserved region failed\n");
3440 }
3441 }
3442 rcu_read_unlock();
3443
3444 tmp = set_domain_for_dev(dev, domain);
3445 if (!tmp || domain != tmp) {
3446 domain_exit(domain);
3447 domain = tmp;
3448 }
3449
3450out:
3451 if (!domain)
3452 dev_err(dev, "Allocating domain failed\n");
3453 else
3454 domain->domain.type = IOMMU_DOMAIN_DMA;
3455
3456 return domain;
3457}
3458
3459/* Check if the device needs to go through the non-identity map/unmap process. */
3460static bool iommu_need_mapping(struct device *dev)
3461{
3462 int ret;
3463
3464 if (iommu_dummy(dev))
3465 return false;
3466
3467 ret = identity_mapping(dev);
3468 if (ret) {
3469 u64 dma_mask = *dev->dma_mask;
3470
3471 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3472 dma_mask = dev->coherent_dma_mask;
3473
3474 if (dma_mask >= dma_direct_get_required_mask(dev))
3475 return false;
3476
3477 /*
3478			 * The 32 bit DMA device is removed from si_domain and
3479			 * falls back to non-identity mapping.
3480 */
3481 dmar_remove_one_dev_info(dev);
3482 ret = iommu_request_dma_domain_for_dev(dev);
3483 if (ret) {
3484 struct iommu_domain *domain;
3485 struct dmar_domain *dmar_domain;
3486
3487 domain = iommu_get_domain_for_dev(dev);
3488 if (domain) {
3489 dmar_domain = to_dmar_domain(domain);
3490 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3491 }
3492 dmar_remove_one_dev_info(dev);
3493 get_private_domain_for_dev(dev);
3494 }
3495
3496 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3497 }
3498
3499 return true;
3500}
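
/*
 * Illustrative example (not part of the driver): a device limited to
 * 32-bit DMA that currently sits in the identity-mapped si_domain on a
 * machine with RAM above 4GiB has a dma_mask smaller than
 * dma_direct_get_required_mask(), so it is moved out of si_domain into a
 * DMA domain and this function returns true. A 64-bit capable
 * identity-mapped device keeps its direct mapping and the function
 * returns false.
 */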
3501
3502static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3503 size_t size, int dir, u64 dma_mask)
3504{
3505 struct dmar_domain *domain;
3506 phys_addr_t start_paddr;
3507 unsigned long iova_pfn;
3508 int prot = 0;
3509 int ret;
3510 struct intel_iommu *iommu;
3511 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3512
3513 BUG_ON(dir == DMA_NONE);
3514
3515 domain = find_domain(dev);
3516 if (!domain)
3517 return DMA_MAPPING_ERROR;
3518
3519 iommu = domain_get_iommu(domain);
3520 size = aligned_nrpages(paddr, size);
3521
3522 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3523 if (!iova_pfn)
3524 goto error;
3525
3526 /*
3527	 * Check if DMAR supports zero-length reads on write-only
3528	 * mappings.
3529 */
3530 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3531 !cap_zlr(iommu->cap))
3532 prot |= DMA_PTE_READ;
3533 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3534 prot |= DMA_PTE_WRITE;
3535 /*
3536	 * The range paddr .. (paddr + size) might cover a partial page, so we
3537	 * should map the whole page. Note: if two parts of one page are
3538	 * separately mapped, we might have two guest addresses mapping to the
3539	 * same host paddr, but this is not a big problem.
3540 */
3541 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3542 mm_to_dma_pfn(paddr_pfn), size, prot);
3543 if (ret)
3544 goto error;
3545
3546 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3547 start_paddr += paddr & ~PAGE_MASK;
3548
3549 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3550
3551 return start_paddr;
3552
3553error:
3554 if (iova_pfn)
3555 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3556 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3557 size, (unsigned long long)paddr, dir);
3558 return DMA_MAPPING_ERROR;
3559}
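
/*
 * Worked example (illustrative only): mapping paddr = 0x12345678 with
 * size = 0x2000 for a 64-bit capable device. aligned_nrpages() turns the
 * request into 3 VT-d pages, intel_alloc_iova() reserves a size-aligned
 * power-of-two region (4 pages here) and might hand back, say,
 * iova_pfn = 0xffffc, so the returned handle is
 *
 *	start_paddr = (0xffffc << PAGE_SHIFT) + (0x12345678 & ~PAGE_MASK)
 *		    = 0xffffc000 + 0x678 = 0xffffc678;
 *
 * i.e. the page-granular IOVA plus the original offset within the page.
 */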
3560
3561static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3562 unsigned long offset, size_t size,
3563 enum dma_data_direction dir,
3564 unsigned long attrs)
3565{
3566 if (iommu_need_mapping(dev))
3567 return __intel_map_single(dev, page_to_phys(page) + offset,
3568 size, dir, *dev->dma_mask);
3569 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3570}
3571
3572static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3573 size_t size, enum dma_data_direction dir,
3574 unsigned long attrs)
3575{
3576 if (iommu_need_mapping(dev))
3577 return __intel_map_single(dev, phys_addr, size, dir,
3578 *dev->dma_mask);
3579 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3580}
3581
3582static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3583{
3584 struct dmar_domain *domain;
3585 unsigned long start_pfn, last_pfn;
3586 unsigned long nrpages;
3587 unsigned long iova_pfn;
3588 struct intel_iommu *iommu;
3589 struct page *freelist;
3590 struct pci_dev *pdev = NULL;
3591
3592 domain = find_domain(dev);
3593 BUG_ON(!domain);
3594
3595 iommu = domain_get_iommu(domain);
3596
3597 iova_pfn = IOVA_PFN(dev_addr);
3598
3599 nrpages = aligned_nrpages(dev_addr, size);
3600 start_pfn = mm_to_dma_pfn(iova_pfn);
3601 last_pfn = start_pfn + nrpages - 1;
3602
3603 if (dev_is_pci(dev))
3604 pdev = to_pci_dev(dev);
3605
3606 freelist = domain_unmap(domain, start_pfn, last_pfn);
3607 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3608 !has_iova_flush_queue(&domain->iovad)) {
3609 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3610 nrpages, !freelist, 0);
3611 /* free iova */
3612 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3613 dma_free_pagelist(freelist);
3614 } else {
3615 queue_iova(&domain->iovad, iova_pfn, nrpages,
3616 (unsigned long)freelist);
3617 /*
3618		 * Queue up the release of the unmap to save roughly 1/6 of
3619		 * the CPU time otherwise spent on the IOTLB flush operation.
3620 */
3621 }
3622
3623 trace_unmap_single(dev, dev_addr, size);
3624}
3625
3626static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3627 size_t size, enum dma_data_direction dir,
3628 unsigned long attrs)
3629{
3630 if (iommu_need_mapping(dev))
3631 intel_unmap(dev, dev_addr, size);
3632 else
3633 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3634}
3635
3636static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3637 size_t size, enum dma_data_direction dir, unsigned long attrs)
3638{
3639 if (iommu_need_mapping(dev))
3640 intel_unmap(dev, dev_addr, size);
3641}
3642
3643static void *intel_alloc_coherent(struct device *dev, size_t size,
3644 dma_addr_t *dma_handle, gfp_t flags,
3645 unsigned long attrs)
3646{
3647 struct page *page = NULL;
3648 int order;
3649
3650 if (!iommu_need_mapping(dev))
3651 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3652
3653 size = PAGE_ALIGN(size);
3654 order = get_order(size);
3655
3656 if (gfpflags_allow_blocking(flags)) {
3657 unsigned int count = size >> PAGE_SHIFT;
3658
3659 page = dma_alloc_from_contiguous(dev, count, order,
3660 flags & __GFP_NOWARN);
3661 }
3662
3663 if (!page)
3664 page = alloc_pages(flags, order);
3665 if (!page)
3666 return NULL;
3667 memset(page_address(page), 0, size);
3668
3669 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3670 DMA_BIDIRECTIONAL,
3671 dev->coherent_dma_mask);
3672 if (*dma_handle != DMA_MAPPING_ERROR)
3673 return page_address(page);
3674 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3675 __free_pages(page, order);
3676
3677 return NULL;
3678}
3679
3680static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3681 dma_addr_t dma_handle, unsigned long attrs)
3682{
3683 int order;
3684 struct page *page = virt_to_page(vaddr);
3685
3686 if (!iommu_need_mapping(dev))
3687 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3688
3689 size = PAGE_ALIGN(size);
3690 order = get_order(size);
3691
3692 intel_unmap(dev, dma_handle, size);
3693 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3694 __free_pages(page, order);
3695}
3696
3697static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3698 int nelems, enum dma_data_direction dir,
3699 unsigned long attrs)
3700{
3701 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3702 unsigned long nrpages = 0;
3703 struct scatterlist *sg;
3704 int i;
3705
3706 if (!iommu_need_mapping(dev))
3707 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3708
3709 for_each_sg(sglist, sg, nelems, i) {
3710 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3711 }
3712
3713 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3714
3715 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3716}
3717
3718static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3719 enum dma_data_direction dir, unsigned long attrs)
3720{
3721 int i;
3722 struct dmar_domain *domain;
3723 size_t size = 0;
3724 int prot = 0;
3725 unsigned long iova_pfn;
3726 int ret;
3727 struct scatterlist *sg;
3728 unsigned long start_vpfn;
3729 struct intel_iommu *iommu;
3730
3731 BUG_ON(dir == DMA_NONE);
3732 if (!iommu_need_mapping(dev))
3733 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3734
3735 domain = find_domain(dev);
3736 if (!domain)
3737 return 0;
3738
3739 iommu = domain_get_iommu(domain);
3740
3741 for_each_sg(sglist, sg, nelems, i)
3742 size += aligned_nrpages(sg->offset, sg->length);
3743
3744 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3745 *dev->dma_mask);
3746 if (!iova_pfn) {
3747 sglist->dma_length = 0;
3748 return 0;
3749 }
3750
3751 /*
3752	 * Check if DMAR supports zero-length reads on write-only
3753	 * mappings.
3754 */
3755 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3756 !cap_zlr(iommu->cap))
3757 prot |= DMA_PTE_READ;
3758 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3759 prot |= DMA_PTE_WRITE;
3760
3761 start_vpfn = mm_to_dma_pfn(iova_pfn);
3762
3763 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3764 if (unlikely(ret)) {
3765 dma_pte_free_pagetable(domain, start_vpfn,
3766 start_vpfn + size - 1,
3767 agaw_to_level(domain->agaw) + 1);
3768 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3769 return 0;
3770 }
3771
3772 trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3773 sg_phys(sglist), size << VTD_PAGE_SHIFT);
3774
3775 return nelems;
3776}
3777
3778static u64 intel_get_required_mask(struct device *dev)
3779{
3780 if (!iommu_need_mapping(dev))
3781 return dma_direct_get_required_mask(dev);
3782 return DMA_BIT_MASK(32);
3783}
3784
3785static const struct dma_map_ops intel_dma_ops = {
3786 .alloc = intel_alloc_coherent,
3787 .free = intel_free_coherent,
3788 .map_sg = intel_map_sg,
3789 .unmap_sg = intel_unmap_sg,
3790 .map_page = intel_map_page,
3791 .unmap_page = intel_unmap_page,
3792 .map_resource = intel_map_resource,
3793 .unmap_resource = intel_unmap_resource,
3794 .dma_supported = dma_direct_supported,
3795 .mmap = dma_common_mmap,
3796 .get_sgtable = dma_common_get_sgtable,
3797 .get_required_mask = intel_get_required_mask,
3798};
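
/*
 * Illustrative note (not part of the driver): once a device's dma_ops
 * point at intel_dma_ops, ordinary DMA API calls from a driver are routed
 * through the functions above, e.g.
 *
 *	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *
 * ends up in intel_map_page(), which either sets up an IOMMU mapping via
 * __intel_map_single() or, for identity-mapped devices, falls through to
 * dma_direct_map_page().
 */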
3799
3800static void
3801bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3802 enum dma_data_direction dir, enum dma_sync_target target)
3803{
3804 struct dmar_domain *domain;
3805 phys_addr_t tlb_addr;
3806
3807 domain = find_domain(dev);
3808 if (WARN_ON(!domain))
3809 return;
3810
3811 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3812 if (is_swiotlb_buffer(tlb_addr))
3813 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3814}
3815
3816static dma_addr_t
3817bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3818 enum dma_data_direction dir, unsigned long attrs,
3819 u64 dma_mask)
3820{
3821 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3822 struct dmar_domain *domain;
3823 struct intel_iommu *iommu;
3824 unsigned long iova_pfn;
3825 unsigned long nrpages;
3826 phys_addr_t tlb_addr;
3827 int prot = 0;
3828 int ret;
3829
3830 domain = find_domain(dev);
3831 if (WARN_ON(dir == DMA_NONE || !domain))
3832 return DMA_MAPPING_ERROR;
3833
3834 iommu = domain_get_iommu(domain);
3835 if (WARN_ON(!iommu))
3836 return DMA_MAPPING_ERROR;
3837
3838 nrpages = aligned_nrpages(0, size);
3839 iova_pfn = intel_alloc_iova(dev, domain,
3840 dma_to_mm_pfn(nrpages), dma_mask);
3841 if (!iova_pfn)
3842 return DMA_MAPPING_ERROR;
3843
3844 /*
3845	 * Check if DMAR supports zero-length reads on write-only
3846	 * mappings.
3847 */
3848 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3849 !cap_zlr(iommu->cap))
3850 prot |= DMA_PTE_READ;
3851 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3852 prot |= DMA_PTE_WRITE;
3853
3854 /*
3855 * If both the physical buffer start address and size are
3856 * page aligned, we don't need to use a bounce page.
3857 */
3858 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3859 tlb_addr = swiotlb_tbl_map_single(dev,
3860 __phys_to_dma(dev, io_tlb_start),
3861 paddr, size, aligned_size, dir, attrs);
3862 if (tlb_addr == DMA_MAPPING_ERROR) {
3863 goto swiotlb_error;
3864 } else {
3865 /* Cleanup the padding area. */
3866 void *padding_start = phys_to_virt(tlb_addr);
3867 size_t padding_size = aligned_size;
3868
3869 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3870 (dir == DMA_TO_DEVICE ||
3871 dir == DMA_BIDIRECTIONAL)) {
3872 padding_start += size;
3873 padding_size -= size;
3874 }
3875
3876 memset(padding_start, 0, padding_size);
3877 }
3878 } else {
3879 tlb_addr = paddr;
3880 }
3881
3882 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3883 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3884 if (ret)
3885 goto mapping_error;
3886
3887 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3888
3889 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3890
3891mapping_error:
3892 if (is_swiotlb_buffer(tlb_addr))
3893 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3894 aligned_size, dir, attrs);
3895swiotlb_error:
3896 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3897 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3898 size, (unsigned long long)paddr, dir);
3899
3900 return DMA_MAPPING_ERROR;
3901}
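
/*
 * Worked example (illustrative only): an (untrusted) device mapping a
 * 0x1200-byte buffer at a non-page-aligned paddr goes through swiotlb:
 * aligned_size = 0x2000, a bounce slot is grabbed, the 0xe00 bytes of
 * padding beyond the copied data are zeroed for DMA_TO_DEVICE, and the
 * IOVA handed back to the caller translates to the bounce buffer rather
 * than to the original pages.
 */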
3902
3903static void
3904bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3905 enum dma_data_direction dir, unsigned long attrs)
3906{
3907 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3908 struct dmar_domain *domain;
3909 phys_addr_t tlb_addr;
3910
3911 domain = find_domain(dev);
3912 if (WARN_ON(!domain))
3913 return;
3914
3915 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3916 if (WARN_ON(!tlb_addr))
3917 return;
3918
3919 intel_unmap(dev, dev_addr, size);
3920 if (is_swiotlb_buffer(tlb_addr))
3921 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3922 aligned_size, dir, attrs);
3923
3924 trace_bounce_unmap_single(dev, dev_addr, size);
3925}
3926
3927static dma_addr_t
3928bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3929 size_t size, enum dma_data_direction dir, unsigned long attrs)
3930{
3931 return bounce_map_single(dev, page_to_phys(page) + offset,
3932 size, dir, attrs, *dev->dma_mask);
3933}
3934
3935static dma_addr_t
3936bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3937 enum dma_data_direction dir, unsigned long attrs)
3938{
3939 return bounce_map_single(dev, phys_addr, size,
3940 dir, attrs, *dev->dma_mask);
3941}
3942
3943static void
3944bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3945 enum dma_data_direction dir, unsigned long attrs)
3946{
3947 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3948}
3949
3950static void
3951bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3952 enum dma_data_direction dir, unsigned long attrs)
3953{
3954 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3955}
3956
3957static void
3958bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3959 enum dma_data_direction dir, unsigned long attrs)
3960{
3961 struct scatterlist *sg;
3962 int i;
3963
3964 for_each_sg(sglist, sg, nelems, i)
3965 bounce_unmap_page(dev, sg->dma_address,
3966 sg_dma_len(sg), dir, attrs);
3967}
3968
3969static int
3970bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3971 enum dma_data_direction dir, unsigned long attrs)
3972{
3973 int i;
3974 struct scatterlist *sg;
3975
3976 for_each_sg(sglist, sg, nelems, i) {
3977 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3978 sg->offset, sg->length,
3979 dir, attrs);
3980 if (sg->dma_address == DMA_MAPPING_ERROR)
3981 goto out_unmap;
3982 sg_dma_len(sg) = sg->length;
3983 }
3984
3985 return nelems;
3986
3987out_unmap:
3988 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3989 return 0;
3990}
3991
3992static void
3993bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3994 size_t size, enum dma_data_direction dir)
3995{
3996 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3997}
3998
3999static void
4000bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4001 size_t size, enum dma_data_direction dir)
4002{
4003 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4004}
4005
4006static void
4007bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4008 int nelems, enum dma_data_direction dir)
4009{
4010 struct scatterlist *sg;
4011 int i;
4012
4013 for_each_sg(sglist, sg, nelems, i)
4014 bounce_sync_single(dev, sg_dma_address(sg),
4015 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4016}
4017
4018static void
4019bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4020 int nelems, enum dma_data_direction dir)
4021{
4022 struct scatterlist *sg;
4023 int i;
4024
4025 for_each_sg(sglist, sg, nelems, i)
4026 bounce_sync_single(dev, sg_dma_address(sg),
4027 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4028}
4029
4030static const struct dma_map_ops bounce_dma_ops = {
4031 .alloc = intel_alloc_coherent,
4032 .free = intel_free_coherent,
4033 .map_sg = bounce_map_sg,
4034 .unmap_sg = bounce_unmap_sg,
4035 .map_page = bounce_map_page,
4036 .unmap_page = bounce_unmap_page,
4037 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4038 .sync_single_for_device = bounce_sync_single_for_device,
4039 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4040 .sync_sg_for_device = bounce_sync_sg_for_device,
4041 .map_resource = bounce_map_resource,
4042 .unmap_resource = bounce_unmap_resource,
4043 .dma_supported = dma_direct_supported,
4044};
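
/*
 * Minimal usage sketch (illustrative only, not part of this driver): a
 * driver for an untrusted device keeps using the generic DMA API. Once
 * intel_iommu_add_device() has installed bounce_dma_ops for the device,
 * a mapping whose start or size is not VTD_PAGE_SIZE aligned is routed
 * through bounce_map_page() and the swiotlb slot handled above. The
 * helper name below is hypothetical.
 */
static inline dma_addr_t example_map_rx_buffer(struct device *dev,
					       struct page *page,
					       unsigned long offset,
					       size_t len)
{
	/* Ends up in bounce_map_page() when bounce_dma_ops is in use. */
	dma_addr_t handle = dma_map_page(dev, page, offset, len,
					 DMA_FROM_DEVICE);

	if (dma_mapping_error(dev, handle))
		return DMA_MAPPING_ERROR;

	return handle;
}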
4045
4046static inline int iommu_domain_cache_init(void)
4047{
4048 int ret = 0;
4049
4050 iommu_domain_cache = kmem_cache_create("iommu_domain",
4051 sizeof(struct dmar_domain),
4052 0,
4053 SLAB_HWCACHE_ALIGN,
4054
4055 NULL);
4056 if (!iommu_domain_cache) {
4057 pr_err("Couldn't create iommu_domain cache\n");
4058 ret = -ENOMEM;
4059 }
4060
4061 return ret;
4062}
4063
4064static inline int iommu_devinfo_cache_init(void)
4065{
4066 int ret = 0;
4067
4068 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4069 sizeof(struct device_domain_info),
4070 0,
4071 SLAB_HWCACHE_ALIGN,
4072 NULL);
4073 if (!iommu_devinfo_cache) {
4074 pr_err("Couldn't create devinfo cache\n");
4075 ret = -ENOMEM;
4076 }
4077
4078 return ret;
4079}
4080
4081static int __init iommu_init_mempool(void)
4082{
4083 int ret;
4084 ret = iova_cache_get();
4085 if (ret)
4086 return ret;
4087
4088 ret = iommu_domain_cache_init();
4089 if (ret)
4090 goto domain_error;
4091
4092 ret = iommu_devinfo_cache_init();
4093 if (!ret)
4094 return ret;
4095
4096 kmem_cache_destroy(iommu_domain_cache);
4097domain_error:
4098 iova_cache_put();
4099
4100 return -ENOMEM;
4101}
4102
4103static void __init iommu_exit_mempool(void)
4104{
4105 kmem_cache_destroy(iommu_devinfo_cache);
4106 kmem_cache_destroy(iommu_domain_cache);
4107 iova_cache_put();
4108}
4109
4110static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4111{
4112 struct dmar_drhd_unit *drhd;
4113 u32 vtbar;
4114 int rc;
4115
4116 /* We know that this device on this chipset has its own IOMMU.
4117 * If we find it under a different IOMMU, then the BIOS is lying
4118 * to us. Hope that the IOMMU for this device is actually
4119 * disabled, and it needs no translation...
4120 */
4121 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4122 if (rc) {
4123 /* "can't" happen */
4124 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4125 return;
4126 }
4127 vtbar &= 0xffff0000;
4128
 4129	/* we know that this IOMMU should be at offset 0xa000 from vtbar */
4130 drhd = dmar_find_matched_drhd_unit(pdev);
4131 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4132 TAINT_FIRMWARE_WORKAROUND,
4133 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4134 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4135}
4136DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4137
4138static void __init init_no_remapping_devices(void)
4139{
4140 struct dmar_drhd_unit *drhd;
4141 struct device *dev;
4142 int i;
4143
4144 for_each_drhd_unit(drhd) {
4145 if (!drhd->include_all) {
4146 for_each_active_dev_scope(drhd->devices,
4147 drhd->devices_cnt, i, dev)
4148 break;
4149 /* ignore DMAR unit if no devices exist */
4150 if (i == drhd->devices_cnt)
4151 drhd->ignored = 1;
4152 }
4153 }
4154
4155 for_each_active_drhd_unit(drhd) {
4156 if (drhd->include_all)
4157 continue;
4158
4159 for_each_active_dev_scope(drhd->devices,
4160 drhd->devices_cnt, i, dev)
4161 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4162 break;
4163 if (i < drhd->devices_cnt)
4164 continue;
4165
4166 /* This IOMMU has *only* gfx devices. Either bypass it or
4167 set the gfx_mapped flag, as appropriate */
4168 if (!dmar_map_gfx) {
4169 drhd->ignored = 1;
4170 for_each_active_dev_scope(drhd->devices,
4171 drhd->devices_cnt, i, dev)
4172 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4173 }
4174 }
4175}
4176
4177#ifdef CONFIG_SUSPEND
4178static int init_iommu_hw(void)
4179{
4180 struct dmar_drhd_unit *drhd;
4181 struct intel_iommu *iommu = NULL;
4182
4183 for_each_active_iommu(iommu, drhd)
4184 if (iommu->qi)
4185 dmar_reenable_qi(iommu);
4186
4187 for_each_iommu(iommu, drhd) {
4188 if (drhd->ignored) {
4189 /*
4190 * we always have to disable PMRs or DMA may fail on
4191 * this device
4192 */
4193 if (force_on)
4194 iommu_disable_protect_mem_regions(iommu);
4195 continue;
4196 }
4197
4198 iommu_flush_write_buffer(iommu);
4199
4200 iommu_set_root_entry(iommu);
4201
4202 iommu->flush.flush_context(iommu, 0, 0, 0,
4203 DMA_CCMD_GLOBAL_INVL);
4204 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4205 iommu_enable_translation(iommu);
4206 iommu_disable_protect_mem_regions(iommu);
4207 }
4208
4209 return 0;
4210}
4211
4212static void iommu_flush_all(void)
4213{
4214 struct dmar_drhd_unit *drhd;
4215 struct intel_iommu *iommu;
4216
4217 for_each_active_iommu(iommu, drhd) {
4218 iommu->flush.flush_context(iommu, 0, 0, 0,
4219 DMA_CCMD_GLOBAL_INVL);
4220 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4221 DMA_TLB_GLOBAL_FLUSH);
4222 }
4223}
4224
4225static int iommu_suspend(void)
4226{
4227 struct dmar_drhd_unit *drhd;
4228 struct intel_iommu *iommu = NULL;
4229 unsigned long flag;
4230
4231 for_each_active_iommu(iommu, drhd) {
4232 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4233 GFP_ATOMIC);
4234 if (!iommu->iommu_state)
4235 goto nomem;
4236 }
4237
4238 iommu_flush_all();
4239
4240 for_each_active_iommu(iommu, drhd) {
4241 iommu_disable_translation(iommu);
4242
4243 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4244
4245 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4246 readl(iommu->reg + DMAR_FECTL_REG);
4247 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4248 readl(iommu->reg + DMAR_FEDATA_REG);
4249 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4250 readl(iommu->reg + DMAR_FEADDR_REG);
4251 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4252 readl(iommu->reg + DMAR_FEUADDR_REG);
4253
4254 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4255 }
4256 return 0;
4257
4258nomem:
4259 for_each_active_iommu(iommu, drhd)
4260 kfree(iommu->iommu_state);
4261
4262 return -ENOMEM;
4263}
4264
4265static void iommu_resume(void)
4266{
4267 struct dmar_drhd_unit *drhd;
4268 struct intel_iommu *iommu = NULL;
4269 unsigned long flag;
4270
4271 if (init_iommu_hw()) {
4272 if (force_on)
4273 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4274 else
4275 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4276 return;
4277 }
4278
4279 for_each_active_iommu(iommu, drhd) {
4280
4281 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4282
4283 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4284 iommu->reg + DMAR_FECTL_REG);
4285 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4286 iommu->reg + DMAR_FEDATA_REG);
4287 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4288 iommu->reg + DMAR_FEADDR_REG);
4289 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4290 iommu->reg + DMAR_FEUADDR_REG);
4291
4292 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4293 }
4294
4295 for_each_active_iommu(iommu, drhd)
4296 kfree(iommu->iommu_state);
4297}
4298
4299static struct syscore_ops iommu_syscore_ops = {
4300 .resume = iommu_resume,
4301 .suspend = iommu_suspend,
4302};
4303
4304static void __init init_iommu_pm_ops(void)
4305{
4306 register_syscore_ops(&iommu_syscore_ops);
4307}
4308
4309#else
4310static inline void init_iommu_pm_ops(void) {}
 4311#endif /* CONFIG_SUSPEND */
4312
4313int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4314{
4315 struct acpi_dmar_reserved_memory *rmrr;
4316 struct dmar_rmrr_unit *rmrru;
4317
4318 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4319 if (!rmrru)
4320 goto out;
4321
4322 rmrru->hdr = header;
4323 rmrr = (struct acpi_dmar_reserved_memory *)header;
4324 rmrru->base_address = rmrr->base_address;
4325 rmrru->end_address = rmrr->end_address;
4326
4327 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4328 ((void *)rmrr) + rmrr->header.length,
4329 &rmrru->devices_cnt);
4330 if (rmrru->devices_cnt && rmrru->devices == NULL)
4331 goto free_rmrru;
4332
4333 list_add(&rmrru->list, &dmar_rmrr_units);
4334
4335 return 0;
4336free_rmrru:
4337 kfree(rmrru);
4338out:
4339 return -ENOMEM;
4340}
4341
4342static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4343{
4344 struct dmar_atsr_unit *atsru;
4345 struct acpi_dmar_atsr *tmp;
4346
4347 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4348 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4349 if (atsr->segment != tmp->segment)
4350 continue;
4351 if (atsr->header.length != tmp->header.length)
4352 continue;
4353 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4354 return atsru;
4355 }
4356
4357 return NULL;
4358}
4359
4360int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4361{
4362 struct acpi_dmar_atsr *atsr;
4363 struct dmar_atsr_unit *atsru;
4364
4365 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4366 return 0;
4367
4368 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4369 atsru = dmar_find_atsr(atsr);
4370 if (atsru)
4371 return 0;
4372
4373 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4374 if (!atsru)
4375 return -ENOMEM;
4376
4377 /*
4378 * If memory is allocated from slab by ACPI _DSM method, we need to
4379 * copy the memory content because the memory buffer will be freed
4380 * on return.
4381 */
4382 atsru->hdr = (void *)(atsru + 1);
4383 memcpy(atsru->hdr, hdr, hdr->length);
4384 atsru->include_all = atsr->flags & 0x1;
4385 if (!atsru->include_all) {
4386 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4387 (void *)atsr + atsr->header.length,
4388 &atsru->devices_cnt);
4389 if (atsru->devices_cnt && atsru->devices == NULL) {
4390 kfree(atsru);
4391 return -ENOMEM;
4392 }
4393 }
4394
4395 list_add_rcu(&atsru->list, &dmar_atsr_units);
4396
4397 return 0;
4398}
4399
4400static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4401{
4402 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4403 kfree(atsru);
4404}
4405
4406int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4407{
4408 struct acpi_dmar_atsr *atsr;
4409 struct dmar_atsr_unit *atsru;
4410
4411 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4412 atsru = dmar_find_atsr(atsr);
4413 if (atsru) {
4414 list_del_rcu(&atsru->list);
4415 synchronize_rcu();
4416 intel_iommu_free_atsr(atsru);
4417 }
4418
4419 return 0;
4420}
4421
4422int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4423{
4424 int i;
4425 struct device *dev;
4426 struct acpi_dmar_atsr *atsr;
4427 struct dmar_atsr_unit *atsru;
4428
4429 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4430 atsru = dmar_find_atsr(atsr);
4431 if (!atsru)
4432 return 0;
4433
4434 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4435 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4436 i, dev)
4437 return -EBUSY;
4438 }
4439
4440 return 0;
4441}
4442
4443static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4444{
4445 int sp, ret;
4446 struct intel_iommu *iommu = dmaru->iommu;
4447
4448 if (g_iommus[iommu->seq_id])
4449 return 0;
4450
4451 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4452 pr_warn("%s: Doesn't support hardware pass through.\n",
4453 iommu->name);
4454 return -ENXIO;
4455 }
4456 if (!ecap_sc_support(iommu->ecap) &&
4457 domain_update_iommu_snooping(iommu)) {
4458 pr_warn("%s: Doesn't support snooping.\n",
4459 iommu->name);
4460 return -ENXIO;
4461 }
4462 sp = domain_update_iommu_superpage(iommu) - 1;
4463 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4464 pr_warn("%s: Doesn't support large page.\n",
4465 iommu->name);
4466 return -ENXIO;
4467 }
4468
4469 /*
4470 * Disable translation if already enabled prior to OS handover.
4471 */
4472 if (iommu->gcmd & DMA_GCMD_TE)
4473 iommu_disable_translation(iommu);
4474
4475 g_iommus[iommu->seq_id] = iommu;
4476 ret = iommu_init_domains(iommu);
4477 if (ret == 0)
4478 ret = iommu_alloc_root_entry(iommu);
4479 if (ret)
4480 goto out;
4481
4482#ifdef CONFIG_INTEL_IOMMU_SVM
4483 if (pasid_supported(iommu))
4484 intel_svm_init(iommu);
4485#endif
4486
4487 if (dmaru->ignored) {
4488 /*
4489 * we always have to disable PMRs or DMA may fail on this device
4490 */
4491 if (force_on)
4492 iommu_disable_protect_mem_regions(iommu);
4493 return 0;
4494 }
4495
4496 intel_iommu_init_qi(iommu);
4497 iommu_flush_write_buffer(iommu);
4498
4499#ifdef CONFIG_INTEL_IOMMU_SVM
4500 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4501 ret = intel_svm_enable_prq(iommu);
4502 if (ret)
4503 goto disable_iommu;
4504 }
4505#endif
4506 ret = dmar_set_interrupt(iommu);
4507 if (ret)
4508 goto disable_iommu;
4509
4510 iommu_set_root_entry(iommu);
4511 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4512 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4513 iommu_enable_translation(iommu);
4514
4515 iommu_disable_protect_mem_regions(iommu);
4516 return 0;
4517
4518disable_iommu:
4519 disable_dmar_iommu(iommu);
4520out:
4521 free_dmar_iommu(iommu);
4522 return ret;
4523}
4524
4525int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4526{
4527 int ret = 0;
4528 struct intel_iommu *iommu = dmaru->iommu;
4529
4530 if (!intel_iommu_enabled)
4531 return 0;
4532 if (iommu == NULL)
4533 return -EINVAL;
4534
4535 if (insert) {
4536 ret = intel_iommu_add(dmaru);
4537 } else {
4538 disable_dmar_iommu(iommu);
4539 free_dmar_iommu(iommu);
4540 }
4541
4542 return ret;
4543}
4544
4545static void intel_iommu_free_dmars(void)
4546{
4547 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4548 struct dmar_atsr_unit *atsru, *atsr_n;
4549
4550 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4551 list_del(&rmrru->list);
4552 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4553 kfree(rmrru);
4554 }
4555
4556 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4557 list_del(&atsru->list);
4558 intel_iommu_free_atsr(atsru);
4559 }
4560}
4561
4562int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4563{
4564 int i, ret = 1;
4565 struct pci_bus *bus;
4566 struct pci_dev *bridge = NULL;
4567 struct device *tmp;
4568 struct acpi_dmar_atsr *atsr;
4569 struct dmar_atsr_unit *atsru;
4570
4571 dev = pci_physfn(dev);
4572 for (bus = dev->bus; bus; bus = bus->parent) {
4573 bridge = bus->self;
4574 /* If it's an integrated device, allow ATS */
4575 if (!bridge)
4576 return 1;
4577 /* Connected via non-PCIe: no ATS */
4578 if (!pci_is_pcie(bridge) ||
4579 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4580 return 0;
4581 /* If we found the root port, look it up in the ATSR */
4582 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4583 break;
4584 }
4585
4586 rcu_read_lock();
4587 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4588 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4589 if (atsr->segment != pci_domain_nr(dev->bus))
4590 continue;
4591
4592 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4593 if (tmp == &bridge->dev)
4594 goto out;
4595
4596 if (atsru->include_all)
4597 goto out;
4598 }
4599 ret = 0;
4600out:
4601 rcu_read_unlock();
4602
4603 return ret;
4604}
4605
4606int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4607{
4608 int ret;
4609 struct dmar_rmrr_unit *rmrru;
4610 struct dmar_atsr_unit *atsru;
4611 struct acpi_dmar_atsr *atsr;
4612 struct acpi_dmar_reserved_memory *rmrr;
4613
4614 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4615 return 0;
4616
4617 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4618 rmrr = container_of(rmrru->hdr,
4619 struct acpi_dmar_reserved_memory, header);
4620 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4621 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4622 ((void *)rmrr) + rmrr->header.length,
4623 rmrr->segment, rmrru->devices,
4624 rmrru->devices_cnt);
4625 if (ret < 0)
4626 return ret;
4627 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4628 dmar_remove_dev_scope(info, rmrr->segment,
4629 rmrru->devices, rmrru->devices_cnt);
4630 }
4631 }
4632
4633 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4634 if (atsru->include_all)
4635 continue;
4636
4637 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4638 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4639 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4640 (void *)atsr + atsr->header.length,
4641 atsr->segment, atsru->devices,
4642 atsru->devices_cnt);
4643 if (ret > 0)
4644 break;
4645 else if (ret < 0)
4646 return ret;
4647 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4648 if (dmar_remove_dev_scope(info, atsr->segment,
4649 atsru->devices, atsru->devices_cnt))
4650 break;
4651 }
4652 }
4653
4654 return 0;
4655}
4656
4657static int intel_iommu_memory_notifier(struct notifier_block *nb,
4658 unsigned long val, void *v)
4659{
4660 struct memory_notify *mhp = v;
4661 unsigned long long start, end;
4662 unsigned long start_vpfn, last_vpfn;
4663
4664 switch (val) {
4665 case MEM_GOING_ONLINE:
4666 start = mhp->start_pfn << PAGE_SHIFT;
4667 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4668 if (iommu_domain_identity_map(si_domain, start, end)) {
4669 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4670 start, end);
4671 return NOTIFY_BAD;
4672 }
4673 break;
4674
4675 case MEM_OFFLINE:
4676 case MEM_CANCEL_ONLINE:
4677 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4678 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4679 while (start_vpfn <= last_vpfn) {
4680 struct iova *iova;
4681 struct dmar_drhd_unit *drhd;
4682 struct intel_iommu *iommu;
4683 struct page *freelist;
4684
4685 iova = find_iova(&si_domain->iovad, start_vpfn);
4686 if (iova == NULL) {
4687 pr_debug("Failed get IOVA for PFN %lx\n",
4688 start_vpfn);
4689 break;
4690 }
4691
4692 iova = split_and_remove_iova(&si_domain->iovad, iova,
4693 start_vpfn, last_vpfn);
4694 if (iova == NULL) {
4695 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4696 start_vpfn, last_vpfn);
4697 return NOTIFY_BAD;
4698 }
4699
4700 freelist = domain_unmap(si_domain, iova->pfn_lo,
4701 iova->pfn_hi);
4702
4703 rcu_read_lock();
4704 for_each_active_iommu(iommu, drhd)
4705 iommu_flush_iotlb_psi(iommu, si_domain,
4706 iova->pfn_lo, iova_size(iova),
4707 !freelist, 0);
4708 rcu_read_unlock();
4709 dma_free_pagelist(freelist);
4710
4711 start_vpfn = iova->pfn_hi + 1;
4712 free_iova_mem(iova);
4713 }
4714 break;
4715 }
4716
4717 return NOTIFY_OK;
4718}
4719
4720static struct notifier_block intel_iommu_memory_nb = {
4721 .notifier_call = intel_iommu_memory_notifier,
4722 .priority = 0
4723};
4724
4725static void free_all_cpu_cached_iovas(unsigned int cpu)
4726{
4727 int i;
4728
4729 for (i = 0; i < g_num_of_iommus; i++) {
4730 struct intel_iommu *iommu = g_iommus[i];
4731 struct dmar_domain *domain;
4732 int did;
4733
4734 if (!iommu)
4735 continue;
4736
4737 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4738 domain = get_iommu_domain(iommu, (u16)did);
4739
4740 if (!domain)
4741 continue;
4742 free_cpu_cached_iovas(cpu, &domain->iovad);
4743 }
4744 }
4745}
4746
4747static int intel_iommu_cpu_dead(unsigned int cpu)
4748{
4749 free_all_cpu_cached_iovas(cpu);
4750 return 0;
4751}
4752
4753static void intel_disable_iommus(void)
4754{
4755 struct intel_iommu *iommu = NULL;
4756 struct dmar_drhd_unit *drhd;
4757
4758 for_each_iommu(iommu, drhd)
4759 iommu_disable_translation(iommu);
4760}
4761
4762static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4763{
4764 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4765
4766 return container_of(iommu_dev, struct intel_iommu, iommu);
4767}
4768
4769static ssize_t intel_iommu_show_version(struct device *dev,
4770 struct device_attribute *attr,
4771 char *buf)
4772{
4773 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4774 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4775 return sprintf(buf, "%d:%d\n",
4776 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4777}
4778static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4779
4780static ssize_t intel_iommu_show_address(struct device *dev,
4781 struct device_attribute *attr,
4782 char *buf)
4783{
4784 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4785 return sprintf(buf, "%llx\n", iommu->reg_phys);
4786}
4787static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4788
4789static ssize_t intel_iommu_show_cap(struct device *dev,
4790 struct device_attribute *attr,
4791 char *buf)
4792{
4793 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4794 return sprintf(buf, "%llx\n", iommu->cap);
4795}
4796static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4797
4798static ssize_t intel_iommu_show_ecap(struct device *dev,
4799 struct device_attribute *attr,
4800 char *buf)
4801{
4802 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4803 return sprintf(buf, "%llx\n", iommu->ecap);
4804}
4805static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4806
4807static ssize_t intel_iommu_show_ndoms(struct device *dev,
4808 struct device_attribute *attr,
4809 char *buf)
4810{
4811 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4812 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4813}
4814static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4815
4816static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4817 struct device_attribute *attr,
4818 char *buf)
4819{
4820 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4821 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4822 cap_ndoms(iommu->cap)));
4823}
4824static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4825
4826static struct attribute *intel_iommu_attrs[] = {
4827 &dev_attr_version.attr,
4828 &dev_attr_address.attr,
4829 &dev_attr_cap.attr,
4830 &dev_attr_ecap.attr,
4831 &dev_attr_domains_supported.attr,
4832 &dev_attr_domains_used.attr,
4833 NULL,
4834};
4835
4836static struct attribute_group intel_iommu_group = {
4837 .name = "intel-iommu",
4838 .attrs = intel_iommu_attrs,
4839};
4840
4841const struct attribute_group *intel_iommu_groups[] = {
4842 &intel_iommu_group,
4843 NULL,
4844};
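
/*
 * The attribute group above is registered per remapping unit via
 * iommu_device_sysfs_add() in intel_iommu_init() below, so the fields are
 * typically exposed under sysfs along the lines of (illustrative paths;
 * the unit name depends on enumeration order):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/address
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/ecap
 *   /sys/class/iommu/dmar0/intel-iommu/domains_supported
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 */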
4845
4846static inline bool has_untrusted_dev(void)
4847{
4848 struct pci_dev *pdev = NULL;
4849
4850 for_each_pci_dev(pdev)
4851 if (pdev->untrusted)
4852 return true;
4853
4854 return false;
4855}
4856
4857static int __init platform_optin_force_iommu(void)
4858{
4859 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4860 return 0;
4861
4862 if (no_iommu || dmar_disabled)
4863 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4864
 4865	/*
 4866	 * If the Intel IOMMU is disabled by default, apply the identity
 4867	 * map to all devices except those marked as untrusted.
 4868	 */
4869 if (dmar_disabled)
4870 iommu_identity_mapping |= IDENTMAP_ALL;
4871
4872 dmar_disabled = 0;
4873 no_iommu = 0;
4874
4875 return 1;
4876}
4877
4878static int __init probe_acpi_namespace_devices(void)
4879{
4880 struct dmar_drhd_unit *drhd;
4881 /* To avoid a -Wunused-but-set-variable warning. */
4882 struct intel_iommu *iommu __maybe_unused;
4883 struct device *dev;
4884 int i, ret = 0;
4885
4886 for_each_active_iommu(iommu, drhd) {
4887 for_each_active_dev_scope(drhd->devices,
4888 drhd->devices_cnt, i, dev) {
4889 struct acpi_device_physical_node *pn;
4890 struct iommu_group *group;
4891 struct acpi_device *adev;
4892
4893 if (dev->bus != &acpi_bus_type)
4894 continue;
4895
4896 adev = to_acpi_device(dev);
4897 mutex_lock(&adev->physical_node_lock);
4898 list_for_each_entry(pn,
4899 &adev->physical_node_list, node) {
4900 group = iommu_group_get(pn->dev);
4901 if (group) {
4902 iommu_group_put(group);
4903 continue;
4904 }
4905
4906 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4907 ret = iommu_probe_device(pn->dev);
4908 if (ret)
4909 break;
4910 }
4911 mutex_unlock(&adev->physical_node_lock);
4912
4913 if (ret)
4914 return ret;
4915 }
4916 }
4917
4918 return 0;
4919}
4920
4921int __init intel_iommu_init(void)
4922{
4923 int ret = -ENODEV;
4924 struct dmar_drhd_unit *drhd;
4925 struct intel_iommu *iommu;
4926
4927 /*
4928 * Intel IOMMU is required for a TXT/tboot launch or platform
4929 * opt in, so enforce that.
4930 */
4931 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4932
4933 if (iommu_init_mempool()) {
4934 if (force_on)
4935 panic("tboot: Failed to initialize iommu memory\n");
4936 return -ENOMEM;
4937 }
4938
4939 down_write(&dmar_global_lock);
4940 if (dmar_table_init()) {
4941 if (force_on)
4942 panic("tboot: Failed to initialize DMAR table\n");
4943 goto out_free_dmar;
4944 }
4945
4946 if (dmar_dev_scope_init() < 0) {
4947 if (force_on)
4948 panic("tboot: Failed to initialize DMAR device scope\n");
4949 goto out_free_dmar;
4950 }
4951
4952 up_write(&dmar_global_lock);
4953
4954 /*
4955 * The bus notifier takes the dmar_global_lock, so lockdep will
4956 * complain later when we register it under the lock.
4957 */
4958 dmar_register_bus_notifier();
4959
4960 down_write(&dmar_global_lock);
4961
4962 if (no_iommu || dmar_disabled) {
 4963		/*
 4964		 * We exit the function here to ensure that the IOMMU's remapping
 4965		 * and mempool aren't set up, which means that the IOMMU's PMRs
 4966		 * won't be disabled via the call to init_dmars(). So disable
 4967		 * them explicitly here. The PMRs were set up by tboot prior to
 4968		 * calling SENTER, but the kernel is expected to reset/tear
 4969		 * down the PMRs.
 4970		 */
4971 if (intel_iommu_tboot_noforce) {
4972 for_each_iommu(iommu, drhd)
4973 iommu_disable_protect_mem_regions(iommu);
4974 }
4975
4976 /*
4977 * Make sure the IOMMUs are switched off, even when we
4978 * boot into a kexec kernel and the previous kernel left
4979 * them enabled
4980 */
4981 intel_disable_iommus();
4982 goto out_free_dmar;
4983 }
4984
4985 if (list_empty(&dmar_rmrr_units))
4986 pr_info("No RMRR found\n");
4987
4988 if (list_empty(&dmar_atsr_units))
4989 pr_info("No ATSR found\n");
4990
4991 if (dmar_init_reserved_ranges()) {
4992 if (force_on)
4993 panic("tboot: Failed to reserve iommu ranges\n");
4994 goto out_free_reserved_range;
4995 }
4996
4997 if (dmar_map_gfx)
4998 intel_iommu_gfx_mapped = 1;
4999
5000 init_no_remapping_devices();
5001
5002 ret = init_dmars();
5003 if (ret) {
5004 if (force_on)
5005 panic("tboot: Failed to initialize DMARs\n");
5006 pr_err("Initialization failed\n");
5007 goto out_free_reserved_range;
5008 }
5009 up_write(&dmar_global_lock);
5010
5011#if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
 5012	/*
 5013	 * If the system has no untrusted device, or the user has decided
 5014	 * to disable the bounce page mechanism, we don't need swiotlb.
 5015	 * Mark this so that the pre-allocated bounce pages are released
 5016	 * later.
 5017	 */
5018 if (!has_untrusted_dev() || intel_no_bounce)
5019 swiotlb = 0;
5020#endif
5021 dma_ops = &intel_dma_ops;
5022
5023 init_iommu_pm_ops();
5024
5025 for_each_active_iommu(iommu, drhd) {
5026 iommu_device_sysfs_add(&iommu->iommu, NULL,
5027 intel_iommu_groups,
5028 "%s", iommu->name);
5029 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5030 iommu_device_register(&iommu->iommu);
5031 }
5032
5033 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5034 if (si_domain && !hw_pass_through)
5035 register_memory_notifier(&intel_iommu_memory_nb);
5036 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5037 intel_iommu_cpu_dead);
5038
5039 down_read(&dmar_global_lock);
5040 if (probe_acpi_namespace_devices())
5041 pr_warn("ACPI name space devices didn't probe correctly\n");
5042 up_read(&dmar_global_lock);
5043
5044 /* Finally, we enable the DMA remapping hardware. */
5045 for_each_iommu(iommu, drhd) {
5046 if (!drhd->ignored && !translation_pre_enabled(iommu))
5047 iommu_enable_translation(iommu);
5048
5049 iommu_disable_protect_mem_regions(iommu);
5050 }
5051 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5052
5053 intel_iommu_enabled = 1;
5054 intel_iommu_debugfs_init();
5055
5056 return 0;
5057
5058out_free_reserved_range:
5059 put_iova_domain(&reserved_iova_list);
5060out_free_dmar:
5061 intel_iommu_free_dmars();
5062 up_write(&dmar_global_lock);
5063 iommu_exit_mempool();
5064 return ret;
5065}
5066
5067static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5068{
5069 struct intel_iommu *iommu = opaque;
5070
5071 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5072 return 0;
5073}
5074
5075/*
5076 * NB - intel-iommu lacks any sort of reference counting for the users of
5077 * dependent devices. If multiple endpoints have intersecting dependent
5078 * devices, unbinding the driver from any one of them will possibly leave
5079 * the others unable to operate.
5080 */
5081static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5082{
5083 if (!iommu || !dev || !dev_is_pci(dev))
5084 return;
5085
5086 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5087}
5088
5089static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5090{
5091 struct dmar_domain *domain;
5092 struct intel_iommu *iommu;
5093 unsigned long flags;
5094
5095 assert_spin_locked(&device_domain_lock);
5096
5097 if (WARN_ON(!info))
5098 return;
5099
5100 iommu = info->iommu;
5101 domain = info->domain;
5102
5103 if (info->dev) {
5104 if (dev_is_pci(info->dev) && sm_supported(iommu))
5105 intel_pasid_tear_down_entry(iommu, info->dev,
5106 PASID_RID2PASID);
5107
5108 iommu_disable_dev_iotlb(info);
5109 domain_context_clear(iommu, info->dev);
5110 intel_pasid_free_table(info->dev);
5111 }
5112
5113 unlink_domain_info(info);
5114
5115 spin_lock_irqsave(&iommu->lock, flags);
5116 domain_detach_iommu(domain, iommu);
5117 spin_unlock_irqrestore(&iommu->lock, flags);
5118
5119 /* free the private domain */
5120 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5121 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5122 list_empty(&domain->devices))
5123 domain_exit(info->domain);
5124
5125 free_devinfo_mem(info);
5126}
5127
5128static void dmar_remove_one_dev_info(struct device *dev)
5129{
5130 struct device_domain_info *info;
5131 unsigned long flags;
5132
5133 spin_lock_irqsave(&device_domain_lock, flags);
5134 info = dev->archdata.iommu;
5135 if (info)
5136 __dmar_remove_one_dev_info(info);
5137 spin_unlock_irqrestore(&device_domain_lock, flags);
5138}
5139
5140static int md_domain_init(struct dmar_domain *domain, int guest_width)
5141{
5142 int adjust_width;
5143
5144 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5145 domain_reserve_special_ranges(domain);
5146
5147 /* calculate AGAW */
5148 domain->gaw = guest_width;
5149 adjust_width = guestwidth_to_adjustwidth(guest_width);
5150 domain->agaw = width_to_agaw(adjust_width);
5151
5152 domain->iommu_coherency = 0;
5153 domain->iommu_snooping = 0;
5154 domain->iommu_superpage = 0;
5155 domain->max_addr = 0;
5156
5157 /* always allocate the top pgd */
5158 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5159 if (!domain->pgd)
5160 return -ENOMEM;
5161 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5162 return 0;
5163}
5164
5165static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5166{
5167 struct dmar_domain *dmar_domain;
5168 struct iommu_domain *domain;
5169
5170 switch (type) {
5171 case IOMMU_DOMAIN_DMA:
5172 /* fallthrough */
5173 case IOMMU_DOMAIN_UNMANAGED:
5174 dmar_domain = alloc_domain(0);
5175 if (!dmar_domain) {
5176 pr_err("Can't allocate dmar_domain\n");
5177 return NULL;
5178 }
5179 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5180 pr_err("Domain initialization failed\n");
5181 domain_exit(dmar_domain);
5182 return NULL;
5183 }
5184
5185 if (type == IOMMU_DOMAIN_DMA &&
5186 init_iova_flush_queue(&dmar_domain->iovad,
5187 iommu_flush_iova, iova_entry_free)) {
5188 pr_warn("iova flush queue initialization failed\n");
5189 intel_iommu_strict = 1;
5190 }
5191
5192 domain_update_iommu_cap(dmar_domain);
5193
5194 domain = &dmar_domain->domain;
5195 domain->geometry.aperture_start = 0;
5196 domain->geometry.aperture_end =
5197 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5198 domain->geometry.force_aperture = true;
5199
5200 return domain;
5201 case IOMMU_DOMAIN_IDENTITY:
5202 return &si_domain->domain;
5203 default:
5204 return NULL;
5205 }
5206
5207 return NULL;
5208}
5209
5210static void intel_iommu_domain_free(struct iommu_domain *domain)
5211{
5212 if (domain != &si_domain->domain)
5213 domain_exit(to_dmar_domain(domain));
5214}
5215
 5216/*
 5217 * Check whether @domain can be attached to @dev through the
 5218 * aux-domain attach/detach APIs.
 5219 */
5220static inline bool
5221is_aux_domain(struct device *dev, struct iommu_domain *domain)
5222{
5223 struct device_domain_info *info = dev->archdata.iommu;
5224
5225 return info && info->auxd_enabled &&
5226 domain->type == IOMMU_DOMAIN_UNMANAGED;
5227}
5228
5229static void auxiliary_link_device(struct dmar_domain *domain,
5230 struct device *dev)
5231{
5232 struct device_domain_info *info = dev->archdata.iommu;
5233
5234 assert_spin_locked(&device_domain_lock);
5235 if (WARN_ON(!info))
5236 return;
5237
5238 domain->auxd_refcnt++;
5239 list_add(&domain->auxd, &info->auxiliary_domains);
5240}
5241
5242static void auxiliary_unlink_device(struct dmar_domain *domain,
5243 struct device *dev)
5244{
5245 struct device_domain_info *info = dev->archdata.iommu;
5246
5247 assert_spin_locked(&device_domain_lock);
5248 if (WARN_ON(!info))
5249 return;
5250
5251 list_del(&domain->auxd);
5252 domain->auxd_refcnt--;
5253
5254 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5255 intel_pasid_free_id(domain->default_pasid);
5256}
5257
5258static int aux_domain_add_dev(struct dmar_domain *domain,
5259 struct device *dev)
5260{
5261 int ret;
5262 u8 bus, devfn;
5263 unsigned long flags;
5264 struct intel_iommu *iommu;
5265
5266 iommu = device_to_iommu(dev, &bus, &devfn);
5267 if (!iommu)
5268 return -ENODEV;
5269
5270 if (domain->default_pasid <= 0) {
5271 int pasid;
5272
5273 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5274 pci_max_pasids(to_pci_dev(dev)),
5275 GFP_KERNEL);
5276 if (pasid <= 0) {
5277 pr_err("Can't allocate default pasid\n");
5278 return -ENODEV;
5279 }
5280 domain->default_pasid = pasid;
5281 }
5282
5283 spin_lock_irqsave(&device_domain_lock, flags);
 5284	/*
 5285	 * iommu->lock must be held to attach the domain to the iommu and to
 5286	 * set up the PASID entry for second-level translation.
 5287	 */
5288 spin_lock(&iommu->lock);
5289 ret = domain_attach_iommu(domain, iommu);
5290 if (ret)
5291 goto attach_failed;
5292
 5293	/* Set up the PASID entry for mediated devices: */
5294 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5295 domain->default_pasid);
5296 if (ret)
5297 goto table_failed;
5298 spin_unlock(&iommu->lock);
5299
5300 auxiliary_link_device(domain, dev);
5301
5302 spin_unlock_irqrestore(&device_domain_lock, flags);
5303
5304 return 0;
5305
5306table_failed:
5307 domain_detach_iommu(domain, iommu);
5308attach_failed:
5309 spin_unlock(&iommu->lock);
5310 spin_unlock_irqrestore(&device_domain_lock, flags);
5311 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5312 intel_pasid_free_id(domain->default_pasid);
5313
5314 return ret;
5315}
5316
5317static void aux_domain_remove_dev(struct dmar_domain *domain,
5318 struct device *dev)
5319{
5320 struct device_domain_info *info;
5321 struct intel_iommu *iommu;
5322 unsigned long flags;
5323
5324 if (!is_aux_domain(dev, &domain->domain))
5325 return;
5326
5327 spin_lock_irqsave(&device_domain_lock, flags);
5328 info = dev->archdata.iommu;
5329 iommu = info->iommu;
5330
5331 auxiliary_unlink_device(domain, dev);
5332
5333 spin_lock(&iommu->lock);
5334 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5335 domain_detach_iommu(domain, iommu);
5336 spin_unlock(&iommu->lock);
5337
5338 spin_unlock_irqrestore(&device_domain_lock, flags);
5339}
5340
5341static int prepare_domain_attach_device(struct iommu_domain *domain,
5342 struct device *dev)
5343{
5344 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5345 struct intel_iommu *iommu;
5346 int addr_width;
5347 u8 bus, devfn;
5348
5349 iommu = device_to_iommu(dev, &bus, &devfn);
5350 if (!iommu)
5351 return -ENODEV;
5352
5353 /* check if this iommu agaw is sufficient for max mapped address */
5354 addr_width = agaw_to_width(iommu->agaw);
5355 if (addr_width > cap_mgaw(iommu->cap))
5356 addr_width = cap_mgaw(iommu->cap);
5357
5358 if (dmar_domain->max_addr > (1LL << addr_width)) {
 5359		dev_err(dev,
 5360			"%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
 5361			__func__, addr_width, dmar_domain->max_addr);
5362 return -EFAULT;
5363 }
5364 dmar_domain->gaw = addr_width;
5365
5366 /*
5367 * Knock out extra levels of page tables if necessary
5368 */
5369 while (iommu->agaw < dmar_domain->agaw) {
5370 struct dma_pte *pte;
5371
5372 pte = dmar_domain->pgd;
5373 if (dma_pte_present(pte)) {
5374 dmar_domain->pgd = (struct dma_pte *)
5375 phys_to_virt(dma_pte_addr(pte));
5376 free_pgtable_page(pte);
5377 }
5378 dmar_domain->agaw--;
5379 }
5380
5381 return 0;
5382}
5383
5384static int intel_iommu_attach_device(struct iommu_domain *domain,
5385 struct device *dev)
5386{
5387 int ret;
5388
5389 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5390 device_is_rmrr_locked(dev)) {
5391 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5392 return -EPERM;
5393 }
5394
5395 if (is_aux_domain(dev, domain))
5396 return -EPERM;
5397
5398 /* normally dev is not mapped */
5399 if (unlikely(domain_context_mapped(dev))) {
5400 struct dmar_domain *old_domain;
5401
5402 old_domain = find_domain(dev);
5403 if (old_domain)
5404 dmar_remove_one_dev_info(dev);
5405 }
5406
5407 ret = prepare_domain_attach_device(domain, dev);
5408 if (ret)
5409 return ret;
5410
5411 return domain_add_dev_info(to_dmar_domain(domain), dev);
5412}
5413
5414static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5415 struct device *dev)
5416{
5417 int ret;
5418
5419 if (!is_aux_domain(dev, domain))
5420 return -EPERM;
5421
5422 ret = prepare_domain_attach_device(domain, dev);
5423 if (ret)
5424 return ret;
5425
5426 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5427}
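
/*
 * Minimal usage sketch (illustrative only, not part of this driver) of the
 * aux-domain path as seen from a hypothetical mediated-device backend. The
 * generic iommu_dev_enable_feature(), iommu_aux_attach_device() and
 * iommu_aux_get_pasid() wrappers end up in intel_iommu_dev_enable_feat(),
 * intel_iommu_aux_attach_device() and intel_iommu_aux_get_pasid() in this
 * file. The helper name is hypothetical.
 */
static inline int example_attach_aux_domain(struct device *dev,
					    struct iommu_domain *domain)
{
	int ret, pasid;

	/* Opt the physical device in to auxiliary (PASID-based) domains. */
	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	/* Attach the unmanaged domain on a PASID instead of the RID. */
	ret = iommu_aux_attach_device(domain, dev);
	if (ret) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
		return ret;
	}

	/* The default PASID allocated in aux_domain_add_dev() above. */
	pasid = iommu_aux_get_pasid(domain, dev);

	return pasid < 0 ? pasid : 0;
}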
5428
5429static void intel_iommu_detach_device(struct iommu_domain *domain,
5430 struct device *dev)
5431{
5432 dmar_remove_one_dev_info(dev);
5433}
5434
5435static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5436 struct device *dev)
5437{
5438 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5439}
5440
5441static int intel_iommu_map(struct iommu_domain *domain,
5442 unsigned long iova, phys_addr_t hpa,
5443 size_t size, int iommu_prot)
5444{
5445 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5446 u64 max_addr;
5447 int prot = 0;
5448 int ret;
5449
5450 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5451 return -EINVAL;
5452
5453 if (iommu_prot & IOMMU_READ)
5454 prot |= DMA_PTE_READ;
5455 if (iommu_prot & IOMMU_WRITE)
5456 prot |= DMA_PTE_WRITE;
5457 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5458 prot |= DMA_PTE_SNP;
5459
5460 max_addr = iova + size;
5461 if (dmar_domain->max_addr < max_addr) {
5462 u64 end;
5463
5464 /* check if minimum agaw is sufficient for mapped address */
5465 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5466 if (end < max_addr) {
 5467			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
 5468			       __func__, dmar_domain->gaw,
 5469			       max_addr);
5470 return -EFAULT;
5471 }
5472 dmar_domain->max_addr = max_addr;
5473 }
5474 /* Round up size to next multiple of PAGE_SIZE, if it and
5475 the low bits of hpa would take us onto the next page */
5476 size = aligned_nrpages(hpa, size);
5477 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5478 hpa >> VTD_PAGE_SHIFT, size, prot);
5479 return ret;
5480}
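
/*
 * Minimal sketch (illustrative only, not part of this driver) of how the
 * generic IOMMU API reaches intel_iommu_map() above: allocate an unmanaged
 * domain, attach the device and map a single VTD page read/write. A real
 * caller would keep the domain around and later use iommu_unmap(),
 * iommu_detach_device() and iommu_domain_free(). The helper name is
 * hypothetical.
 */
static inline int example_map_one_page(struct device *dev, phys_addr_t paddr,
				       unsigned long iova)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* Lands in intel_iommu_map(); prot becomes DMA_PTE_READ|DMA_PTE_WRITE. */
	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret) {
		iommu_detach_device(domain, dev);
		goto out_free;
	}

	return 0;

out_free:
	iommu_domain_free(domain);
	return ret;
}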
5481
5482static size_t intel_iommu_unmap(struct iommu_domain *domain,
5483 unsigned long iova, size_t size,
5484 struct iommu_iotlb_gather *gather)
5485{
5486 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5487 struct page *freelist = NULL;
5488 unsigned long start_pfn, last_pfn;
5489 unsigned int npages;
5490 int iommu_id, level = 0;
5491
5492 /* Cope with horrid API which requires us to unmap more than the
5493 size argument if it happens to be a large-page mapping. */
5494 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5495 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5496 return 0;
5497
5498 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5499 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5500
5501 start_pfn = iova >> VTD_PAGE_SHIFT;
5502 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5503
5504 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5505
5506 npages = last_pfn - start_pfn + 1;
5507
5508 for_each_domain_iommu(iommu_id, dmar_domain)
5509 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5510 start_pfn, npages, !freelist, 0);
5511
5512 dma_free_pagelist(freelist);
5513
5514 if (dmar_domain->max_addr == iova + size)
5515 dmar_domain->max_addr = iova;
5516
5517 return size;
5518}
5519
5520static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5521 dma_addr_t iova)
5522{
5523 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524 struct dma_pte *pte;
5525 int level = 0;
5526 u64 phys = 0;
5527
5528 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5529 return 0;
5530
5531 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5532 if (pte)
5533 phys = dma_pte_addr(pte);
5534
5535 return phys;
5536}
5537
5538static inline bool scalable_mode_support(void)
5539{
5540 struct dmar_drhd_unit *drhd;
5541 struct intel_iommu *iommu;
5542 bool ret = true;
5543
5544 rcu_read_lock();
5545 for_each_active_iommu(iommu, drhd) {
5546 if (!sm_supported(iommu)) {
5547 ret = false;
5548 break;
5549 }
5550 }
5551 rcu_read_unlock();
5552
5553 return ret;
5554}
5555
5556static inline bool iommu_pasid_support(void)
5557{
5558 struct dmar_drhd_unit *drhd;
5559 struct intel_iommu *iommu;
5560 bool ret = true;
5561
5562 rcu_read_lock();
5563 for_each_active_iommu(iommu, drhd) {
5564 if (!pasid_supported(iommu)) {
5565 ret = false;
5566 break;
5567 }
5568 }
5569 rcu_read_unlock();
5570
5571 return ret;
5572}
5573
5574static bool intel_iommu_capable(enum iommu_cap cap)
5575{
5576 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5577 return domain_update_iommu_snooping(NULL) == 1;
5578 if (cap == IOMMU_CAP_INTR_REMAP)
5579 return irq_remapping_enabled == 1;
5580
5581 return false;
5582}
5583
5584static int intel_iommu_add_device(struct device *dev)
5585{
5586 struct dmar_domain *dmar_domain;
5587 struct iommu_domain *domain;
5588 struct intel_iommu *iommu;
5589 struct iommu_group *group;
5590 u8 bus, devfn;
5591 int ret;
5592
5593 iommu = device_to_iommu(dev, &bus, &devfn);
5594 if (!iommu)
5595 return -ENODEV;
5596
5597 iommu_device_link(&iommu->iommu, dev);
5598
5599 if (translation_pre_enabled(iommu))
5600 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5601
5602 group = iommu_group_get_for_dev(dev);
5603
5604 if (IS_ERR(group))
5605 return PTR_ERR(group);
5606
5607 iommu_group_put(group);
5608
5609 domain = iommu_get_domain_for_dev(dev);
5610 dmar_domain = to_dmar_domain(domain);
5611 if (domain->type == IOMMU_DOMAIN_DMA) {
5612 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5613 ret = iommu_request_dm_for_dev(dev);
5614 if (ret) {
5615 dmar_remove_one_dev_info(dev);
5616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5617 domain_add_dev_info(si_domain, dev);
5618 dev_info(dev,
5619 "Device uses a private identity domain.\n");
5620 }
5621 }
5622 } else {
5623 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5624 ret = iommu_request_dma_domain_for_dev(dev);
5625 if (ret) {
5626 dmar_remove_one_dev_info(dev);
5627 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5628 if (!get_private_domain_for_dev(dev)) {
5629 dev_warn(dev,
5630 "Failed to get a private domain.\n");
5631 return -ENOMEM;
5632 }
5633
5634 dev_info(dev,
5635 "Device uses a private dma domain.\n");
5636 }
5637 }
5638 }
5639
5640 if (device_needs_bounce(dev)) {
5641 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5642 set_dma_ops(dev, &bounce_dma_ops);
5643 }
5644
5645 return 0;
5646}
5647
5648static void intel_iommu_remove_device(struct device *dev)
5649{
5650 struct intel_iommu *iommu;
5651 u8 bus, devfn;
5652
5653 iommu = device_to_iommu(dev, &bus, &devfn);
5654 if (!iommu)
5655 return;
5656
5657 dmar_remove_one_dev_info(dev);
5658
5659 iommu_group_remove_device(dev);
5660
5661 iommu_device_unlink(&iommu->iommu, dev);
5662
5663 if (device_needs_bounce(dev))
5664 set_dma_ops(dev, NULL);
5665}
5666
5667static void intel_iommu_get_resv_regions(struct device *device,
5668 struct list_head *head)
5669{
5670 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5671 struct iommu_resv_region *reg;
5672 struct dmar_rmrr_unit *rmrr;
5673 struct device *i_dev;
5674 int i;
5675
5676 down_read(&dmar_global_lock);
5677 for_each_rmrr_units(rmrr) {
5678 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5679 i, i_dev) {
5680 struct iommu_resv_region *resv;
5681 enum iommu_resv_type type;
5682 size_t length;
5683
5684 if (i_dev != device &&
5685 !is_downstream_to_pci_bridge(device, i_dev))
5686 continue;
5687
5688 length = rmrr->end_address - rmrr->base_address + 1;
5689
5690 type = device_rmrr_is_relaxable(device) ?
5691 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5692
5693 resv = iommu_alloc_resv_region(rmrr->base_address,
5694 length, prot, type);
5695 if (!resv)
5696 break;
5697
5698 list_add_tail(&resv->list, head);
5699 }
5700 }
5701 up_read(&dmar_global_lock);
5702
5703#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5704 if (dev_is_pci(device)) {
5705 struct pci_dev *pdev = to_pci_dev(device);
5706
5707 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5708 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5709 IOMMU_RESV_DIRECT);
5710 if (reg)
 5711				list_add_tail(&reg->list, head);
5712 }
5713 }
5714#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5715
5716 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5717 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5718 0, IOMMU_RESV_MSI);
5719 if (!reg)
5720 return;
 5721	list_add_tail(&reg->list, head);
5722}
5723
5724static void intel_iommu_put_resv_regions(struct device *dev,
5725 struct list_head *head)
5726{
5727 struct iommu_resv_region *entry, *next;
5728
5729 list_for_each_entry_safe(entry, next, head, list)
5730 kfree(entry);
5731}
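
/*
 * Minimal usage sketch (illustrative only, not part of this driver): a
 * caller enumerates the reserved regions produced by
 * intel_iommu_get_resv_regions() above, i.e. the RMRRs, the optional ISA
 * range and the MSI window, through the generic helpers. The helper name
 * is hypothetical.
 */
static inline void example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(region, &resv_regions, list)
		dev_info(dev, "reserved region %pa + %zx (type %d)\n",
			 &region->start, region->length, region->type);

	iommu_put_resv_regions(dev, &resv_regions);
}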
5732
5733int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5734{
5735 struct device_domain_info *info;
5736 struct context_entry *context;
5737 struct dmar_domain *domain;
5738 unsigned long flags;
5739 u64 ctx_lo;
5740 int ret;
5741
5742 domain = find_domain(dev);
5743 if (!domain)
5744 return -EINVAL;
5745
5746 spin_lock_irqsave(&device_domain_lock, flags);
5747 spin_lock(&iommu->lock);
5748
5749 ret = -EINVAL;
5750 info = dev->archdata.iommu;
5751 if (!info || !info->pasid_supported)
5752 goto out;
5753
5754 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5755 if (WARN_ON(!context))
5756 goto out;
5757
5758 ctx_lo = context[0].lo;
5759
5760 if (!(ctx_lo & CONTEXT_PASIDE)) {
5761 ctx_lo |= CONTEXT_PASIDE;
5762 context[0].lo = ctx_lo;
5763 wmb();
5764 iommu->flush.flush_context(iommu,
5765 domain->iommu_did[iommu->seq_id],
5766 PCI_DEVID(info->bus, info->devfn),
5767 DMA_CCMD_MASK_NOBIT,
5768 DMA_CCMD_DEVICE_INVL);
5769 }
5770
5771 /* Enable PASID support in the device, if it wasn't already */
5772 if (!info->pasid_enabled)
5773 iommu_enable_dev_iotlb(info);
5774
5775 ret = 0;
5776
5777 out:
5778 spin_unlock(&iommu->lock);
5779 spin_unlock_irqrestore(&device_domain_lock, flags);
5780
5781 return ret;
5782}
5783
5784static void intel_iommu_apply_resv_region(struct device *dev,
5785 struct iommu_domain *domain,
5786 struct iommu_resv_region *region)
5787{
5788 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5789 unsigned long start, end;
5790
5791 start = IOVA_PFN(region->start);
5792 end = IOVA_PFN(region->start + region->length - 1);
5793
5794 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5795}
5796
5797#ifdef CONFIG_INTEL_IOMMU_SVM
5798struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5799{
5800 struct intel_iommu *iommu;
5801 u8 bus, devfn;
5802
5803 if (iommu_dummy(dev)) {
5804 dev_warn(dev,
5805 "No IOMMU translation for device; cannot enable SVM\n");
5806 return NULL;
5807 }
5808
5809 iommu = device_to_iommu(dev, &bus, &devfn);
 5810	if (!iommu) {
5811 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5812 return NULL;
5813 }
5814
5815 return iommu;
5816}
5817#endif /* CONFIG_INTEL_IOMMU_SVM */
5818
5819static int intel_iommu_enable_auxd(struct device *dev)
5820{
5821 struct device_domain_info *info;
5822 struct intel_iommu *iommu;
5823 unsigned long flags;
5824 u8 bus, devfn;
5825 int ret;
5826
5827 iommu = device_to_iommu(dev, &bus, &devfn);
5828 if (!iommu || dmar_disabled)
5829 return -EINVAL;
5830
5831 if (!sm_supported(iommu) || !pasid_supported(iommu))
5832 return -EINVAL;
5833
5834 ret = intel_iommu_enable_pasid(iommu, dev);
5835 if (ret)
5836 return -ENODEV;
5837
5838 spin_lock_irqsave(&device_domain_lock, flags);
5839 info = dev->archdata.iommu;
5840 info->auxd_enabled = 1;
5841 spin_unlock_irqrestore(&device_domain_lock, flags);
5842
5843 return 0;
5844}
5845
5846static int intel_iommu_disable_auxd(struct device *dev)
5847{
5848 struct device_domain_info *info;
5849 unsigned long flags;
5850
5851 spin_lock_irqsave(&device_domain_lock, flags);
5852 info = dev->archdata.iommu;
5853 if (!WARN_ON(!info))
5854 info->auxd_enabled = 0;
5855 spin_unlock_irqrestore(&device_domain_lock, flags);
5856
5857 return 0;
5858}
5859
 5860/*
 5861 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC)
 5862 * is defined in section 3.7 of the Intel Scalable I/O Virtualization
 5863 * technical specification so that system software and tools can detect
 5864 * endpoint devices supporting Intel Scalable I/O Virtualization without
 5865 * a host driver dependency.
 5866 *
 5867 * Returns the config space offset of the matching extended capability
 5868 * structure, or 0 if the device does not support it.
 5869 */
5870static int siov_find_pci_dvsec(struct pci_dev *pdev)
5871{
5872 int pos;
5873 u16 vendor, id;
5874
5875 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5876 while (pos) {
5877 pci_read_config_word(pdev, pos + 4, &vendor);
5878 pci_read_config_word(pdev, pos + 8, &id);
5879 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5880 return pos;
5881
5882 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5883 }
5884
5885 return 0;
5886}
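
/*
 * Illustrative only: walk all PCI devices and report those advertising the
 * Scalable IOV DVSEC found by siov_find_pci_dvsec() above. The helper name
 * is hypothetical and nothing in this driver calls it.
 */
static inline void example_log_siov_capable_devices(void)
{
	struct pci_dev *pdev = NULL;

	for_each_pci_dev(pdev) {
		int pos = siov_find_pci_dvsec(pdev);

		if (pos)
			pci_info(pdev, "Scalable IOV DVSEC at config offset %#x\n",
				 pos);
	}
}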
5887
5888static bool
5889intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5890{
5891 if (feat == IOMMU_DEV_FEAT_AUX) {
5892 int ret;
5893
5894 if (!dev_is_pci(dev) || dmar_disabled ||
5895 !scalable_mode_support() || !iommu_pasid_support())
5896 return false;
5897
5898 ret = pci_pasid_features(to_pci_dev(dev));
5899 if (ret < 0)
5900 return false;
5901
5902 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5903 }
5904
5905 return false;
5906}
5907
5908static int
5909intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5910{
5911 if (feat == IOMMU_DEV_FEAT_AUX)
5912 return intel_iommu_enable_auxd(dev);
5913
5914 return -ENODEV;
5915}
5916
5917static int
5918intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5919{
5920 if (feat == IOMMU_DEV_FEAT_AUX)
5921 return intel_iommu_disable_auxd(dev);
5922
5923 return -ENODEV;
5924}
5925
5926static bool
5927intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5928{
5929 struct device_domain_info *info = dev->archdata.iommu;
5930
5931 if (feat == IOMMU_DEV_FEAT_AUX)
5932 return scalable_mode_support() && info && info->auxd_enabled;
5933
5934 return false;
5935}
5936
5937static int
5938intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5939{
5940 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5941
5942 return dmar_domain->default_pasid > 0 ?
5943 dmar_domain->default_pasid : -EINVAL;
5944}
5945
5946static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5947 struct device *dev)
5948{
5949 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5950}
5951
5952const struct iommu_ops intel_iommu_ops = {
5953 .capable = intel_iommu_capable,
5954 .domain_alloc = intel_iommu_domain_alloc,
5955 .domain_free = intel_iommu_domain_free,
5956 .attach_dev = intel_iommu_attach_device,
5957 .detach_dev = intel_iommu_detach_device,
5958 .aux_attach_dev = intel_iommu_aux_attach_device,
5959 .aux_detach_dev = intel_iommu_aux_detach_device,
5960 .aux_get_pasid = intel_iommu_aux_get_pasid,
5961 .map = intel_iommu_map,
5962 .unmap = intel_iommu_unmap,
5963 .iova_to_phys = intel_iommu_iova_to_phys,
5964 .add_device = intel_iommu_add_device,
5965 .remove_device = intel_iommu_remove_device,
5966 .get_resv_regions = intel_iommu_get_resv_regions,
5967 .put_resv_regions = intel_iommu_put_resv_regions,
5968 .apply_resv_region = intel_iommu_apply_resv_region,
5969 .device_group = pci_device_group,
5970 .dev_has_feat = intel_iommu_dev_has_feat,
5971 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5972 .dev_enable_feat = intel_iommu_dev_enable_feat,
5973 .dev_disable_feat = intel_iommu_dev_disable_feat,
5974 .is_attach_deferred = intel_iommu_is_attach_deferred,
5975 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5976};
5977
static void quirk_iommu_igfx(struct pci_dev *dev)
{
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
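
/*
 * Further affected IGD device IDs would be wired up the same way; for
 * illustration only (0xffff is a placeholder, not a real entry in this
 * driver):
 *
 *	DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0xffff, quirk_iommu_igfx);
 */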

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
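
/*
 * rwbf_quirk is consulted together with cap_rwbf() elsewhere in this file
 * (see iommu_flush_write_buffer()), so setting it here makes the driver
 * flush the write buffer as if the hardware had advertised the capability.
 */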

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

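/*
 * GGC is the graphics control register in the IGD's PCI config space.  As
 * the encodings above suggest, the field under GGC_MEMORY_SIZE_MASK records
 * how much memory the BIOS set aside for the GTT, with GGC_MEMORY_VT_ENABLED
 * marking encodings that also reserve space for a VT-d shadow GTT; the
 * quirk below keys off that bit.
 */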
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}