1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/init.h>
17#include <linux/bitmap.h>
18#include <linux/debugfs.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/irq.h>
22#include <linux/interrupt.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25#include <linux/dmar.h>
26#include <linux/dma-mapping.h>
27#include <linux/mempool.h>
28#include <linux/memory.h>
29#include <linux/cpu.h>
30#include <linux/timer.h>
31#include <linux/io.h>
32#include <linux/iova.h>
33#include <linux/iommu.h>
34#include <linux/intel-iommu.h>
35#include <linux/syscore_ops.h>
36#include <linux/tboot.h>
37#include <linux/dmi.h>
38#include <linux/pci-ats.h>
39#include <linux/memblock.h>
40#include <linux/dma-contiguous.h>
41#include <linux/dma-direct.h>
42#include <linux/crash_dump.h>
43#include <linux/numa.h>
44#include <linux/swiotlb.h>
45#include <asm/irq_remapping.h>
46#include <asm/cacheflush.h>
47#include <asm/iommu.h>
48#include <trace/events/intel_iommu.h>
49
50#include "../irq_remapping.h"
51#include "pasid.h"
52
53#define ROOT_SIZE VTD_PAGE_SIZE
54#define CONTEXT_SIZE VTD_PAGE_SIZE
55
56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61#define IOAPIC_RANGE_START (0xfee00000)
62#define IOAPIC_RANGE_END (0xfeefffff)
63#define IOVA_START_ADDR (0x1000)
64
65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67#define MAX_AGAW_WIDTH 64
68#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
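/*
 * Editor's note -- illustrative worked example, not part of the original
 * source: with gaw = 48 and VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 = 0xFFFFFFFFF, and DOMAIN_MAX_ADDR(48) is that value
 * shifted left by 12, i.e. 0xFFFFFFFFF000. On a 32-bit kernel,
 * DOMAIN_MAX_PFN() additionally clamps the result to ULONG_MAX so PFNs
 * always fit in an unsigned long.
 */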
78
79/* IO virtual address start page frame number */
80#define IOVA_START_PFN (1)
81
82#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84/* page table handling */
85#define LEVEL_STRIDE (9)
86#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
88/*
 89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
93 *
94 * Traditionally the IOMMU core just handed us the mappings directly,
 95 * after making sure the size is a power-of-two multiple of 4KiB and that the
96 * mapping has natural alignment.
97 *
98 * To retain this behavior, we currently advertise that we support
 99 * all page sizes that are a power-of-two multiple of 4KiB.
100 *
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
103 */
104#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
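/*
 * Editor's note -- illustrative, not part of the original source:
 * ~0xFFFUL sets every bit from bit 12 upwards, so the IOMMU core sees
 * 4KiB, 8KiB, 16KiB, ... (every power-of-two size >= 4KiB) as supported
 * and may hand us any naturally aligned power-of-two region in a single
 * map call, matching the traditional behaviour described above.
 */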
105
106static inline int agaw_to_level(int agaw)
107{
108 return agaw + 2;
109}
110
111static inline int agaw_to_width(int agaw)
112{
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114}
115
116static inline int width_to_agaw(int width)
117{
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119}
120
121static inline unsigned int level_to_offset_bits(int level)
122{
123 return (level - 1) * LEVEL_STRIDE;
124}
125
126static inline int pfn_level_offset(u64 pfn, int level)
127{
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129}
130
131static inline u64 level_mask(int level)
132{
133 return -1ULL << level_to_offset_bits(level);
134}
135
136static inline u64 level_size(int level)
137{
138 return 1ULL << level_to_offset_bits(level);
139}
140
141static inline u64 align_to_level(u64 pfn, int level)
142{
143 return (pfn + level_size(level) - 1) & level_mask(level);
144}
145
146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147{
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149}
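/*
 * Editor's note -- worked example for the helpers above (illustrative
 * only): agaw 2 means a 4-level table (agaw_to_level(2) == 4) with a
 * 48-bit address width (30 + 2 * 9); width_to_agaw(48) gives 2 back.
 * For pfn 0x12345, pfn_level_offset(0x12345, 2) is
 * (0x12345 >> 9) & 0x1ff == 0x91, the index into the level-2 table.
 */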
150
 151/* VT-d pages must never be larger than MM pages. Otherwise things
 152 are never going to work. */
153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154{
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156}
157
158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159{
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164 return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168 return page_to_dma_pfn(virt_to_page(p));
169}
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
177/*
 178 * set to 1 to panic the kernel if VT-d cannot be successfully enabled
179 * (used when kernel is launched w/ TXT)
180 */
181static int force_on = 0;
182int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209}
210
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213 context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218 return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223 context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228 return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233 return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245 context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250 context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255{
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262{
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269{
270 context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275{
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
281 return((c->hi >> 8) & 0xffff);
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286 context->lo = 0;
287 context->hi = 0;
288}
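/*
 * Editor's note -- illustrative summary, not part of the original source:
 * for a legacy (non-scalable) context entry the helpers above pack the
 * fields as: lo bit 0 = present, lo bit 1 = fault-processing disable
 * (cleared by context_set_fault_enable), lo bits 3:2 = translation type,
 * lo bits 63:12 = page-table root, hi bits 2:0 = address width and
 * hi bits 23:8 = domain id. E.g. on a cleared entry,
 * context_set_domain_id(c, 5) followed by context_domain_id(c) returns 5.
 */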
289
290/*
 291 * This domain is a static identity mapping domain.
 292 * 1. This domain creates a static 1:1 mapping of all usable memory.
 293 * 2. It maps to each iommu if successful.
 294 * 3. Each iommu maps to this domain if successful.
295 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299#define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
302
303struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
310};
311
312struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
318};
319
320static LIST_HEAD(dmar_atsr_units);
321static LIST_HEAD(dmar_rmrr_units);
322
323#define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
 326/* number of IOMMUs in the system, used to size and index g_iommus[] */
327static int g_num_of_iommus;
328
329static void domain_exit(struct dmar_domain *domain);
330static void domain_remove_dev_info(struct dmar_domain *domain);
331static void dmar_remove_one_dev_info(struct device *dev);
332static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333static int intel_iommu_attach_device(struct iommu_domain *domain,
334 struct device *dev);
335static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 dma_addr_t iova);
337
338#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339int dmar_disabled = 0;
340#else
341int dmar_disabled = 1;
342#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345int intel_iommu_sm = 1;
346#else
347int intel_iommu_sm;
348#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350int intel_iommu_enabled = 0;
351EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353static int dmar_map_gfx = 1;
354static int dmar_forcedac;
355static int intel_iommu_strict;
356static int intel_iommu_superpage = 1;
357static int iommu_identity_mapping;
358static int intel_no_bounce;
359static int iommu_skip_te_disable;
360
361#define IDENTMAP_GFX 2
362#define IDENTMAP_AZALIA 4
363
364int intel_iommu_gfx_mapped;
365EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368struct device_domain_info *get_domain_info(struct device *dev)
369{
370 struct device_domain_info *info;
371
372 if (!dev)
373 return NULL;
374
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 return NULL;
378
379 return info;
380}
381
382DEFINE_SPINLOCK(device_domain_lock);
383static LIST_HEAD(device_domain_list);
384
385#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
387
388/*
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
391 */
392int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 void *data), void *data)
394{
395 int ret = 0;
396 unsigned long flags;
397 struct device_domain_info *info;
398
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
402 if (ret) {
403 spin_unlock_irqrestore(&device_domain_lock, flags);
404 return ret;
405 }
406 }
407 spin_unlock_irqrestore(&device_domain_lock, flags);
408
409 return 0;
410}
411
412const struct iommu_ops intel_iommu_ops;
413
414static bool translation_pre_enabled(struct intel_iommu *iommu)
415{
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417}
418
419static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420{
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422}
423
424static void init_translation_status(struct intel_iommu *iommu)
425{
426 u32 gsts;
427
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431}
432
433static int __init intel_iommu_setup(char *str)
434{
435 if (!str)
436 return -EINVAL;
437 while (*str) {
438 if (!strncmp(str, "on", 2)) {
439 dmar_disabled = 0;
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
442 dmar_disabled = 1;
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
446 dmar_map_gfx = 0;
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
450 dmar_forcedac = 1;
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
459 intel_iommu_sm = 1;
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 intel_no_bounce = 1;
466 }
467
468 str += strcspn(str, ",");
469 while (*str == ',')
470 str++;
471 }
472 return 0;
473}
474__setup("intel_iommu=", intel_iommu_setup);
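/*
 * Editor's note -- usage example only: the options parsed above are
 * combined as a comma-separated list on the kernel command line, e.g.
 *
 *     intel_iommu=on,sm_on,strict
 *
 * which enables the IOMMU, enables scalable mode and disables batched
 * IOTLB flushing.
 */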
475
476static struct kmem_cache *iommu_domain_cache;
477static struct kmem_cache *iommu_devinfo_cache;
478
479static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480{
481 struct dmar_domain **domains;
482 int idx = did >> 8;
483
484 domains = iommu->domains[idx];
485 if (!domains)
486 return NULL;
487
488 return domains[did & 0xff];
489}
490
491static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 struct dmar_domain *domain)
493{
494 struct dmar_domain **domains;
495 int idx = did >> 8;
496
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 }
501
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
504 return;
505 else
506 domains[did & 0xff] = domain;
507}
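/*
 * Editor's note -- illustrative, not part of the original source: domain
 * pointers are kept in a two-level array indexed by domain id, with 256
 * entries per second-level page allocated on demand; e.g. did 0x1234 is
 * stored at iommu->domains[0x12][0x34].
 */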
508
509void *alloc_pgtable_page(int node)
510{
511 struct page *page;
512 void *vaddr = NULL;
513
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 if (page)
516 vaddr = page_address(page);
517 return vaddr;
518}
519
520void free_pgtable_page(void *vaddr)
521{
522 free_page((unsigned long)vaddr);
523}
524
525static inline void *alloc_domain_mem(void)
526{
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528}
529
530static void free_domain_mem(void *vaddr)
531{
532 kmem_cache_free(iommu_domain_cache, vaddr);
533}
534
535static inline void * alloc_devinfo_mem(void)
536{
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538}
539
540static inline void free_devinfo_mem(void *vaddr)
541{
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
543}
544
545static inline int domain_type_is_si(struct dmar_domain *domain)
546{
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548}
549
550static inline bool domain_use_first_level(struct dmar_domain *domain)
551{
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553}
554
555static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
557{
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561}
562
563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564{
565 unsigned long sagaw;
566 int agaw = -1;
567
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
573 }
574
575 return agaw;
576}
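/*
 * Editor's note -- worked example (illustrative only): if cap_sagaw()
 * reports 0x4 (only bit 2 set, i.e. only 4-level tables supported) and
 * max_gaw is 57, the loop starts at width_to_agaw(57) == 3, finds bit 3
 * clear, steps down to agaw 2 and returns it; agaw_to_width(2) == 48 bits.
 */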
577
578/*
579 * Calculate max SAGAW for each iommu.
580 */
581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582{
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584}
585
586/*
 587 * Calculate agaw for each iommu.
 588 * "SAGAW" may differ across iommus; use a default agaw, and fall back
 589 * to a smaller supported agaw for iommus that don't support the default.
590 */
591int iommu_calculate_agaw(struct intel_iommu *iommu)
592{
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594}
595
 596/* This function only returns a single iommu in a domain */
597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598{
599 int iommu_id;
600
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
604
605 for_each_domain_iommu(iommu_id, domain)
606 break;
607
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
610
611 return g_iommus[iommu_id];
612}
613
614static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615{
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618}
619
620static void domain_update_iommu_coherency(struct dmar_domain *domain)
621{
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
624 bool found = false;
625 int i;
626
627 domain->iommu_coherency = 1;
628
629 for_each_domain_iommu(i, domain) {
630 found = true;
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
633 break;
634 }
635 }
636 if (found)
637 return;
638
639 /* No hardware attached; use lowest common denominator */
640 rcu_read_lock();
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
644 break;
645 }
646 }
647 rcu_read_unlock();
648}
649
650static int domain_update_iommu_snooping(struct intel_iommu *skip)
651{
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
654 int ret = 1;
655
656 rcu_read_lock();
657 for_each_active_iommu(iommu, drhd) {
658 if (iommu != skip) {
659 if (!ecap_sc_support(iommu->ecap)) {
660 ret = 0;
661 break;
662 }
663 }
664 }
665 rcu_read_unlock();
666
667 return ret;
668}
669
670static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 struct intel_iommu *skip)
672{
673 struct dmar_drhd_unit *drhd;
674 struct intel_iommu *iommu;
675 int mask = 0x3;
676
677 if (!intel_iommu_superpage) {
678 return 0;
679 }
680
681 /* set iommu_superpage to the smallest common denominator */
682 rcu_read_lock();
683 for_each_active_iommu(iommu, drhd) {
684 if (iommu != skip) {
685 if (domain && domain_use_first_level(domain)) {
686 if (!cap_fl1gp_support(iommu->cap))
687 mask = 0x1;
688 } else {
689 mask &= cap_super_page_val(iommu->cap);
690 }
691
692 if (!mask)
693 break;
694 }
695 }
696 rcu_read_unlock();
697
698 return fls(mask);
699}
700
701/* Some capabilities may be different across iommus */
702static void domain_update_iommu_cap(struct dmar_domain *domain)
703{
704 domain_update_iommu_coherency(domain);
705 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
706 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
707}
708
709struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
710 u8 devfn, int alloc)
711{
712 struct root_entry *root = &iommu->root_entry[bus];
713 struct context_entry *context;
714 u64 *entry;
715
716 entry = &root->lo;
717 if (sm_supported(iommu)) {
718 if (devfn >= 0x80) {
719 devfn -= 0x80;
720 entry = &root->hi;
721 }
722 devfn *= 2;
723 }
724 if (*entry & 1)
725 context = phys_to_virt(*entry & VTD_PAGE_MASK);
726 else {
727 unsigned long phy_addr;
728 if (!alloc)
729 return NULL;
730
731 context = alloc_pgtable_page(iommu->node);
732 if (!context)
733 return NULL;
734
735 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
736 phy_addr = virt_to_phys((void *)context);
737 *entry = phy_addr | 1;
738 __iommu_flush_cache(iommu, entry, sizeof(*entry));
739 }
740 return &context[devfn];
741}
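/*
 * Editor's note -- illustrative, not part of the original source: in
 * scalable mode each context entry is twice the legacy size, so one 4KiB
 * context table covers only 128 devfns. Devfns 0x80-0xff are redirected
 * to the table referenced by root->hi, and the remaining devfn is doubled
 * because each scalable-mode entry occupies two legacy-sized slots; e.g.
 * devfn 0x83 ends up at index 6 of the upper context table.
 */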
742
743static bool attach_deferred(struct device *dev)
744{
745 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
746}
747
748/**
749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750 * sub-hierarchy of a candidate PCI-PCI bridge
751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752 * @bridge: the candidate PCI-PCI bridge
753 *
754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755 */
756static bool
757is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758{
759 struct pci_dev *pdev, *pbridge;
760
761 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762 return false;
763
764 pdev = to_pci_dev(dev);
765 pbridge = to_pci_dev(bridge);
766
767 if (pbridge->subordinate &&
768 pbridge->subordinate->number <= pdev->bus->number &&
769 pbridge->subordinate->busn_res.end >= pdev->bus->number)
770 return true;
771
772 return false;
773}
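/*
 * Editor's note -- worked example (illustrative only): if @bridge has a
 * subordinate bus range of 3-7 and @dev sits on bus 5, the range check
 * above succeeds and the function returns true; a device on bus 2 or
 * bus 8 would return false.
 */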
774
775static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
776{
777 struct dmar_drhd_unit *drhd;
778 u32 vtbar;
779 int rc;
780
781 /* We know that this device on this chipset has its own IOMMU.
782 * If we find it under a different IOMMU, then the BIOS is lying
783 * to us. Hope that the IOMMU for this device is actually
784 * disabled, and it needs no translation...
785 */
786 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
787 if (rc) {
788 /* "can't" happen */
789 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
790 return false;
791 }
792 vtbar &= 0xffff0000;
793
 794 /* we know that this iommu should be at offset 0xa000 from vtbar */
795 drhd = dmar_find_matched_drhd_unit(pdev);
796 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
797 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
798 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
799 return true;
800 }
801
802 return false;
803}
804
805static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
806{
807 if (!iommu || iommu->drhd->ignored)
808 return true;
809
810 if (dev_is_pci(dev)) {
811 struct pci_dev *pdev = to_pci_dev(dev);
812
813 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
814 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
815 quirk_ioat_snb_local_iommu(pdev))
816 return true;
817 }
818
819 return false;
820}
821
822struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
823{
824 struct dmar_drhd_unit *drhd = NULL;
825 struct pci_dev *pdev = NULL;
826 struct intel_iommu *iommu;
827 struct device *tmp;
828 u16 segment = 0;
829 int i;
830
831 if (!dev)
832 return NULL;
833
834 if (dev_is_pci(dev)) {
835 struct pci_dev *pf_pdev;
836
837 pdev = pci_real_dma_dev(to_pci_dev(dev));
838
839 /* VFs aren't listed in scope tables; we need to look up
840 * the PF instead to find the IOMMU. */
841 pf_pdev = pci_physfn(pdev);
842 dev = &pf_pdev->dev;
843 segment = pci_domain_nr(pdev->bus);
844 } else if (has_acpi_companion(dev))
845 dev = &ACPI_COMPANION(dev)->dev;
846
847 rcu_read_lock();
848 for_each_iommu(iommu, drhd) {
849 if (pdev && segment != drhd->segment)
850 continue;
851
852 for_each_active_dev_scope(drhd->devices,
853 drhd->devices_cnt, i, tmp) {
854 if (tmp == dev) {
855 /* For a VF use its original BDF# not that of the PF
856 * which we used for the IOMMU lookup. Strictly speaking
857 * we could do this for all PCI devices; we only need to
858 * get the BDF# from the scope table for ACPI matches. */
859 if (pdev && pdev->is_virtfn)
860 goto got_pdev;
861
862 if (bus && devfn) {
863 *bus = drhd->devices[i].bus;
864 *devfn = drhd->devices[i].devfn;
865 }
866 goto out;
867 }
868
869 if (is_downstream_to_pci_bridge(dev, tmp))
870 goto got_pdev;
871 }
872
873 if (pdev && drhd->include_all) {
874 got_pdev:
875 if (bus && devfn) {
876 *bus = pdev->bus->number;
877 *devfn = pdev->devfn;
878 }
879 goto out;
880 }
881 }
882 iommu = NULL;
883 out:
884 if (iommu_is_dummy(iommu, dev))
885 iommu = NULL;
886
887 rcu_read_unlock();
888
889 return iommu;
890}
891
892static void domain_flush_cache(struct dmar_domain *domain,
893 void *addr, int size)
894{
895 if (!domain->iommu_coherency)
896 clflush_cache_range(addr, size);
897}
898
899static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
900{
901 struct context_entry *context;
902 int ret = 0;
903 unsigned long flags;
904
905 spin_lock_irqsave(&iommu->lock, flags);
906 context = iommu_context_addr(iommu, bus, devfn, 0);
907 if (context)
908 ret = context_present(context);
909 spin_unlock_irqrestore(&iommu->lock, flags);
910 return ret;
911}
912
913static void free_context_table(struct intel_iommu *iommu)
914{
915 int i;
916 unsigned long flags;
917 struct context_entry *context;
918
919 spin_lock_irqsave(&iommu->lock, flags);
920 if (!iommu->root_entry) {
921 goto out;
922 }
923 for (i = 0; i < ROOT_ENTRY_NR; i++) {
924 context = iommu_context_addr(iommu, i, 0, 0);
925 if (context)
926 free_pgtable_page(context);
927
928 if (!sm_supported(iommu))
929 continue;
930
931 context = iommu_context_addr(iommu, i, 0x80, 0);
932 if (context)
933 free_pgtable_page(context);
934
935 }
936 free_pgtable_page(iommu->root_entry);
937 iommu->root_entry = NULL;
938out:
939 spin_unlock_irqrestore(&iommu->lock, flags);
940}
941
942static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
943 unsigned long pfn, int *target_level)
944{
945 struct dma_pte *parent, *pte;
946 int level = agaw_to_level(domain->agaw);
947 int offset;
948
949 BUG_ON(!domain->pgd);
950
951 if (!domain_pfn_supported(domain, pfn))
952 /* Address beyond IOMMU's addressing capabilities. */
953 return NULL;
954
955 parent = domain->pgd;
956
957 while (1) {
958 void *tmp_page;
959
960 offset = pfn_level_offset(pfn, level);
961 pte = &parent[offset];
962 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
963 break;
964 if (level == *target_level)
965 break;
966
967 if (!dma_pte_present(pte)) {
968 uint64_t pteval;
969
970 tmp_page = alloc_pgtable_page(domain->nid);
971
972 if (!tmp_page)
973 return NULL;
974
975 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
976 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
977 if (domain_use_first_level(domain))
978 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
979 if (cmpxchg64(&pte->val, 0ULL, pteval))
980 /* Someone else set it while we were thinking; use theirs. */
981 free_pgtable_page(tmp_page);
982 else
983 domain_flush_cache(domain, pte, sizeof(*pte));
984 }
985 if (level == 1)
986 break;
987
988 parent = phys_to_virt(dma_pte_addr(pte));
989 level--;
990 }
991
992 if (!*target_level)
993 *target_level = level;
994
995 return pte;
996}
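/*
 * Editor's note -- illustrative, not part of the original source: callers
 * pass *target_level == 1 to get (and if necessary build) the leaf 4KiB
 * PTE for pfn, a larger value to stop at a superpage level, or 0 to only
 * look up the deepest existing entry without allocating; in the latter
 * case *target_level is updated to the level actually reached.
 */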
997
 998/* return the address's pte at a specific level */
999static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000 unsigned long pfn,
1001 int level, int *large_page)
1002{
1003 struct dma_pte *parent, *pte;
1004 int total = agaw_to_level(domain->agaw);
1005 int offset;
1006
1007 parent = domain->pgd;
1008 while (level <= total) {
1009 offset = pfn_level_offset(pfn, total);
1010 pte = &parent[offset];
1011 if (level == total)
1012 return pte;
1013
1014 if (!dma_pte_present(pte)) {
1015 *large_page = total;
1016 break;
1017 }
1018
1019 if (dma_pte_superpage(pte)) {
1020 *large_page = total;
1021 return pte;
1022 }
1023
1024 parent = phys_to_virt(dma_pte_addr(pte));
1025 total--;
1026 }
1027 return NULL;
1028}
1029
 1030/* clear last level pte; a tlb flush should follow */
1031static void dma_pte_clear_range(struct dmar_domain *domain,
1032 unsigned long start_pfn,
1033 unsigned long last_pfn)
1034{
1035 unsigned int large_page;
1036 struct dma_pte *first_pte, *pte;
1037
1038 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040 BUG_ON(start_pfn > last_pfn);
1041
 1042 /* we don't need a lock here; nobody else touches the iova range */
1043 do {
1044 large_page = 1;
1045 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046 if (!pte) {
1047 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048 continue;
1049 }
1050 do {
1051 dma_clear_pte(pte);
1052 start_pfn += lvl_to_nr_pages(large_page);
1053 pte++;
1054 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055
1056 domain_flush_cache(domain, first_pte,
1057 (void *)pte - (void *)first_pte);
1058
1059 } while (start_pfn && start_pfn <= last_pfn);
1060}
1061
1062static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063 int retain_level, struct dma_pte *pte,
1064 unsigned long pfn, unsigned long start_pfn,
1065 unsigned long last_pfn)
1066{
1067 pfn = max(start_pfn, pfn);
1068 pte = &pte[pfn_level_offset(pfn, level)];
1069
1070 do {
1071 unsigned long level_pfn;
1072 struct dma_pte *level_pte;
1073
1074 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075 goto next;
1076
1077 level_pfn = pfn & level_mask(level);
1078 level_pte = phys_to_virt(dma_pte_addr(pte));
1079
1080 if (level > 2) {
1081 dma_pte_free_level(domain, level - 1, retain_level,
1082 level_pte, level_pfn, start_pfn,
1083 last_pfn);
1084 }
1085
1086 /*
1087 * Free the page table if we're below the level we want to
1088 * retain and the range covers the entire table.
1089 */
1090 if (level < retain_level && !(start_pfn > level_pfn ||
1091 last_pfn < level_pfn + level_size(level) - 1)) {
1092 dma_clear_pte(pte);
1093 domain_flush_cache(domain, pte, sizeof(*pte));
1094 free_pgtable_page(level_pte);
1095 }
1096next:
1097 pfn += level_size(level);
1098 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099}
1100
1101/*
1102 * clear last level (leaf) ptes and free page table pages below the
1103 * level we wish to keep intact.
1104 */
1105static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106 unsigned long start_pfn,
1107 unsigned long last_pfn,
1108 int retain_level)
1109{
1110 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112 BUG_ON(start_pfn > last_pfn);
1113
1114 dma_pte_clear_range(domain, start_pfn, last_pfn);
1115
 1116 /* We don't need a lock here; nobody else touches the iova range */
1117 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118 domain->pgd, 0, start_pfn, last_pfn);
1119
1120 /* free pgd */
1121 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122 free_pgtable_page(domain->pgd);
1123 domain->pgd = NULL;
1124 }
1125}
1126
1127/* When a page at a given level is being unlinked from its parent, we don't
1128 need to *modify* it at all. All we need to do is make a list of all the
1129 pages which can be freed just as soon as we've flushed the IOTLB and we
1130 know the hardware page-walk will no longer touch them.
1131 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132 be freed. */
1133static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134 int level, struct dma_pte *pte,
1135 struct page *freelist)
1136{
1137 struct page *pg;
1138
1139 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140 pg->freelist = freelist;
1141 freelist = pg;
1142
1143 if (level == 1)
1144 return freelist;
1145
1146 pte = page_address(pg);
1147 do {
1148 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149 freelist = dma_pte_list_pagetables(domain, level - 1,
1150 pte, freelist);
1151 pte++;
1152 } while (!first_pte_in_page(pte));
1153
1154 return freelist;
1155}
1156
1157static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158 struct dma_pte *pte, unsigned long pfn,
1159 unsigned long start_pfn,
1160 unsigned long last_pfn,
1161 struct page *freelist)
1162{
1163 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164
1165 pfn = max(start_pfn, pfn);
1166 pte = &pte[pfn_level_offset(pfn, level)];
1167
1168 do {
1169 unsigned long level_pfn;
1170
1171 if (!dma_pte_present(pte))
1172 goto next;
1173
1174 level_pfn = pfn & level_mask(level);
1175
1176 /* If range covers entire pagetable, free it */
1177 if (start_pfn <= level_pfn &&
1178 last_pfn >= level_pfn + level_size(level) - 1) {
 1179 /* These subordinate page tables are going away entirely. Don't
1180 bother to clear them; we're just going to *free* them. */
1181 if (level > 1 && !dma_pte_superpage(pte))
1182 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183
1184 dma_clear_pte(pte);
1185 if (!first_pte)
1186 first_pte = pte;
1187 last_pte = pte;
1188 } else if (level > 1) {
1189 /* Recurse down into a level that isn't *entirely* obsolete */
1190 freelist = dma_pte_clear_level(domain, level - 1,
1191 phys_to_virt(dma_pte_addr(pte)),
1192 level_pfn, start_pfn, last_pfn,
1193 freelist);
1194 }
1195next:
1196 pfn += level_size(level);
1197 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198
1199 if (first_pte)
1200 domain_flush_cache(domain, first_pte,
1201 (void *)++last_pte - (void *)first_pte);
1202
1203 return freelist;
1204}
1205
1206/* We can't just free the pages because the IOMMU may still be walking
1207 the page tables, and may have cached the intermediate levels. The
1208 pages can only be freed after the IOTLB flush has been done. */
1209static struct page *domain_unmap(struct dmar_domain *domain,
1210 unsigned long start_pfn,
1211 unsigned long last_pfn)
1212{
1213 struct page *freelist;
1214
1215 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217 BUG_ON(start_pfn > last_pfn);
1218
 1219 /* we don't need a lock here; nobody else touches the iova range */
1220 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221 domain->pgd, 0, start_pfn, last_pfn, NULL);
1222
1223 /* free pgd */
1224 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225 struct page *pgd_page = virt_to_page(domain->pgd);
1226 pgd_page->freelist = freelist;
1227 freelist = pgd_page;
1228
1229 domain->pgd = NULL;
1230 }
1231
1232 return freelist;
1233}
1234
1235static void dma_free_pagelist(struct page *freelist)
1236{
1237 struct page *pg;
1238
1239 while ((pg = freelist)) {
1240 freelist = pg->freelist;
1241 free_pgtable_page(page_address(pg));
1242 }
1243}
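/*
 * Editor's note -- illustrative, not part of the original source: the
 * intended sequence is domain_unmap() to detach the page tables and
 * collect them on a freelist, then an IOTLB flush, and only then
 * dma_free_pagelist(), so the hardware can never walk a page that has
 * already been freed and reused.
 */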
1244
1245static void iova_entry_free(unsigned long data)
1246{
1247 struct page *freelist = (struct page *)data;
1248
1249 dma_free_pagelist(freelist);
1250}
1251
1252/* iommu handling */
1253static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254{
1255 struct root_entry *root;
1256 unsigned long flags;
1257
1258 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259 if (!root) {
1260 pr_err("Allocating root entry for %s failed\n",
1261 iommu->name);
1262 return -ENOMEM;
1263 }
1264
1265 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1266
1267 spin_lock_irqsave(&iommu->lock, flags);
1268 iommu->root_entry = root;
1269 spin_unlock_irqrestore(&iommu->lock, flags);
1270
1271 return 0;
1272}
1273
1274static void iommu_set_root_entry(struct intel_iommu *iommu)
1275{
1276 u64 addr;
1277 u32 sts;
1278 unsigned long flag;
1279
1280 addr = virt_to_phys(iommu->root_entry);
1281 if (sm_supported(iommu))
1282 addr |= DMA_RTADDR_SMT;
1283
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286
1287 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288
 1289 /* Make sure hardware completes it */
1290 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291 readl, (sts & DMA_GSTS_RTPS), sts);
1292
1293 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294}
1295
1296void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297{
1298 u32 val;
1299 unsigned long flag;
1300
1301 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302 return;
1303
1304 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306
 1307 /* Make sure hardware completes it */
1308 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309 readl, (!(val & DMA_GSTS_WBFS)), val);
1310
1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312}
1313
 1314/* return value determines if we need a write buffer flush */
1315static void __iommu_flush_context(struct intel_iommu *iommu,
1316 u16 did, u16 source_id, u8 function_mask,
1317 u64 type)
1318{
1319 u64 val = 0;
1320 unsigned long flag;
1321
1322 switch (type) {
1323 case DMA_CCMD_GLOBAL_INVL:
1324 val = DMA_CCMD_GLOBAL_INVL;
1325 break;
1326 case DMA_CCMD_DOMAIN_INVL:
1327 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328 break;
1329 case DMA_CCMD_DEVICE_INVL:
1330 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332 break;
1333 default:
1334 BUG();
1335 }
1336 val |= DMA_CCMD_ICC;
1337
1338 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340
 1341 /* Make sure hardware completes it */
1342 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344
1345 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346}
1347
 1348/* return value determines if we need a write buffer flush */
1349static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350 u64 addr, unsigned int size_order, u64 type)
1351{
1352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353 u64 val = 0, val_iva = 0;
1354 unsigned long flag;
1355
1356 switch (type) {
1357 case DMA_TLB_GLOBAL_FLUSH:
 1358 /* global flush doesn't need to set IVA_REG */
1359 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360 break;
1361 case DMA_TLB_DSI_FLUSH:
1362 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363 break;
1364 case DMA_TLB_PSI_FLUSH:
1365 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366 /* IH bit is passed in as part of address */
1367 val_iva = size_order | addr;
1368 break;
1369 default:
1370 BUG();
1371 }
1372 /* Note: set drain read/write */
1373#if 0
1374 /*
 1375 * This is probably meant to be extra safe. It looks like we can
 1376 * ignore it without any impact.
1377 */
1378 if (cap_read_drain(iommu->cap))
1379 val |= DMA_TLB_READ_DRAIN;
1380#endif
1381 if (cap_write_drain(iommu->cap))
1382 val |= DMA_TLB_WRITE_DRAIN;
1383
1384 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 /* Note: Only uses first TLB reg currently */
1386 if (val_iva)
1387 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389
 1390 /* Make sure hardware completes it */
1391 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393
1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395
1396 /* check IOTLB invalidation granularity */
1397 if (DMA_TLB_IAIG(val) == 0)
1398 pr_err("Flush IOTLB failed\n");
1399 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400 pr_debug("TLB flush request %Lx, actual %Lx\n",
1401 (unsigned long long)DMA_TLB_IIRG(type),
1402 (unsigned long long)DMA_TLB_IAIG(val));
1403}
1404
1405static struct device_domain_info *
 1406iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1407 u8 bus, u8 devfn)
1408{
1409 struct device_domain_info *info;
1410
1411 assert_spin_locked(&device_domain_lock);
1412
1413 if (!iommu->qi)
1414 return NULL;
1415
1416 list_for_each_entry(info, &domain->devices, link)
1417 if (info->iommu == iommu && info->bus == bus &&
1418 info->devfn == devfn) {
1419 if (info->ats_supported && info->dev)
1420 return info;
1421 break;
1422 }
1423
1424 return NULL;
1425}
1426
1427static void domain_update_iotlb(struct dmar_domain *domain)
1428{
1429 struct device_domain_info *info;
1430 bool has_iotlb_device = false;
1431
1432 assert_spin_locked(&device_domain_lock);
1433
1434 list_for_each_entry(info, &domain->devices, link) {
1435 struct pci_dev *pdev;
1436
1437 if (!info->dev || !dev_is_pci(info->dev))
1438 continue;
1439
1440 pdev = to_pci_dev(info->dev);
1441 if (pdev->ats_enabled) {
1442 has_iotlb_device = true;
1443 break;
1444 }
1445 }
1446
1447 domain->has_iotlb_device = has_iotlb_device;
1448}
1449
1450static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451{
1452 struct pci_dev *pdev;
1453
1454 assert_spin_locked(&device_domain_lock);
1455
1456 if (!info || !dev_is_pci(info->dev))
1457 return;
1458
1459 pdev = to_pci_dev(info->dev);
 1460 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1461 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1462 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1463 * reserved, which should be set to 0.
1464 */
1465 if (!ecap_dit(info->iommu->ecap))
1466 info->pfsid = 0;
1467 else {
1468 struct pci_dev *pf_pdev;
1469
1470 /* pdev will be returned if device is not a vf */
1471 pf_pdev = pci_physfn(pdev);
1472 info->pfsid = pci_dev_id(pf_pdev);
1473 }
1474
1475#ifdef CONFIG_INTEL_IOMMU_SVM
1476 /* The PCIe spec, in its wisdom, declares that the behaviour of
1477 the device if you enable PASID support after ATS support is
1478 undefined. So always enable PASID support on devices which
1479 have it, even if we can't yet know if we're ever going to
1480 use it. */
1481 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482 info->pasid_enabled = 1;
1483
1484 if (info->pri_supported &&
1485 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1486 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487 info->pri_enabled = 1;
1488#endif
1489 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491 info->ats_enabled = 1;
1492 domain_update_iotlb(info->domain);
1493 info->ats_qdep = pci_ats_queue_depth(pdev);
1494 }
1495}
1496
1497static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498{
1499 struct pci_dev *pdev;
1500
1501 assert_spin_locked(&device_domain_lock);
1502
1503 if (!dev_is_pci(info->dev))
1504 return;
1505
1506 pdev = to_pci_dev(info->dev);
1507
1508 if (info->ats_enabled) {
1509 pci_disable_ats(pdev);
1510 info->ats_enabled = 0;
1511 domain_update_iotlb(info->domain);
1512 }
1513#ifdef CONFIG_INTEL_IOMMU_SVM
1514 if (info->pri_enabled) {
1515 pci_disable_pri(pdev);
1516 info->pri_enabled = 0;
1517 }
1518 if (info->pasid_enabled) {
1519 pci_disable_pasid(pdev);
1520 info->pasid_enabled = 0;
1521 }
1522#endif
1523}
1524
1525static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526 u64 addr, unsigned mask)
1527{
1528 u16 sid, qdep;
1529 unsigned long flags;
1530 struct device_domain_info *info;
1531
1532 if (!domain->has_iotlb_device)
1533 return;
1534
1535 spin_lock_irqsave(&device_domain_lock, flags);
1536 list_for_each_entry(info, &domain->devices, link) {
1537 if (!info->ats_enabled)
1538 continue;
1539
1540 sid = info->bus << 8 | info->devfn;
1541 qdep = info->ats_qdep;
1542 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543 qdep, addr, mask);
1544 }
1545 spin_unlock_irqrestore(&device_domain_lock, flags);
1546}
1547
1548static void domain_flush_piotlb(struct intel_iommu *iommu,
1549 struct dmar_domain *domain,
1550 u64 addr, unsigned long npages, bool ih)
1551{
1552 u16 did = domain->iommu_did[iommu->seq_id];
1553
1554 if (domain->default_pasid)
1555 qi_flush_piotlb(iommu, did, domain->default_pasid,
1556 addr, npages, ih);
1557
1558 if (!list_empty(&domain->devices))
1559 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560}
1561
1562static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563 struct dmar_domain *domain,
1564 unsigned long pfn, unsigned int pages,
1565 int ih, int map)
1566{
1567 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569 u16 did = domain->iommu_did[iommu->seq_id];
1570
1571 BUG_ON(pages == 0);
1572
1573 if (ih)
1574 ih = 1 << 6;
1575
1576 if (domain_use_first_level(domain)) {
1577 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578 } else {
1579 /*
 1580 * Fall back to domain-selective flush if there is no PSI support or
1581 * the size is too big. PSI requires page size to be 2 ^ x,
1582 * and the base address is naturally aligned to the size.
1583 */
1584 if (!cap_pgsel_inv(iommu->cap) ||
1585 mask > cap_max_amask_val(iommu->cap))
1586 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587 DMA_TLB_DSI_FLUSH);
1588 else
1589 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590 DMA_TLB_PSI_FLUSH);
1591 }
1592
1593 /*
1594 * In caching mode, changes of pages from non-present to present require
1595 * flush. However, device IOTLB doesn't need to be flushed in this case.
1596 */
1597 if (!cap_caching_mode(iommu->cap) || !map)
1598 iommu_flush_dev_iotlb(domain, addr, mask);
1599}
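/*
 * Editor's note -- worked example (illustrative only): flushing 5 pages
 * gives mask = ilog2(__roundup_pow_of_two(5)) = 3, so the PSI covers
 * 2^3 = 8 pages aligned on an 8-page boundary; if the IOMMU lacks
 * page-selective invalidation or 3 exceeds cap_max_amask_val(), the code
 * falls back to a domain-selective flush instead.
 */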
1600
1601/* Notification for newly created mappings */
1602static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603 struct dmar_domain *domain,
1604 unsigned long pfn, unsigned int pages)
1605{
1606 /*
1607 * It's a non-present to present mapping. Only flush if caching mode
1608 * and second level.
1609 */
1610 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612 else
1613 iommu_flush_write_buffer(iommu);
1614}
1615
1616static void iommu_flush_iova(struct iova_domain *iovad)
1617{
1618 struct dmar_domain *domain;
1619 int idx;
1620
1621 domain = container_of(iovad, struct dmar_domain, iovad);
1622
1623 for_each_domain_iommu(idx, domain) {
1624 struct intel_iommu *iommu = g_iommus[idx];
1625 u16 did = domain->iommu_did[iommu->seq_id];
1626
1627 if (domain_use_first_level(domain))
1628 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629 else
1630 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631 DMA_TLB_DSI_FLUSH);
1632
1633 if (!cap_caching_mode(iommu->cap))
1634 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635 0, MAX_AGAW_PFN_WIDTH);
1636 }
1637}
1638
1639static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640{
1641 u32 pmen;
1642 unsigned long flags;
1643
1644 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645 return;
1646
1647 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649 pmen &= ~DMA_PMEN_EPM;
1650 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651
1652 /* wait for the protected region status bit to clear */
1653 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654 readl, !(pmen & DMA_PMEN_PRS), pmen);
1655
1656 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657}
1658
1659static void iommu_enable_translation(struct intel_iommu *iommu)
1660{
1661 u32 sts;
1662 unsigned long flags;
1663
1664 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665 iommu->gcmd |= DMA_GCMD_TE;
1666 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667
 1668 /* Make sure hardware completes it */
1669 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670 readl, (sts & DMA_GSTS_TES), sts);
1671
1672 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673}
1674
1675static void iommu_disable_translation(struct intel_iommu *iommu)
1676{
1677 u32 sts;
1678 unsigned long flag;
1679
1680 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682 return;
1683
1684 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685 iommu->gcmd &= ~DMA_GCMD_TE;
1686 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
 1688 /* Make sure hardware completes it */
1689 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690 readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693}
1694
1695static int iommu_init_domains(struct intel_iommu *iommu)
1696{
1697 u32 ndomains, nlongs;
1698 size_t size;
1699
1700 ndomains = cap_ndoms(iommu->cap);
1701 pr_debug("%s: Number of Domains supported <%d>\n",
1702 iommu->name, ndomains);
1703 nlongs = BITS_TO_LONGS(ndomains);
1704
1705 spin_lock_init(&iommu->lock);
1706
1707 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708 if (!iommu->domain_ids) {
1709 pr_err("%s: Allocating domain id array failed\n",
1710 iommu->name);
1711 return -ENOMEM;
1712 }
1713
1714 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715 iommu->domains = kzalloc(size, GFP_KERNEL);
1716
1717 if (iommu->domains) {
1718 size = 256 * sizeof(struct dmar_domain *);
1719 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720 }
1721
1722 if (!iommu->domains || !iommu->domains[0]) {
1723 pr_err("%s: Allocating domain array failed\n",
1724 iommu->name);
1725 kfree(iommu->domain_ids);
1726 kfree(iommu->domains);
1727 iommu->domain_ids = NULL;
1728 iommu->domains = NULL;
1729 return -ENOMEM;
1730 }
1731
1732 /*
1733 * If Caching mode is set, then invalid translations are tagged
1734 * with domain-id 0, hence we need to pre-allocate it. We also
1735 * use domain-id 0 as a marker for non-allocated domain-id, so
1736 * make sure it is not used for a real domain.
1737 */
1738 set_bit(0, iommu->domain_ids);
1739
1740 /*
 1741 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1742 * entry for first-level or pass-through translation modes should
1743 * be programmed with a domain id different from those used for
1744 * second-level or nested translation. We reserve a domain id for
1745 * this purpose.
1746 */
1747 if (sm_supported(iommu))
1748 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749
1750 return 0;
1751}
1752
1753static void disable_dmar_iommu(struct intel_iommu *iommu)
1754{
1755 struct device_domain_info *info, *tmp;
1756 unsigned long flags;
1757
1758 if (!iommu->domains || !iommu->domain_ids)
1759 return;
1760
1761 spin_lock_irqsave(&device_domain_lock, flags);
1762 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763 if (info->iommu != iommu)
1764 continue;
1765
1766 if (!info->dev || !info->domain)
1767 continue;
1768
1769 __dmar_remove_one_dev_info(info);
1770 }
1771 spin_unlock_irqrestore(&device_domain_lock, flags);
1772
1773 if (iommu->gcmd & DMA_GCMD_TE)
1774 iommu_disable_translation(iommu);
1775}
1776
1777static void free_dmar_iommu(struct intel_iommu *iommu)
1778{
1779 if ((iommu->domains) && (iommu->domain_ids)) {
1780 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781 int i;
1782
1783 for (i = 0; i < elems; i++)
1784 kfree(iommu->domains[i]);
1785 kfree(iommu->domains);
1786 kfree(iommu->domain_ids);
1787 iommu->domains = NULL;
1788 iommu->domain_ids = NULL;
1789 }
1790
1791 g_iommus[iommu->seq_id] = NULL;
1792
1793 /* free context mapping */
1794 free_context_table(iommu);
1795
1796#ifdef CONFIG_INTEL_IOMMU_SVM
1797 if (pasid_supported(iommu)) {
1798 if (ecap_prs(iommu->ecap))
1799 intel_svm_finish_prq(iommu);
1800 }
1801 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802 ioasid_unregister_allocator(&iommu->pasid_allocator);
1803
1804#endif
1805}
1806
1807/*
1808 * Check and return whether first level is used by default for
1809 * DMA translation.
1810 */
1811static bool first_level_by_default(void)
1812{
1813 struct dmar_drhd_unit *drhd;
1814 struct intel_iommu *iommu;
1815 static int first_level_support = -1;
1816
1817 if (likely(first_level_support != -1))
1818 return first_level_support;
1819
1820 first_level_support = 1;
1821
1822 rcu_read_lock();
1823 for_each_active_iommu(iommu, drhd) {
1824 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825 first_level_support = 0;
1826 break;
1827 }
1828 }
1829 rcu_read_unlock();
1830
1831 return first_level_support;
1832}
1833
1834static struct dmar_domain *alloc_domain(int flags)
1835{
1836 struct dmar_domain *domain;
1837
1838 domain = alloc_domain_mem();
1839 if (!domain)
1840 return NULL;
1841
1842 memset(domain, 0, sizeof(*domain));
1843 domain->nid = NUMA_NO_NODE;
1844 domain->flags = flags;
1845 if (first_level_by_default())
1846 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847 domain->has_iotlb_device = false;
1848 INIT_LIST_HEAD(&domain->devices);
1849
1850 return domain;
1851}
1852
1853/* Must be called with iommu->lock */
1854static int domain_attach_iommu(struct dmar_domain *domain,
1855 struct intel_iommu *iommu)
1856{
1857 unsigned long ndomains;
1858 int num;
1859
1860 assert_spin_locked(&device_domain_lock);
1861 assert_spin_locked(&iommu->lock);
1862
1863 domain->iommu_refcnt[iommu->seq_id] += 1;
1864 domain->iommu_count += 1;
1865 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866 ndomains = cap_ndoms(iommu->cap);
1867 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1868
1869 if (num >= ndomains) {
1870 pr_err("%s: No free domain ids\n", iommu->name);
1871 domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 domain->iommu_count -= 1;
1873 return -ENOSPC;
1874 }
1875
1876 set_bit(num, iommu->domain_ids);
1877 set_iommu_domain(iommu, num, domain);
1878
1879 domain->iommu_did[iommu->seq_id] = num;
1880 domain->nid = iommu->node;
1881
1882 domain_update_iommu_cap(domain);
1883 }
1884
1885 return 0;
1886}
1887
1888static int domain_detach_iommu(struct dmar_domain *domain,
1889 struct intel_iommu *iommu)
1890{
1891 int num, count;
1892
1893 assert_spin_locked(&device_domain_lock);
1894 assert_spin_locked(&iommu->lock);
1895
1896 domain->iommu_refcnt[iommu->seq_id] -= 1;
1897 count = --domain->iommu_count;
1898 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899 num = domain->iommu_did[iommu->seq_id];
1900 clear_bit(num, iommu->domain_ids);
1901 set_iommu_domain(iommu, num, NULL);
1902
1903 domain_update_iommu_cap(domain);
1904 domain->iommu_did[iommu->seq_id] = 0;
1905 }
1906
1907 return count;
1908}
1909
1910static struct iova_domain reserved_iova_list;
1911static struct lock_class_key reserved_rbtree_key;
1912
1913static int dmar_init_reserved_ranges(void)
1914{
1915 struct pci_dev *pdev = NULL;
1916 struct iova *iova;
1917 int i;
1918
1919 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920
1921 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922 &reserved_rbtree_key);
1923
1924 /* IOAPIC ranges shouldn't be accessed by DMA */
1925 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926 IOVA_PFN(IOAPIC_RANGE_END));
1927 if (!iova) {
1928 pr_err("Reserve IOAPIC range failed\n");
1929 return -ENODEV;
1930 }
1931
1932 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1933 for_each_pci_dev(pdev) {
1934 struct resource *r;
1935
1936 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937 r = &pdev->resource[i];
1938 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939 continue;
1940 iova = reserve_iova(&reserved_iova_list,
1941 IOVA_PFN(r->start),
1942 IOVA_PFN(r->end));
1943 if (!iova) {
1944 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945 return -ENODEV;
1946 }
1947 }
1948 }
1949 return 0;
1950}
1951
1952static inline int guestwidth_to_adjustwidth(int gaw)
1953{
1954 int agaw;
1955 int r = (gaw - 12) % 9;
1956
1957 if (r == 0)
1958 agaw = gaw;
1959 else
1960 agaw = gaw + 9 - r;
1961 if (agaw > 64)
1962 agaw = 64;
1963 return agaw;
1964}
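/*
 * Editor's note -- worked example (illustrative only): the adjusted width
 * is gaw rounded up to 12 plus a multiple of 9 (the page-table stride),
 * capped at 64; e.g. a gaw of 48 stays 48, while a gaw of 50 rounds up
 * to 57.
 */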
1965
1966static void domain_exit(struct dmar_domain *domain)
1967{
1968
1969 /* Remove associated devices and clear attached or cached domains */
1970 domain_remove_dev_info(domain);
1971
1972 /* destroy iovas */
1973 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974 put_iova_domain(&domain->iovad);
1975
1976 if (domain->pgd) {
1977 struct page *freelist;
1978
1979 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980 dma_free_pagelist(freelist);
1981 }
1982
1983 free_domain_mem(domain);
1984}
1985
1986/*
1987 * Get the PASID directory size for scalable mode context entry.
1988 * Value of X in the PDTS field of a scalable mode context entry
 1989 * indicates a PASID directory with 2^(X + 7) entries.
1990 */
1991static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992{
1993 int pds, max_pde;
1994
1995 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997 if (pds < 7)
1998 return 0;
1999
2000 return pds - 7;
2001}
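/*
 * Editor's note -- worked example, assuming PASID_PDE_SHIFT is 6
 * (illustrative only): a table covering 2^20 PASIDs needs
 * 2^20 / 2^6 = 2^14 = 2^(7 + 7) directory entries, so the function
 * returns a PDTS value of 7.
 */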
2002
2003/*
2004 * Set the RID_PASID field of a scalable mode context entry. The
2005 * IOMMU hardware will use the PASID value set in this field for
 2006 * translations of DMA requests without PASID.
2007 */
2008static inline void
2009context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010{
2011 context->hi |= pasid & ((1 << 20) - 1);
2012}
2013
2014/*
2015 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2016 * entry.
2017 */
2018static inline void context_set_sm_dte(struct context_entry *context)
2019{
2020 context->lo |= (1 << 2);
2021}
2022
2023/*
2024 * Set the PRE(Page Request Enable) field of a scalable mode context
2025 * entry.
2026 */
2027static inline void context_set_sm_pre(struct context_entry *context)
2028{
2029 context->lo |= (1 << 4);
2030}
2031
2032/* Convert value to context PASID directory size field coding. */
2033#define context_pdts(pds) (((pds) & 0x7) << 9)
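/*
 * Worked example (illustrative): with PASID_PDE_SHIFT == 6, a table whose
 * max_pasid is 2^20 has max_pde = 2^14, so context_get_sm_pds() returns
 * 14 - 7 = 7 and context_pdts(7) encodes a PASID directory with
 * 2^(7 + 7) = 16384 entries, each directory entry covering 64 PASIDs.
 */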
2034
2035static int domain_context_mapping_one(struct dmar_domain *domain,
2036 struct intel_iommu *iommu,
2037 struct pasid_table *table,
2038 u8 bus, u8 devfn)
2039{
2040 u16 did = domain->iommu_did[iommu->seq_id];
2041 int translation = CONTEXT_TT_MULTI_LEVEL;
2042 struct device_domain_info *info = NULL;
2043 struct context_entry *context;
2044 unsigned long flags;
2045 int ret;
2046
2047 WARN_ON(did == 0);
2048
2049 if (hw_pass_through && domain_type_is_si(domain))
2050 translation = CONTEXT_TT_PASS_THROUGH;
2051
2052 pr_debug("Set context mapping for %02x:%02x.%d\n",
2053 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054
2055 BUG_ON(!domain->pgd);
2056
2057 spin_lock_irqsave(&device_domain_lock, flags);
2058 spin_lock(&iommu->lock);
2059
2060 ret = -ENOMEM;
2061 context = iommu_context_addr(iommu, bus, devfn, 1);
2062 if (!context)
2063 goto out_unlock;
2064
2065 ret = 0;
2066 if (context_present(context))
2067 goto out_unlock;
2068
2069 /*
2070 * For kdump cases, old valid entries may be cached due to the
2071 * in-flight DMA and copied pgtable, but there is no unmapping
2072 * behaviour for them, thus we need an explicit cache flush for
2073 * the newly-mapped device. For kdump, at this point, the device
2074 * is expected to have finished reset at its driver probe stage, so no
2075 * in-flight DMA will exist, and no further flushing is needed
2076 * afterwards.
2077 */
2078 if (context_copied(context)) {
2079 u16 did_old = context_domain_id(context);
2080
2081 if (did_old < cap_ndoms(iommu->cap)) {
2082 iommu->flush.flush_context(iommu, did_old,
2083 (((u16)bus) << 8) | devfn,
2084 DMA_CCMD_MASK_NOBIT,
2085 DMA_CCMD_DEVICE_INVL);
2086 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087 DMA_TLB_DSI_FLUSH);
2088 }
2089 }
2090
2091 context_clear_entry(context);
2092
2093 if (sm_supported(iommu)) {
2094 unsigned long pds;
2095
2096 WARN_ON(!table);
2097
2098 /* Setup the PASID DIR pointer: */
2099 pds = context_get_sm_pds(table);
2100 context->lo = (u64)virt_to_phys(table->table) |
2101 context_pdts(pds);
2102
2103 /* Setup the RID_PASID field: */
2104 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105
2106 /*
2107 * Setup the Device-TLB enable bit and Page request
2108 * Enable bit:
2109 */
2110 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111 if (info && info->ats_supported)
2112 context_set_sm_dte(context);
2113 if (info && info->pri_supported)
2114 context_set_sm_pre(context);
2115 } else {
2116 struct dma_pte *pgd = domain->pgd;
2117 int agaw;
2118
2119 context_set_domain_id(context, did);
2120
2121 if (translation != CONTEXT_TT_PASS_THROUGH) {
2122 /*
2123 * Skip top levels of page tables for an iommu whose
2124 * agaw is smaller than the default. Unnecessary for PT mode.
2125 */
2126 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127 ret = -ENOMEM;
2128 pgd = phys_to_virt(dma_pte_addr(pgd));
2129 if (!dma_pte_present(pgd))
2130 goto out_unlock;
2131 }
2132
2133 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134 if (info && info->ats_supported)
2135 translation = CONTEXT_TT_DEV_IOTLB;
2136 else
2137 translation = CONTEXT_TT_MULTI_LEVEL;
2138
2139 context_set_address_root(context, virt_to_phys(pgd));
2140 context_set_address_width(context, agaw);
2141 } else {
2142 /*
2143 * In pass through mode, AW must be programmed to
2144 * indicate the largest AGAW value supported by
2145 * hardware, and ASR is ignored by hardware.
2146 */
2147 context_set_address_width(context, iommu->msagaw);
2148 }
2149
2150 context_set_translation_type(context, translation);
2151 }
2152
2153 context_set_fault_enable(context);
2154 context_set_present(context);
2155 if (!ecap_coherent(iommu->ecap))
2156 clflush_cache_range(context, sizeof(*context));
2157
2158 /*
2159 * It's a non-present to present mapping. If the hardware doesn't cache
2160 * non-present entries, we only need to flush the write-buffer. If it
2161 * _does_ cache non-present entries, then it does so in the special
2162 * domain #0, which we have to flush:
2163 */
2164 if (cap_caching_mode(iommu->cap)) {
2165 iommu->flush.flush_context(iommu, 0,
2166 (((u16)bus) << 8) | devfn,
2167 DMA_CCMD_MASK_NOBIT,
2168 DMA_CCMD_DEVICE_INVL);
2169 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170 } else {
2171 iommu_flush_write_buffer(iommu);
2172 }
2173 iommu_enable_dev_iotlb(info);
2174
2175 ret = 0;
2176
2177out_unlock:
2178 spin_unlock(&iommu->lock);
2179 spin_unlock_irqrestore(&device_domain_lock, flags);
2180
2181 return ret;
2182}
2183
2184struct domain_context_mapping_data {
2185 struct dmar_domain *domain;
2186 struct intel_iommu *iommu;
2187 struct pasid_table *table;
2188};
2189
2190static int domain_context_mapping_cb(struct pci_dev *pdev,
2191 u16 alias, void *opaque)
2192{
2193 struct domain_context_mapping_data *data = opaque;
2194
2195 return domain_context_mapping_one(data->domain, data->iommu,
2196 data->table, PCI_BUS_NUM(alias),
2197 alias & 0xff);
2198}
2199
2200static int
2201domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202{
2203 struct domain_context_mapping_data data;
2204 struct pasid_table *table;
2205 struct intel_iommu *iommu;
2206 u8 bus, devfn;
2207
2208 iommu = device_to_iommu(dev, &bus, &devfn);
2209 if (!iommu)
2210 return -ENODEV;
2211
2212 table = intel_pasid_get_table(dev);
2213
2214 if (!dev_is_pci(dev))
2215 return domain_context_mapping_one(domain, iommu, table,
2216 bus, devfn);
2217
2218 data.domain = domain;
2219 data.iommu = iommu;
2220 data.table = table;
2221
2222 return pci_for_each_dma_alias(to_pci_dev(dev),
2223 &domain_context_mapping_cb, &data);
2224}
2225
2226static int domain_context_mapped_cb(struct pci_dev *pdev,
2227 u16 alias, void *opaque)
2228{
2229 struct intel_iommu *iommu = opaque;
2230
2231 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232}
2233
2234static int domain_context_mapped(struct device *dev)
2235{
2236 struct intel_iommu *iommu;
2237 u8 bus, devfn;
2238
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2240 if (!iommu)
2241 return -ENODEV;
2242
2243 if (!dev_is_pci(dev))
2244 return device_context_mapped(iommu, bus, devfn);
2245
2246 return !pci_for_each_dma_alias(to_pci_dev(dev),
2247 domain_context_mapped_cb, iommu);
2248}
2249
2250/* Returns a number of VTD pages, but aligned to MM page size */
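/*
 * Illustrative example, assuming 4KiB MM pages: a host_addr offset of 0x800
 * with size 0x1800 spans two MM pages, so PAGE_ALIGN(0x800 + 0x1800) shifted
 * by VTD_PAGE_SHIFT returns 2.
 */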
2251static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252 size_t size)
2253{
2254 host_addr &= ~PAGE_MASK;
2255 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256}
2257
2258/* Return largest possible superpage level for a given mapping */
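/*
 * Illustrative note: with 4KiB base pages, an IOVA and a physical range that
 * are both 2MiB aligned (pfns that are multiples of 512) and cover at least
 * 512 pages yield level 2, i.e. a 2MiB superpage, provided
 * domain->iommu_superpage is at least 1; otherwise level 1 (4KiB) is used.
 */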
2259static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260 unsigned long iov_pfn,
2261 unsigned long phy_pfn,
2262 unsigned long pages)
2263{
2264 int support, level = 1;
2265 unsigned long pfnmerge;
2266
2267 support = domain->iommu_superpage;
2268
2269 /* To use a large page, the virtual *and* physical addresses
2270 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271 of them will mean we have to use smaller pages. So just
2272 merge them and check both at once. */
2273 pfnmerge = iov_pfn | phy_pfn;
2274
2275 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276 pages >>= VTD_STRIDE_SHIFT;
2277 if (!pages)
2278 break;
2279 pfnmerge >>= VTD_STRIDE_SHIFT;
2280 level++;
2281 support--;
2282 }
2283 return level;
2284}
2285
2286static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287 struct scatterlist *sg, unsigned long phys_pfn,
2288 unsigned long nr_pages, int prot)
2289{
2290 struct dma_pte *first_pte = NULL, *pte = NULL;
2291 phys_addr_t pteval;
2292 unsigned long sg_res = 0;
2293 unsigned int largepage_lvl = 0;
2294 unsigned long lvl_pages = 0;
2295 u64 attr;
2296
2297 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298
2299 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300 return -EINVAL;
2301
2302 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303 if (domain_use_first_level(domain))
2304 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305
2306 if (!sg) {
2307 sg_res = nr_pages;
2308 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309 }
2310
2311 while (nr_pages > 0) {
2312 uint64_t tmp;
2313
2314 if (!sg_res) {
2315 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316
2317 sg_res = aligned_nrpages(sg->offset, sg->length);
2318 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319 sg->dma_length = sg->length;
2320 pteval = (sg_phys(sg) - pgoff) | attr;
2321 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322 }
2323
2324 if (!pte) {
2325 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326
2327 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328 if (!pte)
2329 return -ENOMEM;
2330 /* It is a large page */
2331 if (largepage_lvl > 1) {
2332 unsigned long nr_superpages, end_pfn;
2333
2334 pteval |= DMA_PTE_LARGE_PAGE;
2335 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336
2337 nr_superpages = sg_res / lvl_pages;
2338 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339
2340 /*
2341 * Ensure that old small page tables are
2342 * removed to make room for superpage(s).
2343 * We're adding new large pages, so make sure
2344 * we don't remove their parent tables.
2345 */
2346 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347 largepage_lvl + 1);
2348 } else {
2349 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350 }
2351
2352 }
2353 /* We don't need a lock here; nobody else
2354 * touches this iova range.
2355 */
2356 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357 if (tmp) {
2358 static int dumps = 5;
2359 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360 iov_pfn, tmp, (unsigned long long)pteval);
2361 if (dumps) {
2362 dumps--;
2363 debug_dma_dump_mappings(NULL);
2364 }
2365 WARN_ON(1);
2366 }
2367
2368 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369
2370 BUG_ON(nr_pages < lvl_pages);
2371 BUG_ON(sg_res < lvl_pages);
2372
2373 nr_pages -= lvl_pages;
2374 iov_pfn += lvl_pages;
2375 phys_pfn += lvl_pages;
2376 pteval += lvl_pages * VTD_PAGE_SIZE;
2377 sg_res -= lvl_pages;
2378
2379 /* If the next PTE would be the first in a new page, then we
2380 need to flush the cache on the entries we've just written.
2381 And then we'll need to recalculate 'pte', so clear it and
2382 let it get set again in the if (!pte) block above.
2383
2384 If we're done (!nr_pages) we need to flush the cache too.
2385
2386 Also if we've been setting superpages, we may need to
2387 recalculate 'pte' and switch back to smaller pages for the
2388 end of the mapping, if the trailing size is not enough to
2389 use another superpage (i.e. sg_res < lvl_pages). */
2390 pte++;
2391 if (!nr_pages || first_pte_in_page(pte) ||
2392 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393 domain_flush_cache(domain, first_pte,
2394 (void *)pte - (void *)first_pte);
2395 pte = NULL;
2396 }
2397
2398 if (!sg_res && nr_pages)
2399 sg = sg_next(sg);
2400 }
2401 return 0;
2402}
2403
2404static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405 struct scatterlist *sg, unsigned long phys_pfn,
2406 unsigned long nr_pages, int prot)
2407{
2408 int iommu_id, ret;
2409 struct intel_iommu *iommu;
2410
2411 /* Do the real mapping first */
2412 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413 if (ret)
2414 return ret;
2415
2416 for_each_domain_iommu(iommu_id, domain) {
2417 iommu = g_iommus[iommu_id];
2418 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419 }
2420
2421 return 0;
2422}
2423
2424static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425 struct scatterlist *sg, unsigned long nr_pages,
2426 int prot)
2427{
2428 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429}
2430
2431static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432 unsigned long phys_pfn, unsigned long nr_pages,
2433 int prot)
2434{
2435 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436}
2437
2438static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439{
2440 unsigned long flags;
2441 struct context_entry *context;
2442 u16 did_old;
2443
2444 if (!iommu)
2445 return;
2446
2447 spin_lock_irqsave(&iommu->lock, flags);
2448 context = iommu_context_addr(iommu, bus, devfn, 0);
2449 if (!context) {
2450 spin_unlock_irqrestore(&iommu->lock, flags);
2451 return;
2452 }
2453 did_old = context_domain_id(context);
2454 context_clear_entry(context);
2455 __iommu_flush_cache(iommu, context, sizeof(*context));
2456 spin_unlock_irqrestore(&iommu->lock, flags);
2457 iommu->flush.flush_context(iommu,
2458 did_old,
2459 (((u16)bus) << 8) | devfn,
2460 DMA_CCMD_MASK_NOBIT,
2461 DMA_CCMD_DEVICE_INVL);
2462 iommu->flush.flush_iotlb(iommu,
2463 did_old,
2464 0,
2465 0,
2466 DMA_TLB_DSI_FLUSH);
2467}
2468
2469static inline void unlink_domain_info(struct device_domain_info *info)
2470{
2471 assert_spin_locked(&device_domain_lock);
2472 list_del(&info->link);
2473 list_del(&info->global);
2474 if (info->dev)
2475 dev_iommu_priv_set(info->dev, NULL);
2476}
2477
2478static void domain_remove_dev_info(struct dmar_domain *domain)
2479{
2480 struct device_domain_info *info, *tmp;
2481 unsigned long flags;
2482
2483 spin_lock_irqsave(&device_domain_lock, flags);
2484 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485 __dmar_remove_one_dev_info(info);
2486 spin_unlock_irqrestore(&device_domain_lock, flags);
2487}
2488
2489struct dmar_domain *find_domain(struct device *dev)
2490{
2491 struct device_domain_info *info;
2492
2493 if (unlikely(attach_deferred(dev)))
2494 return NULL;
2495
2496 /* No lock here, assumes no domain exit in normal case */
2497 info = get_domain_info(dev);
2498 if (likely(info))
2499 return info->domain;
2500
2501 return NULL;
2502}
2503
2504static void do_deferred_attach(struct device *dev)
2505{
2506 struct iommu_domain *domain;
2507
2508 dev_iommu_priv_set(dev, NULL);
2509 domain = iommu_get_domain_for_dev(dev);
2510 if (domain)
2511 intel_iommu_attach_device(domain, dev);
2512}
2513
2514static inline struct device_domain_info *
2515dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516{
2517 struct device_domain_info *info;
2518
2519 list_for_each_entry(info, &device_domain_list, global)
2520 if (info->segment == segment && info->bus == bus &&
2521 info->devfn == devfn)
2522 return info;
2523
2524 return NULL;
2525}
2526
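/*
 * Illustrative note: first-level (scalable-mode) translation only walks
 * 4-level or 5-level page tables, i.e. 48-bit or 57-bit input addresses;
 * PASID_FLAG_FL5LP tells the PASID entry setup to use the 5-level format.
 */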
2527static int domain_setup_first_level(struct intel_iommu *iommu,
2528 struct dmar_domain *domain,
2529 struct device *dev,
2530 int pasid)
2531{
2532 int flags = PASID_FLAG_SUPERVISOR_MODE;
2533 struct dma_pte *pgd = domain->pgd;
2534 int agaw, level;
2535
2536 /*
2537 * Skip top levels of page tables for an iommu whose
2538 * agaw is smaller than the default. Unnecessary for PT mode.
2539 */
2540 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541 pgd = phys_to_virt(dma_pte_addr(pgd));
2542 if (!dma_pte_present(pgd))
2543 return -ENOMEM;
2544 }
2545
2546 level = agaw_to_level(agaw);
2547 if (level != 4 && level != 5)
2548 return -EINVAL;
2549
2550 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551
2552 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553 domain->iommu_did[iommu->seq_id],
2554 flags);
2555}
2556
2557static bool dev_is_real_dma_subdevice(struct device *dev)
2558{
2559 return dev && dev_is_pci(dev) &&
2560 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561}
2562
2563static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564 int bus, int devfn,
2565 struct device *dev,
2566 struct dmar_domain *domain)
2567{
2568 struct dmar_domain *found = NULL;
2569 struct device_domain_info *info;
2570 unsigned long flags;
2571 int ret;
2572
2573 info = alloc_devinfo_mem();
2574 if (!info)
2575 return NULL;
2576
2577 if (!dev_is_real_dma_subdevice(dev)) {
2578 info->bus = bus;
2579 info->devfn = devfn;
2580 info->segment = iommu->segment;
2581 } else {
2582 struct pci_dev *pdev = to_pci_dev(dev);
2583
2584 info->bus = pdev->bus->number;
2585 info->devfn = pdev->devfn;
2586 info->segment = pci_domain_nr(pdev->bus);
2587 }
2588
2589 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591 info->ats_qdep = 0;
2592 info->dev = dev;
2593 info->domain = domain;
2594 info->iommu = iommu;
2595 info->pasid_table = NULL;
2596 info->auxd_enabled = 0;
2597 INIT_LIST_HEAD(&info->auxiliary_domains);
2598
2599 if (dev && dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(info->dev);
2601
2602 if (ecap_dev_iotlb_support(iommu->ecap) &&
2603 pci_ats_supported(pdev) &&
2604 dmar_find_matched_atsr_unit(pdev))
2605 info->ats_supported = 1;
2606
2607 if (sm_supported(iommu)) {
2608 if (pasid_supported(iommu)) {
2609 int features = pci_pasid_features(pdev);
2610 if (features >= 0)
2611 info->pasid_supported = features | 1;
2612 }
2613
2614 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615 pci_pri_supported(pdev))
2616 info->pri_supported = 1;
2617 }
2618 }
2619
2620 spin_lock_irqsave(&device_domain_lock, flags);
2621 if (dev)
2622 found = find_domain(dev);
2623
2624 if (!found) {
2625 struct device_domain_info *info2;
2626 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627 info->devfn);
2628 if (info2) {
2629 found = info2->domain;
2630 info2->dev = dev;
2631 }
2632 }
2633
2634 if (found) {
2635 spin_unlock_irqrestore(&device_domain_lock, flags);
2636 free_devinfo_mem(info);
2637 /* Caller must free the original domain */
2638 return found;
2639 }
2640
2641 spin_lock(&iommu->lock);
2642 ret = domain_attach_iommu(domain, iommu);
2643 spin_unlock(&iommu->lock);
2644
2645 if (ret) {
2646 spin_unlock_irqrestore(&device_domain_lock, flags);
2647 free_devinfo_mem(info);
2648 return NULL;
2649 }
2650
2651 list_add(&info->link, &domain->devices);
2652 list_add(&info->global, &device_domain_list);
2653 if (dev)
2654 dev_iommu_priv_set(dev, info);
2655 spin_unlock_irqrestore(&device_domain_lock, flags);
2656
2657 /* PASID table is mandatory for a PCI device in scalable mode. */
2658 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659 ret = intel_pasid_alloc_table(dev);
2660 if (ret) {
2661 dev_err(dev, "PASID table allocation failed\n");
2662 dmar_remove_one_dev_info(dev);
2663 return NULL;
2664 }
2665
2666 /* Setup the PASID entry for requests without PASID: */
2667 spin_lock_irqsave(&iommu->lock, flags);
2668 if (hw_pass_through && domain_type_is_si(domain))
2669 ret = intel_pasid_setup_pass_through(iommu, domain,
2670 dev, PASID_RID2PASID);
2671 else if (domain_use_first_level(domain))
2672 ret = domain_setup_first_level(iommu, domain, dev,
2673 PASID_RID2PASID);
2674 else
2675 ret = intel_pasid_setup_second_level(iommu, domain,
2676 dev, PASID_RID2PASID);
2677 spin_unlock_irqrestore(&iommu->lock, flags);
2678 if (ret) {
2679 dev_err(dev, "Setup RID2PASID failed\n");
2680 dmar_remove_one_dev_info(dev);
2681 return NULL;
2682 }
2683 }
2684
2685 if (dev && domain_context_mapping(domain, dev)) {
2686 dev_err(dev, "Domain context map failed\n");
2687 dmar_remove_one_dev_info(dev);
2688 return NULL;
2689 }
2690
2691 return domain;
2692}
2693
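/*
 * Illustrative note: an identity map simply makes IOVA == physical address
 * for the given pfn range, e.g. an RMRR at (hypothetical) physical address
 * 0xdf000000 stays reachable at IOVA 0xdf000000 once the range is mapped 1:1
 * below.
 */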
2694static int iommu_domain_identity_map(struct dmar_domain *domain,
2695 unsigned long first_vpfn,
2696 unsigned long last_vpfn)
2697{
2698 /*
2699 * The RMRR range might overlap with the physical memory range,
2700 * so clear it first.
2701 */
2702 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703
2704 return __domain_mapping(domain, first_vpfn, NULL,
2705 first_vpfn, last_vpfn - first_vpfn + 1,
2706 DMA_PTE_READ|DMA_PTE_WRITE);
2707}
2708
2709static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710
2711static int __init si_domain_init(int hw)
2712{
2713 struct dmar_rmrr_unit *rmrr;
2714 struct device *dev;
2715 int i, nid, ret;
2716
2717 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718 if (!si_domain)
2719 return -EFAULT;
2720
2721 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722 domain_exit(si_domain);
2723 return -EFAULT;
2724 }
2725
2726 if (hw)
2727 return 0;
2728
2729 for_each_online_node(nid) {
2730 unsigned long start_pfn, end_pfn;
2731 int i;
2732
2733 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734 ret = iommu_domain_identity_map(si_domain,
2735 mm_to_dma_pfn(start_pfn),
2736 mm_to_dma_pfn(end_pfn));
2737 if (ret)
2738 return ret;
2739 }
2740 }
2741
2742 /*
2743 * Identity map the RMRRs so that devices with RMRRs can also use
2744 * the si_domain.
2745 */
2746 for_each_rmrr_units(rmrr) {
2747 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748 i, dev) {
2749 unsigned long long start = rmrr->base_address;
2750 unsigned long long end = rmrr->end_address;
2751
2752 if (WARN_ON(end < start ||
2753 end >> agaw_to_width(si_domain->agaw)))
2754 continue;
2755
2756 ret = iommu_domain_identity_map(si_domain,
2757 mm_to_dma_pfn(start >> PAGE_SHIFT),
2758 mm_to_dma_pfn(end >> PAGE_SHIFT));
2759 if (ret)
2760 return ret;
2761 }
2762 }
2763
2764 return 0;
2765}
2766
2767static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768{
2769 struct dmar_domain *ndomain;
2770 struct intel_iommu *iommu;
2771 u8 bus, devfn;
2772
2773 iommu = device_to_iommu(dev, &bus, &devfn);
2774 if (!iommu)
2775 return -ENODEV;
2776
2777 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778 if (ndomain != domain)
2779 return -EBUSY;
2780
2781 return 0;
2782}
2783
2784static bool device_has_rmrr(struct device *dev)
2785{
2786 struct dmar_rmrr_unit *rmrr;
2787 struct device *tmp;
2788 int i;
2789
2790 rcu_read_lock();
2791 for_each_rmrr_units(rmrr) {
2792 /*
2793 * Return TRUE if this RMRR contains the device that
2794 * is passed in.
2795 */
2796 for_each_active_dev_scope(rmrr->devices,
2797 rmrr->devices_cnt, i, tmp)
2798 if (tmp == dev ||
2799 is_downstream_to_pci_bridge(dev, tmp)) {
2800 rcu_read_unlock();
2801 return true;
2802 }
2803 }
2804 rcu_read_unlock();
2805 return false;
2806}
2807
2808/**
2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2811 * @dev: device handle
2812 *
2813 * We assume that PCI USB devices with RMRRs have them largely
2814 * for historical reasons and that the RMRR space is not actively used post
2815 * boot. This exclusion may change if vendors begin to abuse it.
2816 *
2817 * The same exception is made for graphics devices, with the requirement that
2818 * any use of the RMRR regions will be torn down before assigning the device
2819 * to a guest.
2820 *
2821 * Return: true if the RMRR is relaxable, false otherwise
2822 */
2823static bool device_rmrr_is_relaxable(struct device *dev)
2824{
2825 struct pci_dev *pdev;
2826
2827 if (!dev_is_pci(dev))
2828 return false;
2829
2830 pdev = to_pci_dev(dev);
2831 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832 return true;
2833 else
2834 return false;
2835}
2836
2837/*
2838 * There are a couple of cases where we need to restrict the functionality of
2839 * devices associated with RMRRs. The first is when evaluating a device for
2840 * identity mapping because problems exist when devices are moved in and out
2841 * of domains and their respective RMRR information is lost. This means that
2842 * a device with associated RMRRs will never be in a "passthrough" domain.
2843 * The second is use of the device through the IOMMU API. This interface
2844 * expects to have full control of the IOVA space for the device. We cannot
2845 * satisfy both the requirement that RMRR access is maintained and have an
2846 * unencumbered IOVA space. We also have no ability to quiesce the device's
2847 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848 * We therefore prevent devices associated with an RMRR from participating in
2849 * the IOMMU API, which eliminates them from device assignment.
2850 *
2851 * In both cases, devices which have relaxable RMRRs are not concerned by this
2852 * restriction. See device_rmrr_is_relaxable comment.
2853 */
2854static bool device_is_rmrr_locked(struct device *dev)
2855{
2856 if (!device_has_rmrr(dev))
2857 return false;
2858
2859 if (device_rmrr_is_relaxable(dev))
2860 return false;
2861
2862 return true;
2863}
2864
2865/*
2866 * Return the required default domain type for a specific device.
2867 *
2868 * @dev: the device in question
2870 *
2871 * Returns:
2872 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2874 * - 0: both identity and dynamic domains work for this device
2875 */
2876static int device_def_domain_type(struct device *dev)
2877{
2878 if (dev_is_pci(dev)) {
2879 struct pci_dev *pdev = to_pci_dev(dev);
2880
2881 /*
2882 * Prevent any device marked as untrusted from getting
2883 * placed into the static identity mapping domain.
2884 */
2885 if (pdev->untrusted)
2886 return IOMMU_DOMAIN_DMA;
2887
2888 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889 return IOMMU_DOMAIN_IDENTITY;
2890
2891 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892 return IOMMU_DOMAIN_IDENTITY;
2893 }
2894
2895 return 0;
2896}
2897
2898static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899{
2900 /*
2901 * Start from a sane iommu hardware state.
2902 * If queued invalidation was already initialized by us
2903 * (for example, while enabling interrupt-remapping), then
2904 * things are already rolling from a sane state.
2905 */
2906 if (!iommu->qi) {
2907 /*
2908 * Clear any previous faults.
2909 */
2910 dmar_fault(-1, iommu);
2911 /*
2912 * Disable queued invalidation if supported and already enabled
2913 * before OS handover.
2914 */
2915 dmar_disable_qi(iommu);
2916 }
2917
2918 if (dmar_enable_qi(iommu)) {
2919 /*
2920 * Queued Invalidate not enabled, use Register Based Invalidate
2921 */
2922 iommu->flush.flush_context = __iommu_flush_context;
2923 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924 pr_info("%s: Using Register based invalidation\n",
2925 iommu->name);
2926 } else {
2927 iommu->flush.flush_context = qi_flush_context;
2928 iommu->flush.flush_iotlb = qi_flush_iotlb;
2929 pr_info("%s: Using Queued invalidation\n", iommu->name);
2930 }
2931}
2932
2933static int copy_context_table(struct intel_iommu *iommu,
2934 struct root_entry *old_re,
2935 struct context_entry **tbl,
2936 int bus, bool ext)
2937{
2938 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939 struct context_entry *new_ce = NULL, ce;
2940 struct context_entry *old_ce = NULL;
2941 struct root_entry re;
2942 phys_addr_t old_ce_phys;
2943
2944 tbl_idx = ext ? bus * 2 : bus;
2945 memcpy(&re, old_re, sizeof(re));
2946
2947 for (devfn = 0; devfn < 256; devfn++) {
2948 /* First calculate the correct index */
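		/*
		 * Illustrative note: extended context entries are 256 bits,
		 * twice the size of struct context_entry, so devfn is
		 * multiplied by two and the lower/upper context tables each
		 * cover 128 device functions (devfn 0-127 via LCTP,
		 * devfn 128-255 via UCTP), matching the split below.
		 */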
2949 idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951 if (idx == 0) {
2952 /* First save what we may have and clean up */
2953 if (new_ce) {
2954 tbl[tbl_idx] = new_ce;
2955 __iommu_flush_cache(iommu, new_ce,
2956 VTD_PAGE_SIZE);
2957 pos = 1;
2958 }
2959
2960 if (old_ce)
2961 memunmap(old_ce);
2962
2963 ret = 0;
2964 if (devfn < 0x80)
2965 old_ce_phys = root_entry_lctp(&re);
2966 else
2967 old_ce_phys = root_entry_uctp(&re);
2968
2969 if (!old_ce_phys) {
2970 if (ext && devfn == 0) {
2971 /* No LCTP, try UCTP */
2972 devfn = 0x7f;
2973 continue;
2974 } else {
2975 goto out;
2976 }
2977 }
2978
2979 ret = -ENOMEM;
2980 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981 MEMREMAP_WB);
2982 if (!old_ce)
2983 goto out;
2984
2985 new_ce = alloc_pgtable_page(iommu->node);
2986 if (!new_ce)
2987 goto out_unmap;
2988
2989 ret = 0;
2990 }
2991
2992 /* Now copy the context entry */
2993 memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995 if (!__context_present(&ce))
2996 continue;
2997
2998 did = context_domain_id(&ce);
2999 if (did >= 0 && did < cap_ndoms(iommu->cap))
3000 set_bit(did, iommu->domain_ids);
3001
3002 /*
3003 * We need a marker for copied context entries. This
3004 * marker needs to work for the old format as well as
3005 * for extended context entries.
3006 *
3007 * Bit 67 of the context entry is used. In the old
3008 * format this bit is available to software, in the
3009 * extended format it is the PGE bit, but PGE is ignored
3010 * by HW if PASIDs are disabled (and thus still
3011 * available).
3012 *
3013 * So disable PASIDs first and then mark the entry
3014 * copied. This means that we don't copy PASID
3015 * translations from the old kernel, but this is fine as
3016 * faults there are not fatal.
3017 */
3018 context_clear_pasid_enable(&ce);
3019 context_set_copied(&ce);
3020
3021 new_ce[idx] = ce;
3022 }
3023
3024 tbl[tbl_idx + pos] = new_ce;
3025
3026 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028out_unmap:
3029 memunmap(old_ce);
3030
3031out:
3032 return ret;
3033}
3034
3035static int copy_translation_tables(struct intel_iommu *iommu)
3036{
3037 struct context_entry **ctxt_tbls;
3038 struct root_entry *old_rt;
3039 phys_addr_t old_rt_phys;
3040 int ctxt_table_entries;
3041 unsigned long flags;
3042 u64 rtaddr_reg;
3043 int bus, ret;
3044 bool new_ext, ext;
3045
3046 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048 new_ext = !!ecap_ecs(iommu->ecap);
3049
3050 /*
3051 * The RTT bit can only be changed when translation is disabled,
3052 * but disabling translation would open a window for data
3053 * corruption. So bail out and don't copy anything if we would
3054 * have to change the bit.
3055 */
3056 if (new_ext != ext)
3057 return -EINVAL;
3058
3059 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060 if (!old_rt_phys)
3061 return -EINVAL;
3062
3063 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064 if (!old_rt)
3065 return -ENOMEM;
3066
3067 /* This is too big for the stack - allocate it from slab */
3068 ctxt_table_entries = ext ? 512 : 256;
3069 ret = -ENOMEM;
3070 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071 if (!ctxt_tbls)
3072 goto out_unmap;
3073
3074 for (bus = 0; bus < 256; bus++) {
3075 ret = copy_context_table(iommu, &old_rt[bus],
3076 ctxt_tbls, bus, ext);
3077 if (ret) {
3078 pr_err("%s: Failed to copy context table for bus %d\n",
3079 iommu->name, bus);
3080 continue;
3081 }
3082 }
3083
3084 spin_lock_irqsave(&iommu->lock, flags);
3085
3086 /* Context tables are copied, now write them to the root_entry table */
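	/*
	 * Illustrative note: bit 0 of each root entry field is the present
	 * bit, which is why the context-table address is OR'ed with 1 below;
	 * in extended mode the hi word carries the upper context table for
	 * devfn 128-255.
	 */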
3087 for (bus = 0; bus < 256; bus++) {
3088 int idx = ext ? bus * 2 : bus;
3089 u64 val;
3090
3091 if (ctxt_tbls[idx]) {
3092 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093 iommu->root_entry[bus].lo = val;
3094 }
3095
3096 if (!ext || !ctxt_tbls[idx + 1])
3097 continue;
3098
3099 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100 iommu->root_entry[bus].hi = val;
3101 }
3102
3103 spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105 kfree(ctxt_tbls);
3106
3107 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109 ret = 0;
3110
3111out_unmap:
3112 memunmap(old_rt);
3113
3114 return ret;
3115}
3116
3117#ifdef CONFIG_INTEL_IOMMU_SVM
3118static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119{
3120 struct intel_iommu *iommu = data;
3121 ioasid_t ioasid;
3122
3123 if (!iommu)
3124 return INVALID_IOASID;
3125 /*
3126 * The VT-d virtual command interface always uses the full 20-bit
3127 * PASID range. The host can partition the guest PASID range based
3128 * on policies, but this is out of the guest's control.
3129 */
3130 if (min < PASID_MIN || max > intel_pasid_max_id)
3131 return INVALID_IOASID;
3132
3133 if (vcmd_alloc_pasid(iommu, &ioasid))
3134 return INVALID_IOASID;
3135
3136 return ioasid;
3137}
3138
3139static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140{
3141 struct intel_iommu *iommu = data;
3142
3143 if (!iommu)
3144 return;
3145 /*
3146 * The sanity check of the ioasid owner is done at the upper layer,
3147 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
3148 */
3149 if (ioasid_find(NULL, ioasid, NULL)) {
3150 pr_alert("Cannot free active IOASID %d\n", ioasid);
3151 return;
3152 }
3153 vcmd_free_pasid(iommu, ioasid);
3154}
3155
3156static void register_pasid_allocator(struct intel_iommu *iommu)
3157{
3158 /*
3159 * If we are running in the host, there is no need for a custom
3160 * allocator, since PASIDs are allocated system-wide by the host.
3161 */
3162 if (!cap_caching_mode(iommu->cap))
3163 return;
3164
3165 if (!sm_supported(iommu)) {
3166 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167 return;
3168 }
3169
3170 /*
3171 * Register a custom PASID allocator if we are running in a guest,
3172 * where guest PASIDs must be obtained via the virtual command interface.
3173 * There can be multiple vIOMMUs in each guest, but only one allocator
3174 * is active. All vIOMMU allocators will eventually call the same
3175 * host allocator.
3176 */
3177 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178 return;
3179
3180 pr_info("Register custom PASID allocator\n");
3181 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183 iommu->pasid_allocator.pdata = (void *)iommu;
3184 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186 /*
3187 * Disable scalable mode on this IOMMU if there
3188 * is no custom allocator. Mixing SM-capable vIOMMUs
3189 * and non-SM vIOMMUs is not supported.
3190 */
3191 intel_iommu_sm = 0;
3192 }
3193}
3194#endif
3195
3196static int __init init_dmars(void)
3197{
3198 struct dmar_drhd_unit *drhd;
3199 struct intel_iommu *iommu;
3200 int ret;
3201
3202 /*
3203 * for each drhd
3204 * allocate root
3205 * initialize and program root entry to not present
3206 * endfor
3207 */
3208 for_each_drhd_unit(drhd) {
3209 /*
3210 * No lock needed, as this is only incremented in the single-
3211 * threaded kernel __init code path; all other accesses are
3212 * read-only.
3213 */
3214 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215 g_num_of_iommus++;
3216 continue;
3217 }
3218 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219 }
3220
3221 /* Preallocate enough resources for IOMMU hot-addition */
3222 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224
3225 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226 GFP_KERNEL);
3227 if (!g_iommus) {
3228 pr_err("Allocating global iommu array failed\n");
3229 ret = -ENOMEM;
3230 goto error;
3231 }
3232
3233 for_each_iommu(iommu, drhd) {
3234 if (drhd->ignored) {
3235 iommu_disable_translation(iommu);
3236 continue;
3237 }
3238
3239 /*
3240 * Find the max PASID size of all IOMMUs in the system.
3241 * We need to ensure the system PASID table is no bigger
3242 * than the smallest supported size.
3243 */
3244 if (pasid_supported(iommu)) {
3245 u32 temp = 2 << ecap_pss(iommu->ecap);
3246
3247 intel_pasid_max_id = min_t(u32, temp,
3248 intel_pasid_max_id);
3249 }
3250
3251 g_iommus[iommu->seq_id] = iommu;
3252
3253 intel_iommu_init_qi(iommu);
3254
3255 ret = iommu_init_domains(iommu);
3256 if (ret)
3257 goto free_iommu;
3258
3259 init_translation_status(iommu);
3260
3261 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262 iommu_disable_translation(iommu);
3263 clear_translation_pre_enabled(iommu);
3264 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265 iommu->name);
3266 }
3267
3268 /*
3269 * TBD:
3270 * we could share the same root & context tables
3271 * among all IOMMUs. Need to split this later.
3272 */
3273 ret = iommu_alloc_root_entry(iommu);
3274 if (ret)
3275 goto free_iommu;
3276
3277 if (translation_pre_enabled(iommu)) {
3278 pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280 ret = copy_translation_tables(iommu);
3281 if (ret) {
3282 /*
3283 * We found the IOMMU with translation
3284 * enabled - but failed to copy over the
3285 * old root-entry table. Try to proceed
3286 * by disabling translation now and
3287 * allocating a clean root-entry table.
3288 * This might cause DMAR faults, but
3289 * probably the dump will still succeed.
3290 */
3291 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292 iommu->name);
3293 iommu_disable_translation(iommu);
3294 clear_translation_pre_enabled(iommu);
3295 } else {
3296 pr_info("Copied translation tables from previous kernel for %s\n",
3297 iommu->name);
3298 }
3299 }
3300
3301 if (!ecap_pass_through(iommu->ecap))
3302 hw_pass_through = 0;
3303 intel_svm_check(iommu);
3304 }
3305
3306 /*
3307 * Now that qi is enabled on all iommus, set the root entry and flush
3308 * caches. This is required on some Intel X58 chipsets; otherwise the
3309 * flush_context function will loop forever and the boot hangs.
3310 */
3311 for_each_active_iommu(iommu, drhd) {
3312 iommu_flush_write_buffer(iommu);
3313#ifdef CONFIG_INTEL_IOMMU_SVM
3314 register_pasid_allocator(iommu);
3315#endif
3316 iommu_set_root_entry(iommu);
3317 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319 }
3320
3321#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322 dmar_map_gfx = 0;
3323#endif
3324
3325 if (!dmar_map_gfx)
3326 iommu_identity_mapping |= IDENTMAP_GFX;
3327
3328 check_tylersburg_isoch();
3329
3330 ret = si_domain_init(hw_pass_through);
3331 if (ret)
3332 goto free_iommu;
3333
3334 /*
3335 * for each drhd
3336 * enable fault log
3337 * global invalidate context cache
3338 * global invalidate iotlb
3339 * enable translation
3340 */
3341 for_each_iommu(iommu, drhd) {
3342 if (drhd->ignored) {
3343 /*
3344 * we always have to disable PMRs or DMA may fail on
3345 * this device
3346 */
3347 if (force_on)
3348 iommu_disable_protect_mem_regions(iommu);
3349 continue;
3350 }
3351
3352 iommu_flush_write_buffer(iommu);
3353
3354#ifdef CONFIG_INTEL_IOMMU_SVM
3355 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356 /*
3357 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3358 * could cause a lock race condition.
3359 */
3360 up_write(&dmar_global_lock);
3361 ret = intel_svm_enable_prq(iommu);
3362 down_write(&dmar_global_lock);
3363 if (ret)
3364 goto free_iommu;
3365 }
3366#endif
3367 ret = dmar_set_interrupt(iommu);
3368 if (ret)
3369 goto free_iommu;
3370 }
3371
3372 return 0;
3373
3374free_iommu:
3375 for_each_active_iommu(iommu, drhd) {
3376 disable_dmar_iommu(iommu);
3377 free_dmar_iommu(iommu);
3378 }
3379
3380 kfree(g_iommus);
3381
3382error:
3383 return ret;
3384}
3385
3386/* This takes a number of _MM_ pages, not VTD pages */
3387static unsigned long intel_alloc_iova(struct device *dev,
3388 struct dmar_domain *domain,
3389 unsigned long nrpages, uint64_t dma_mask)
3390{
3391 unsigned long iova_pfn;
3392
3393 /*
3394 * Restrict dma_mask to the width that the iommu can handle.
3395 * First-level translation restricts the input-address to a
3396 * canonical address (i.e., address bits 63:N have the same
3397 * value as address bit [N-1], where N is 48-bits with 4-level
3398 * paging and 57-bits with 5-level paging). Hence, skip bit
3399 * [N-1].
3400 */
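	/*
	 * Illustrative example: with 4-level first-level paging and
	 * gaw == 48, DOMAIN_MAX_ADDR(47) caps the mask below 2^47, so bit 47
	 * can never be set and every IOVA stays canonical.
	 */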
3401 if (domain_use_first_level(domain))
3402 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403 dma_mask);
3404 else
3405 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406 dma_mask);
3407
3408 /* Ensure we reserve the whole size-aligned region */
3409 nrpages = __roundup_pow_of_two(nrpages);
3410
3411 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412 /*
3413 * First try to allocate an IO virtual address in
3414 * DMA_BIT_MASK(32); if that fails, then try allocating
3415 * from the higher range.
3416 */
3417 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418 IOVA_PFN(DMA_BIT_MASK(32)), false);
3419 if (iova_pfn)
3420 return iova_pfn;
3421 }
3422 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423 IOVA_PFN(dma_mask), true);
3424 if (unlikely(!iova_pfn)) {
3425 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426 nrpages);
3427 return 0;
3428 }
3429
3430 return iova_pfn;
3431}
3432
3433static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434 size_t size, int dir, u64 dma_mask)
3435{
3436 struct dmar_domain *domain;
3437 phys_addr_t start_paddr;
3438 unsigned long iova_pfn;
3439 int prot = 0;
3440 int ret;
3441 struct intel_iommu *iommu;
3442 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443
3444 BUG_ON(dir == DMA_NONE);
3445
3446 if (unlikely(attach_deferred(dev)))
3447 do_deferred_attach(dev);
3448
3449 domain = find_domain(dev);
3450 if (!domain)
3451 return DMA_MAPPING_ERROR;
3452
3453 iommu = domain_get_iommu(domain);
3454 size = aligned_nrpages(paddr, size);
3455
3456 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457 if (!iova_pfn)
3458 goto error;
3459
3460 /*
3461 * Check if DMAR supports zero-length reads on write-only
3462 * mappings.
3463 */
3464 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3465 !cap_zlr(iommu->cap))
3466 prot |= DMA_PTE_READ;
3467 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468 prot |= DMA_PTE_WRITE;
3469 /*
3470 * paddr to (paddr + size) might be a partial page, so we should map the
3471 * whole page. Note: if two parts of one page are mapped separately, we
3472 * might have two guest addresses mapping to the same host paddr, but this
3473 * is not a big problem.
3474 */
3475 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476 mm_to_dma_pfn(paddr_pfn), size, prot);
3477 if (ret)
3478 goto error;
3479
3480 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481 start_paddr += paddr & ~PAGE_MASK;
3482
3483 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484
3485 return start_paddr;
3486
3487error:
3488 if (iova_pfn)
3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491 size, (unsigned long long)paddr, dir);
3492 return DMA_MAPPING_ERROR;
3493}
3494
3495static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496 unsigned long offset, size_t size,
3497 enum dma_data_direction dir,
3498 unsigned long attrs)
3499{
3500 return __intel_map_single(dev, page_to_phys(page) + offset,
3501 size, dir, *dev->dma_mask);
3502}
3503
3504static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505 size_t size, enum dma_data_direction dir,
3506 unsigned long attrs)
3507{
3508 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509}
3510
3511static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512{
3513 struct dmar_domain *domain;
3514 unsigned long start_pfn, last_pfn;
3515 unsigned long nrpages;
3516 unsigned long iova_pfn;
3517 struct intel_iommu *iommu;
3518 struct page *freelist;
3519 struct pci_dev *pdev = NULL;
3520
3521 domain = find_domain(dev);
3522 BUG_ON(!domain);
3523
3524 iommu = domain_get_iommu(domain);
3525
3526 iova_pfn = IOVA_PFN(dev_addr);
3527
3528 nrpages = aligned_nrpages(dev_addr, size);
3529 start_pfn = mm_to_dma_pfn(iova_pfn);
3530 last_pfn = start_pfn + nrpages - 1;
3531
3532 if (dev_is_pci(dev))
3533 pdev = to_pci_dev(dev);
3534
3535 freelist = domain_unmap(domain, start_pfn, last_pfn);
3536 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537 !has_iova_flush_queue(&domain->iovad)) {
3538 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539 nrpages, !freelist, 0);
3540 /* free iova */
3541 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542 dma_free_pagelist(freelist);
3543 } else {
3544 queue_iova(&domain->iovad, iova_pfn, nrpages,
3545 (unsigned long)freelist);
3546 /*
3547 * Queue up the release of the unmap to save roughly 1/6th of
3548 * the CPU time used up by the iotlb flush operation...
3549 */
3550 }
3551
3552 trace_unmap_single(dev, dev_addr, size);
3553}
3554
3555static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556 size_t size, enum dma_data_direction dir,
3557 unsigned long attrs)
3558{
3559 intel_unmap(dev, dev_addr, size);
3560}
3561
3562static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563 size_t size, enum dma_data_direction dir, unsigned long attrs)
3564{
3565 intel_unmap(dev, dev_addr, size);
3566}
3567
3568static void *intel_alloc_coherent(struct device *dev, size_t size,
3569 dma_addr_t *dma_handle, gfp_t flags,
3570 unsigned long attrs)
3571{
3572 struct page *page = NULL;
3573 int order;
3574
3575 if (unlikely(attach_deferred(dev)))
3576 do_deferred_attach(dev);
3577
3578 size = PAGE_ALIGN(size);
3579 order = get_order(size);
3580
3581 if (gfpflags_allow_blocking(flags)) {
3582 unsigned int count = size >> PAGE_SHIFT;
3583
3584 page = dma_alloc_from_contiguous(dev, count, order,
3585 flags & __GFP_NOWARN);
3586 }
3587
3588 if (!page)
3589 page = alloc_pages(flags, order);
3590 if (!page)
3591 return NULL;
3592 memset(page_address(page), 0, size);
3593
3594 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595 DMA_BIDIRECTIONAL,
3596 dev->coherent_dma_mask);
3597 if (*dma_handle != DMA_MAPPING_ERROR)
3598 return page_address(page);
3599 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600 __free_pages(page, order);
3601
3602 return NULL;
3603}
3604
3605static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606 dma_addr_t dma_handle, unsigned long attrs)
3607{
3608 int order;
3609 struct page *page = virt_to_page(vaddr);
3610
3611 size = PAGE_ALIGN(size);
3612 order = get_order(size);
3613
3614 intel_unmap(dev, dma_handle, size);
3615 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616 __free_pages(page, order);
3617}
3618
3619static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620 int nelems, enum dma_data_direction dir,
3621 unsigned long attrs)
3622{
3623 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624 unsigned long nrpages = 0;
3625 struct scatterlist *sg;
3626 int i;
3627
3628 for_each_sg(sglist, sg, nelems, i) {
3629 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630 }
3631
3632 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633
3634 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635}
3636
3637static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638 enum dma_data_direction dir, unsigned long attrs)
3639{
3640 int i;
3641 struct dmar_domain *domain;
3642 size_t size = 0;
3643 int prot = 0;
3644 unsigned long iova_pfn;
3645 int ret;
3646 struct scatterlist *sg;
3647 unsigned long start_vpfn;
3648 struct intel_iommu *iommu;
3649
3650 BUG_ON(dir == DMA_NONE);
3651
3652 if (unlikely(attach_deferred(dev)))
3653 do_deferred_attach(dev);
3654
3655 domain = find_domain(dev);
3656 if (!domain)
3657 return 0;
3658
3659 iommu = domain_get_iommu(domain);
3660
3661 for_each_sg(sglist, sg, nelems, i)
3662 size += aligned_nrpages(sg->offset, sg->length);
3663
3664 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665 *dev->dma_mask);
3666 if (!iova_pfn) {
3667 sglist->dma_length = 0;
3668 return 0;
3669 }
3670
3671 /*
3672 * Check if DMAR supports zero-length reads on write-only
3673 * mappings.
3674 */
3675 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3676 !cap_zlr(iommu->cap))
3677 prot |= DMA_PTE_READ;
3678 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679 prot |= DMA_PTE_WRITE;
3680
3681 start_vpfn = mm_to_dma_pfn(iova_pfn);
3682
3683 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684 if (unlikely(ret)) {
3685 dma_pte_free_pagetable(domain, start_vpfn,
3686 start_vpfn + size - 1,
3687 agaw_to_level(domain->agaw) + 1);
3688 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689 return 0;
3690 }
3691
3692 for_each_sg(sglist, sg, nelems, i)
3693 trace_map_sg(dev, i + 1, nelems, sg);
3694
3695 return nelems;
3696}
3697
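/*
 * Illustrative note: because the IOMMU remaps DMA addresses and
 * intel_alloc_iova() typically prefers IOVAs below 4GiB, a 32-bit mask is
 * sufficient here regardless of where the underlying memory lives.
 */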
3698static u64 intel_get_required_mask(struct device *dev)
3699{
3700 return DMA_BIT_MASK(32);
3701}
3702
3703static const struct dma_map_ops intel_dma_ops = {
3704 .alloc = intel_alloc_coherent,
3705 .free = intel_free_coherent,
3706 .map_sg = intel_map_sg,
3707 .unmap_sg = intel_unmap_sg,
3708 .map_page = intel_map_page,
3709 .unmap_page = intel_unmap_page,
3710 .map_resource = intel_map_resource,
3711 .unmap_resource = intel_unmap_resource,
3712 .dma_supported = dma_direct_supported,
3713 .mmap = dma_common_mmap,
3714 .get_sgtable = dma_common_get_sgtable,
3715 .get_required_mask = intel_get_required_mask,
3716};
3717
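/*
 * The bounce_* implementations below mirror the DMA API entry points above
 * but bounce buffers that do not fill whole VT-d pages through swiotlb; the
 * intent (for untrusted devices) is that a device can never reach unrelated
 * data sharing an IOMMU page with its buffer.
 */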
3718static void
3719bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720 enum dma_data_direction dir, enum dma_sync_target target)
3721{
3722 struct dmar_domain *domain;
3723 phys_addr_t tlb_addr;
3724
3725 domain = find_domain(dev);
3726 if (WARN_ON(!domain))
3727 return;
3728
3729 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730 if (is_swiotlb_buffer(tlb_addr))
3731 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732}
3733
3734static dma_addr_t
3735bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736 enum dma_data_direction dir, unsigned long attrs,
3737 u64 dma_mask)
3738{
3739 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740 struct dmar_domain *domain;
3741 struct intel_iommu *iommu;
3742 unsigned long iova_pfn;
3743 unsigned long nrpages;
3744 phys_addr_t tlb_addr;
3745 int prot = 0;
3746 int ret;
3747
3748 if (unlikely(attach_deferred(dev)))
3749 do_deferred_attach(dev);
3750
3751 domain = find_domain(dev);
3752
3753 if (WARN_ON(dir == DMA_NONE || !domain))
3754 return DMA_MAPPING_ERROR;
3755
3756 iommu = domain_get_iommu(domain);
3757 if (WARN_ON(!iommu))
3758 return DMA_MAPPING_ERROR;
3759
3760 nrpages = aligned_nrpages(0, size);
3761 iova_pfn = intel_alloc_iova(dev, domain,
3762 dma_to_mm_pfn(nrpages), dma_mask);
3763 if (!iova_pfn)
3764 return DMA_MAPPING_ERROR;
3765
3766 /*
3767 * Check if DMAR supports zero-length reads on write-only
3768 * mappings.
3769 */
3770 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771 !cap_zlr(iommu->cap))
3772 prot |= DMA_PTE_READ;
3773 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774 prot |= DMA_PTE_WRITE;
3775
3776 /*
3777 * If both the physical buffer start address and size are
3778 * page aligned, we don't need to use a bounce page.
3779 */
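	/*
	 * Illustrative example: an unaligned 0x1200-byte buffer gets
	 * aligned_size = 0x2000 and is bounced through swiotlb below; for a
	 * to-device mapping the trailing 0xe00 bytes of padding are zeroed so
	 * the device cannot observe stale data from the bounce slot.
	 */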
3780 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781 tlb_addr = swiotlb_tbl_map_single(dev,
3782 __phys_to_dma(dev, io_tlb_start),
3783 paddr, size, aligned_size, dir, attrs);
3784 if (tlb_addr == DMA_MAPPING_ERROR) {
3785 goto swiotlb_error;
3786 } else {
3787 /* Cleanup the padding area. */
3788 void *padding_start = phys_to_virt(tlb_addr);
3789 size_t padding_size = aligned_size;
3790
3791 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792 (dir == DMA_TO_DEVICE ||
3793 dir == DMA_BIDIRECTIONAL)) {
3794 padding_start += size;
3795 padding_size -= size;
3796 }
3797
3798 memset(padding_start, 0, padding_size);
3799 }
3800 } else {
3801 tlb_addr = paddr;
3802 }
3803
3804 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806 if (ret)
3807 goto mapping_error;
3808
3809 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810
3811 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812
3813mapping_error:
3814 if (is_swiotlb_buffer(tlb_addr))
3815 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816 aligned_size, dir, attrs);
3817swiotlb_error:
3818 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820 size, (unsigned long long)paddr, dir);
3821
3822 return DMA_MAPPING_ERROR;
3823}
3824
3825static void
3826bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827 enum dma_data_direction dir, unsigned long attrs)
3828{
3829 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830 struct dmar_domain *domain;
3831 phys_addr_t tlb_addr;
3832
3833 domain = find_domain(dev);
3834 if (WARN_ON(!domain))
3835 return;
3836
3837 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838 if (WARN_ON(!tlb_addr))
3839 return;
3840
3841 intel_unmap(dev, dev_addr, size);
3842 if (is_swiotlb_buffer(tlb_addr))
3843 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844 aligned_size, dir, attrs);
3845
3846 trace_bounce_unmap_single(dev, dev_addr, size);
3847}
3848
3849static dma_addr_t
3850bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851 size_t size, enum dma_data_direction dir, unsigned long attrs)
3852{
3853 return bounce_map_single(dev, page_to_phys(page) + offset,
3854 size, dir, attrs, *dev->dma_mask);
3855}
3856
3857static dma_addr_t
3858bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859 enum dma_data_direction dir, unsigned long attrs)
3860{
3861 return bounce_map_single(dev, phys_addr, size,
3862 dir, attrs, *dev->dma_mask);
3863}
3864
3865static void
3866bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867 enum dma_data_direction dir, unsigned long attrs)
3868{
3869 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870}
3871
3872static void
3873bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874 enum dma_data_direction dir, unsigned long attrs)
3875{
3876 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877}
3878
3879static void
3880bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881 enum dma_data_direction dir, unsigned long attrs)
3882{
3883 struct scatterlist *sg;
3884 int i;
3885
3886 for_each_sg(sglist, sg, nelems, i)
3887 bounce_unmap_page(dev, sg->dma_address,
3888 sg_dma_len(sg), dir, attrs);
3889}
3890
3891static int
3892bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893 enum dma_data_direction dir, unsigned long attrs)
3894{
3895 int i;
3896 struct scatterlist *sg;
3897
3898 for_each_sg(sglist, sg, nelems, i) {
3899 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900 sg->offset, sg->length,
3901 dir, attrs);
3902 if (sg->dma_address == DMA_MAPPING_ERROR)
3903 goto out_unmap;
3904 sg_dma_len(sg) = sg->length;
3905 }
3906
3907 for_each_sg(sglist, sg, nelems, i)
3908 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909
3910 return nelems;
3911
3912out_unmap:
3913 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914 return 0;
3915}
3916
3917static void
3918bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919 size_t size, enum dma_data_direction dir)
3920{
3921 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922}
3923
3924static void
3925bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926 size_t size, enum dma_data_direction dir)
3927{
3928 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929}
3930
3931static void
3932bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933 int nelems, enum dma_data_direction dir)
3934{
3935 struct scatterlist *sg;
3936 int i;
3937
3938 for_each_sg(sglist, sg, nelems, i)
3939 bounce_sync_single(dev, sg_dma_address(sg),
3940 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941}
3942
3943static void
3944bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945 int nelems, enum dma_data_direction dir)
3946{
3947 struct scatterlist *sg;
3948 int i;
3949
3950 for_each_sg(sglist, sg, nelems, i)
3951 bounce_sync_single(dev, sg_dma_address(sg),
3952 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953}
3954
3955static const struct dma_map_ops bounce_dma_ops = {
3956 .alloc = intel_alloc_coherent,
3957 .free = intel_free_coherent,
3958 .map_sg = bounce_map_sg,
3959 .unmap_sg = bounce_unmap_sg,
3960 .map_page = bounce_map_page,
3961 .unmap_page = bounce_unmap_page,
3962 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3963 .sync_single_for_device = bounce_sync_single_for_device,
3964 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3965 .sync_sg_for_device = bounce_sync_sg_for_device,
3966 .map_resource = bounce_map_resource,
3967 .unmap_resource = bounce_unmap_resource,
3968 .dma_supported = dma_direct_supported,
3969};
3970
3971static inline int iommu_domain_cache_init(void)
3972{
3973 int ret = 0;
3974
3975 iommu_domain_cache = kmem_cache_create("iommu_domain",
3976 sizeof(struct dmar_domain),
3977 0,
3978					SLAB_HWCACHE_ALIGN,
3980					NULL);
3981 if (!iommu_domain_cache) {
3982 pr_err("Couldn't create iommu_domain cache\n");
3983 ret = -ENOMEM;
3984 }
3985
3986 return ret;
3987}
3988
3989static inline int iommu_devinfo_cache_init(void)
3990{
3991 int ret = 0;
3992
3993 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994 sizeof(struct device_domain_info),
3995 0,
3996 SLAB_HWCACHE_ALIGN,
3997 NULL);
3998 if (!iommu_devinfo_cache) {
3999 pr_err("Couldn't create devinfo cache\n");
4000 ret = -ENOMEM;
4001 }
4002
4003 return ret;
4004}
4005
4006static int __init iommu_init_mempool(void)
4007{
4008 int ret;
4009 ret = iova_cache_get();
4010 if (ret)
4011 return ret;
4012
4013 ret = iommu_domain_cache_init();
4014 if (ret)
4015 goto domain_error;
4016
4017 ret = iommu_devinfo_cache_init();
4018 if (!ret)
4019 return ret;
4020
4021 kmem_cache_destroy(iommu_domain_cache);
4022domain_error:
4023 iova_cache_put();
4024
4025 return -ENOMEM;
4026}
4027
4028static void __init iommu_exit_mempool(void)
4029{
4030 kmem_cache_destroy(iommu_devinfo_cache);
4031 kmem_cache_destroy(iommu_domain_cache);
4032 iova_cache_put();
4033}
4034
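/*
 * Walk the DRHD units and mark as ignored those that cannot do any useful
 * remapping: units whose device scope contains no active devices, and units
 * that cover *only* graphics devices when dmar_map_gfx is disabled.
 */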
4035static void __init init_no_remapping_devices(void)
4036{
4037 struct dmar_drhd_unit *drhd;
4038 struct device *dev;
4039 int i;
4040
4041 for_each_drhd_unit(drhd) {
4042 if (!drhd->include_all) {
4043 for_each_active_dev_scope(drhd->devices,
4044 drhd->devices_cnt, i, dev)
4045 break;
4046 /* ignore DMAR unit if no devices exist */
4047 if (i == drhd->devices_cnt)
4048 drhd->ignored = 1;
4049 }
4050 }
4051
4052 for_each_active_drhd_unit(drhd) {
4053 if (drhd->include_all)
4054 continue;
4055
4056 for_each_active_dev_scope(drhd->devices,
4057 drhd->devices_cnt, i, dev)
4058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059 break;
4060 if (i < drhd->devices_cnt)
4061 continue;
4062
4063		/* This IOMMU has *only* gfx devices. Either bypass it or
4064		   set the gfx_dedicated flag, as appropriate. */
4065 drhd->gfx_dedicated = 1;
4066 if (!dmar_map_gfx)
4067 drhd->ignored = 1;
4068 }
4069}
4070
4071#ifdef CONFIG_SUSPEND
4072static int init_iommu_hw(void)
4073{
4074 struct dmar_drhd_unit *drhd;
4075 struct intel_iommu *iommu = NULL;
4076
4077 for_each_active_iommu(iommu, drhd)
4078 if (iommu->qi)
4079 dmar_reenable_qi(iommu);
4080
4081 for_each_iommu(iommu, drhd) {
4082 if (drhd->ignored) {
4083 /*
4084 * we always have to disable PMRs or DMA may fail on
4085 * this device
4086 */
4087 if (force_on)
4088 iommu_disable_protect_mem_regions(iommu);
4089 continue;
4090 }
4091
4092 iommu_flush_write_buffer(iommu);
4093
4094 iommu_set_root_entry(iommu);
4095
4096 iommu->flush.flush_context(iommu, 0, 0, 0,
4097 DMA_CCMD_GLOBAL_INVL);
4098 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099 iommu_enable_translation(iommu);
4100 iommu_disable_protect_mem_regions(iommu);
4101 }
4102
4103 return 0;
4104}
4105
4106static void iommu_flush_all(void)
4107{
4108 struct dmar_drhd_unit *drhd;
4109 struct intel_iommu *iommu;
4110
4111 for_each_active_iommu(iommu, drhd) {
4112 iommu->flush.flush_context(iommu, 0, 0, 0,
4113 DMA_CCMD_GLOBAL_INVL);
4114 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115 DMA_TLB_GLOBAL_FLUSH);
4116 }
4117}
4118
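/*
 * Save the fault-event registers of every active IOMMU and disable
 * translation before the system enters a sleep state; iommu_resume()
 * re-initializes the hardware and writes these registers back.
 */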
4119static int iommu_suspend(void)
4120{
4121 struct dmar_drhd_unit *drhd;
4122 struct intel_iommu *iommu = NULL;
4123 unsigned long flag;
4124
4125 for_each_active_iommu(iommu, drhd) {
4126 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127 GFP_ATOMIC);
4128 if (!iommu->iommu_state)
4129 goto nomem;
4130 }
4131
4132 iommu_flush_all();
4133
4134 for_each_active_iommu(iommu, drhd) {
4135 iommu_disable_translation(iommu);
4136
4137 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138
4139 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140 readl(iommu->reg + DMAR_FECTL_REG);
4141 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142 readl(iommu->reg + DMAR_FEDATA_REG);
4143 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144 readl(iommu->reg + DMAR_FEADDR_REG);
4145 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146 readl(iommu->reg + DMAR_FEUADDR_REG);
4147
4148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149 }
4150 return 0;
4151
4152nomem:
4153 for_each_active_iommu(iommu, drhd)
4154 kfree(iommu->iommu_state);
4155
4156 return -ENOMEM;
4157}
4158
4159static void iommu_resume(void)
4160{
4161 struct dmar_drhd_unit *drhd;
4162 struct intel_iommu *iommu = NULL;
4163 unsigned long flag;
4164
4165 if (init_iommu_hw()) {
4166 if (force_on)
4167 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168 else
4169 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170 return;
4171 }
4172
4173 for_each_active_iommu(iommu, drhd) {
4174
4175 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176
4177 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178 iommu->reg + DMAR_FECTL_REG);
4179 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180 iommu->reg + DMAR_FEDATA_REG);
4181 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182 iommu->reg + DMAR_FEADDR_REG);
4183 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184 iommu->reg + DMAR_FEUADDR_REG);
4185
4186 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187 }
4188
4189 for_each_active_iommu(iommu, drhd)
4190 kfree(iommu->iommu_state);
4191}
4192
4193static struct syscore_ops iommu_syscore_ops = {
4194 .resume = iommu_resume,
4195 .suspend = iommu_suspend,
4196};
4197
4198static void __init init_iommu_pm_ops(void)
4199{
4200 register_syscore_ops(&iommu_syscore_ops);
4201}
4202
4203#else
4204static inline void init_iommu_pm_ops(void) {}
4205#endif /* CONFIG_SUSPEND */
4206
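/*
 * An RMRR must describe a page-aligned, non-empty region and pass any
 * architecture-specific checks; anything else is treated as a firmware
 * bug by the caller, which warns and taints the kernel.
 */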
4207static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208{
4209 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211 rmrr->end_address <= rmrr->base_address ||
4212 arch_rmrr_sanity_check(rmrr))
4213 return -EINVAL;
4214
4215 return 0;
4216}
4217
4218int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219{
4220 struct acpi_dmar_reserved_memory *rmrr;
4221 struct dmar_rmrr_unit *rmrru;
4222
4223 rmrr = (struct acpi_dmar_reserved_memory *)header;
4224 if (rmrr_sanity_check(rmrr)) {
4225 pr_warn(FW_BUG
4226 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228 rmrr->base_address, rmrr->end_address,
4229 dmi_get_system_info(DMI_BIOS_VENDOR),
4230 dmi_get_system_info(DMI_BIOS_VERSION),
4231 dmi_get_system_info(DMI_PRODUCT_VERSION));
4232 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233 }
4234
4235 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236 if (!rmrru)
4237 goto out;
4238
4239 rmrru->hdr = header;
4240
4241 rmrru->base_address = rmrr->base_address;
4242 rmrru->end_address = rmrr->end_address;
4243
4244 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245 ((void *)rmrr) + rmrr->header.length,
4246 &rmrru->devices_cnt);
4247 if (rmrru->devices_cnt && rmrru->devices == NULL)
4248 goto free_rmrru;
4249
4250 list_add(&rmrru->list, &dmar_rmrr_units);
4251
4252 return 0;
4253free_rmrru:
4254 kfree(rmrru);
4255out:
4256 return -ENOMEM;
4257}
4258
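/* Look up an already-parsed ATSR unit matching the given segment and content. */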
4259static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260{
4261 struct dmar_atsr_unit *atsru;
4262 struct acpi_dmar_atsr *tmp;
4263
4264 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265 dmar_rcu_check()) {
4266 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267 if (atsr->segment != tmp->segment)
4268 continue;
4269 if (atsr->header.length != tmp->header.length)
4270 continue;
4271 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272 return atsru;
4273 }
4274
4275 return NULL;
4276}
4277
4278int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279{
4280 struct acpi_dmar_atsr *atsr;
4281 struct dmar_atsr_unit *atsru;
4282
4283 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284 return 0;
4285
4286 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287 atsru = dmar_find_atsr(atsr);
4288 if (atsru)
4289 return 0;
4290
4291 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292 if (!atsru)
4293 return -ENOMEM;
4294
4295 /*
4296 * If memory is allocated from slab by ACPI _DSM method, we need to
4297 * copy the memory content because the memory buffer will be freed
4298 * on return.
4299 */
4300 atsru->hdr = (void *)(atsru + 1);
4301 memcpy(atsru->hdr, hdr, hdr->length);
4302 atsru->include_all = atsr->flags & 0x1;
4303 if (!atsru->include_all) {
4304 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305 (void *)atsr + atsr->header.length,
4306 &atsru->devices_cnt);
4307 if (atsru->devices_cnt && atsru->devices == NULL) {
4308 kfree(atsru);
4309 return -ENOMEM;
4310 }
4311 }
4312
4313 list_add_rcu(&atsru->list, &dmar_atsr_units);
4314
4315 return 0;
4316}
4317
4318static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319{
4320 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321 kfree(atsru);
4322}
4323
4324int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325{
4326 struct acpi_dmar_atsr *atsr;
4327 struct dmar_atsr_unit *atsru;
4328
4329 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330 atsru = dmar_find_atsr(atsr);
4331 if (atsru) {
4332 list_del_rcu(&atsru->list);
4333 synchronize_rcu();
4334 intel_iommu_free_atsr(atsru);
4335 }
4336
4337 return 0;
4338}
4339
4340int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341{
4342 int i;
4343 struct device *dev;
4344 struct acpi_dmar_atsr *atsr;
4345 struct dmar_atsr_unit *atsru;
4346
4347 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348 atsru = dmar_find_atsr(atsr);
4349 if (!atsru)
4350 return 0;
4351
4352 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354 i, dev)
4355 return -EBUSY;
4356 }
4357
4358 return 0;
4359}
4360
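/*
 * Bring up a hot-added DMAR unit: verify that its capabilities match what
 * the rest of the system already relies on (pass-through, snooping, large
 * pages), then set up domains, the root entry, the invalidation queue and
 * interrupts, and finally enable translation.
 */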
4361static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362{
4363 int sp, ret;
4364 struct intel_iommu *iommu = dmaru->iommu;
4365
4366 if (g_iommus[iommu->seq_id])
4367 return 0;
4368
4369 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370 pr_warn("%s: Doesn't support hardware pass through.\n",
4371 iommu->name);
4372 return -ENXIO;
4373 }
4374 if (!ecap_sc_support(iommu->ecap) &&
4375 domain_update_iommu_snooping(iommu)) {
4376 pr_warn("%s: Doesn't support snooping.\n",
4377 iommu->name);
4378 return -ENXIO;
4379 }
4380 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382 pr_warn("%s: Doesn't support large page.\n",
4383 iommu->name);
4384 return -ENXIO;
4385 }
4386
4387 /*
4388 * Disable translation if already enabled prior to OS handover.
4389 */
4390 if (iommu->gcmd & DMA_GCMD_TE)
4391 iommu_disable_translation(iommu);
4392
4393 g_iommus[iommu->seq_id] = iommu;
4394 ret = iommu_init_domains(iommu);
4395 if (ret == 0)
4396 ret = iommu_alloc_root_entry(iommu);
4397 if (ret)
4398 goto out;
4399
4400 intel_svm_check(iommu);
4401
4402 if (dmaru->ignored) {
4403 /*
4404 * we always have to disable PMRs or DMA may fail on this device
4405 */
4406 if (force_on)
4407 iommu_disable_protect_mem_regions(iommu);
4408 return 0;
4409 }
4410
4411 intel_iommu_init_qi(iommu);
4412 iommu_flush_write_buffer(iommu);
4413
4414#ifdef CONFIG_INTEL_IOMMU_SVM
4415 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416 ret = intel_svm_enable_prq(iommu);
4417 if (ret)
4418 goto disable_iommu;
4419 }
4420#endif
4421 ret = dmar_set_interrupt(iommu);
4422 if (ret)
4423 goto disable_iommu;
4424
4425 iommu_set_root_entry(iommu);
4426 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428 iommu_enable_translation(iommu);
4429
4430 iommu_disable_protect_mem_regions(iommu);
4431 return 0;
4432
4433disable_iommu:
4434 disable_dmar_iommu(iommu);
4435out:
4436 free_dmar_iommu(iommu);
4437 return ret;
4438}
4439
4440int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441{
4442 int ret = 0;
4443 struct intel_iommu *iommu = dmaru->iommu;
4444
4445 if (!intel_iommu_enabled)
4446 return 0;
4447 if (iommu == NULL)
4448 return -EINVAL;
4449
4450 if (insert) {
4451 ret = intel_iommu_add(dmaru);
4452 } else {
4453 disable_dmar_iommu(iommu);
4454 free_dmar_iommu(iommu);
4455 }
4456
4457 return ret;
4458}
4459
4460static void intel_iommu_free_dmars(void)
4461{
4462 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463 struct dmar_atsr_unit *atsru, *atsr_n;
4464
4465 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466 list_del(&rmrru->list);
4467 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468 kfree(rmrru);
4469 }
4470
4471 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472 list_del(&atsru->list);
4473 intel_iommu_free_atsr(atsru);
4474 }
4475}
4476
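/*
 * Decide whether ATS may be used for @dev: walk up to the root port and
 * check whether it is listed in (or covered by an include-all) ATSR on the
 * same PCI segment. Returns 1 if ATS is allowed, 0 otherwise.
 */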
4477int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478{
4479 int i, ret = 1;
4480 struct pci_bus *bus;
4481 struct pci_dev *bridge = NULL;
4482 struct device *tmp;
4483 struct acpi_dmar_atsr *atsr;
4484 struct dmar_atsr_unit *atsru;
4485
4486 dev = pci_physfn(dev);
4487 for (bus = dev->bus; bus; bus = bus->parent) {
4488 bridge = bus->self;
4489 /* If it's an integrated device, allow ATS */
4490 if (!bridge)
4491 return 1;
4492 /* Connected via non-PCIe: no ATS */
4493 if (!pci_is_pcie(bridge) ||
4494 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495 return 0;
4496 /* If we found the root port, look it up in the ATSR */
4497 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498 break;
4499 }
4500
4501 rcu_read_lock();
4502 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504 if (atsr->segment != pci_domain_nr(dev->bus))
4505 continue;
4506
4507 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508 if (tmp == &bridge->dev)
4509 goto out;
4510
4511 if (atsru->include_all)
4512 goto out;
4513 }
4514 ret = 0;
4515out:
4516 rcu_read_unlock();
4517
4518 return ret;
4519}
4520
4521int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522{
4523 int ret;
4524 struct dmar_rmrr_unit *rmrru;
4525 struct dmar_atsr_unit *atsru;
4526 struct acpi_dmar_atsr *atsr;
4527 struct acpi_dmar_reserved_memory *rmrr;
4528
4529 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530 return 0;
4531
4532 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533 rmrr = container_of(rmrru->hdr,
4534 struct acpi_dmar_reserved_memory, header);
4535 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537 ((void *)rmrr) + rmrr->header.length,
4538 rmrr->segment, rmrru->devices,
4539 rmrru->devices_cnt);
4540 if (ret < 0)
4541 return ret;
4542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543 dmar_remove_dev_scope(info, rmrr->segment,
4544 rmrru->devices, rmrru->devices_cnt);
4545 }
4546 }
4547
4548 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549 if (atsru->include_all)
4550 continue;
4551
4552 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555 (void *)atsr + atsr->header.length,
4556 atsr->segment, atsru->devices,
4557 atsru->devices_cnt);
4558 if (ret > 0)
4559 break;
4560 else if (ret < 0)
4561 return ret;
4562 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563 if (dmar_remove_dev_scope(info, atsr->segment,
4564 atsru->devices, atsru->devices_cnt))
4565 break;
4566 }
4567 }
4568
4569 return 0;
4570}
4571
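/*
 * Keep the static identity (si) domain in sync with memory hotplug:
 * newly onlined ranges are identity-mapped, and offlined ranges are
 * unmapped with the IOTLBs flushed on every active IOMMU.
 */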
4572static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573 unsigned long val, void *v)
4574{
4575 struct memory_notify *mhp = v;
4576 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578 mhp->nr_pages - 1);
4579
4580 switch (val) {
4581 case MEM_GOING_ONLINE:
4582 if (iommu_domain_identity_map(si_domain,
4583 start_vpfn, last_vpfn)) {
4584 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4586 return NOTIFY_BAD;
4587 }
4588 break;
4589
4590 case MEM_OFFLINE:
4591 case MEM_CANCEL_ONLINE:
4592 {
4593 struct dmar_drhd_unit *drhd;
4594 struct intel_iommu *iommu;
4595 struct page *freelist;
4596
4597 freelist = domain_unmap(si_domain,
4598 start_vpfn, last_vpfn);
4599
4600 rcu_read_lock();
4601 for_each_active_iommu(iommu, drhd)
4602 iommu_flush_iotlb_psi(iommu, si_domain,
4603 start_vpfn, mhp->nr_pages,
4604 !freelist, 0);
4605 rcu_read_unlock();
4606 dma_free_pagelist(freelist);
4607 }
4608 break;
4609 }
4610
4611 return NOTIFY_OK;
4612}
4613
4614static struct notifier_block intel_iommu_memory_nb = {
4615 .notifier_call = intel_iommu_memory_notifier,
4616 .priority = 0
4617};
4618
4619static void free_all_cpu_cached_iovas(unsigned int cpu)
4620{
4621 int i;
4622
4623 for (i = 0; i < g_num_of_iommus; i++) {
4624 struct intel_iommu *iommu = g_iommus[i];
4625 struct dmar_domain *domain;
4626 int did;
4627
4628 if (!iommu)
4629 continue;
4630
4631 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632 domain = get_iommu_domain(iommu, (u16)did);
4633
4634 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635 continue;
4636
4637 free_cpu_cached_iovas(cpu, &domain->iovad);
4638 }
4639 }
4640}
4641
4642static int intel_iommu_cpu_dead(unsigned int cpu)
4643{
4644 free_all_cpu_cached_iovas(cpu);
4645 return 0;
4646}
4647
4648static void intel_disable_iommus(void)
4649{
4650 struct intel_iommu *iommu = NULL;
4651 struct dmar_drhd_unit *drhd;
4652
4653 for_each_iommu(iommu, drhd)
4654 iommu_disable_translation(iommu);
4655}
4656
4657void intel_iommu_shutdown(void)
4658{
4659 struct dmar_drhd_unit *drhd;
4660 struct intel_iommu *iommu = NULL;
4661
4662 if (no_iommu || dmar_disabled)
4663 return;
4664
4665 down_write(&dmar_global_lock);
4666
4667 /* Disable PMRs explicitly here. */
4668 for_each_iommu(iommu, drhd)
4669 iommu_disable_protect_mem_regions(iommu);
4670
4671 /* Make sure the IOMMUs are switched off */
4672 intel_disable_iommus();
4673
4674 up_write(&dmar_global_lock);
4675}
4676
4677static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678{
4679 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680
4681 return container_of(iommu_dev, struct intel_iommu, iommu);
4682}
4683
4684static ssize_t intel_iommu_show_version(struct device *dev,
4685 struct device_attribute *attr,
4686 char *buf)
4687{
4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690 return sprintf(buf, "%d:%d\n",
4691 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692}
4693static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694
4695static ssize_t intel_iommu_show_address(struct device *dev,
4696 struct device_attribute *attr,
4697 char *buf)
4698{
4699 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700 return sprintf(buf, "%llx\n", iommu->reg_phys);
4701}
4702static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703
4704static ssize_t intel_iommu_show_cap(struct device *dev,
4705 struct device_attribute *attr,
4706 char *buf)
4707{
4708 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709 return sprintf(buf, "%llx\n", iommu->cap);
4710}
4711static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712
4713static ssize_t intel_iommu_show_ecap(struct device *dev,
4714 struct device_attribute *attr,
4715 char *buf)
4716{
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%llx\n", iommu->ecap);
4719}
4720static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721
4722static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723 struct device_attribute *attr,
4724 char *buf)
4725{
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728}
4729static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730
4731static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732 struct device_attribute *attr,
4733 char *buf)
4734{
4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737 cap_ndoms(iommu->cap)));
4738}
4739static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740
4741static struct attribute *intel_iommu_attrs[] = {
4742 &dev_attr_version.attr,
4743 &dev_attr_address.attr,
4744 &dev_attr_cap.attr,
4745 &dev_attr_ecap.attr,
4746 &dev_attr_domains_supported.attr,
4747 &dev_attr_domains_used.attr,
4748 NULL,
4749};
4750
4751static struct attribute_group intel_iommu_group = {
4752 .name = "intel-iommu",
4753 .attrs = intel_iommu_attrs,
4754};
4755
4756const struct attribute_group *intel_iommu_groups[] = {
4757 &intel_iommu_group,
4758 NULL,
4759};
4760
4761static inline bool has_external_pci(void)
4762{
4763 struct pci_dev *pdev = NULL;
4764
4765 for_each_pci_dev(pdev)
4766 if (pdev->external_facing)
4767 return true;
4768
4769 return false;
4770}
4771
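/*
 * Honor the DMAR platform opt-in flag: if the firmware requests DMA
 * protection and an external-facing PCI port is present, force the IOMMU
 * on even when it was disabled on the command line.
 */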
4772static int __init platform_optin_force_iommu(void)
4773{
4774 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775 return 0;
4776
4777 if (no_iommu || dmar_disabled)
4778 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779
4780 /*
4781 * If Intel-IOMMU is disabled by default, we will apply identity
4782 * map for all devices except those marked as being untrusted.
4783 */
4784 if (dmar_disabled)
4785 iommu_set_default_passthrough(false);
4786
4787 dmar_disabled = 0;
4788 no_iommu = 0;
4789
4790 return 1;
4791}
4792
4793static int __init probe_acpi_namespace_devices(void)
4794{
4795 struct dmar_drhd_unit *drhd;
4796 /* To avoid a -Wunused-but-set-variable warning. */
4797 struct intel_iommu *iommu __maybe_unused;
4798 struct device *dev;
4799 int i, ret = 0;
4800
4801 for_each_active_iommu(iommu, drhd) {
4802 for_each_active_dev_scope(drhd->devices,
4803 drhd->devices_cnt, i, dev) {
4804 struct acpi_device_physical_node *pn;
4805 struct iommu_group *group;
4806 struct acpi_device *adev;
4807
4808 if (dev->bus != &acpi_bus_type)
4809 continue;
4810
4811 adev = to_acpi_device(dev);
4812 mutex_lock(&adev->physical_node_lock);
4813 list_for_each_entry(pn,
4814 &adev->physical_node_list, node) {
4815 group = iommu_group_get(pn->dev);
4816 if (group) {
4817 iommu_group_put(group);
4818 continue;
4819 }
4820
4821 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822 ret = iommu_probe_device(pn->dev);
4823 if (ret)
4824 break;
4825 }
4826 mutex_unlock(&adev->physical_node_lock);
4827
4828 if (ret)
4829 return ret;
4830 }
4831 }
4832
4833 return 0;
4834}
4835
4836int __init intel_iommu_init(void)
4837{
4838 int ret = -ENODEV;
4839 struct dmar_drhd_unit *drhd;
4840 struct intel_iommu *iommu;
4841
4842 /*
4843 * Intel IOMMU is required for a TXT/tboot launch or platform
4844 * opt in, so enforce that.
4845 */
4846 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847
4848 if (iommu_init_mempool()) {
4849 if (force_on)
4850 panic("tboot: Failed to initialize iommu memory\n");
4851 return -ENOMEM;
4852 }
4853
4854 down_write(&dmar_global_lock);
4855 if (dmar_table_init()) {
4856 if (force_on)
4857 panic("tboot: Failed to initialize DMAR table\n");
4858 goto out_free_dmar;
4859 }
4860
4861 if (dmar_dev_scope_init() < 0) {
4862 if (force_on)
4863 panic("tboot: Failed to initialize DMAR device scope\n");
4864 goto out_free_dmar;
4865 }
4866
4867 up_write(&dmar_global_lock);
4868
4869 /*
4870 * The bus notifier takes the dmar_global_lock, so lockdep will
4871 * complain later when we register it under the lock.
4872 */
4873 dmar_register_bus_notifier();
4874
4875 down_write(&dmar_global_lock);
4876
4877 if (!no_iommu)
4878 intel_iommu_debugfs_init();
4879
4880 if (no_iommu || dmar_disabled) {
4881 /*
4882 * We exit the function here to ensure IOMMU's remapping and
4883 * mempool aren't setup, which means that the IOMMU's PMRs
4884 * won't be disabled via the call to init_dmars(). So disable
4885 * it explicitly here. The PMRs were setup by tboot prior to
4886 * calling SENTER, but the kernel is expected to reset/tear
4887 * down the PMRs.
4888 */
4889 if (intel_iommu_tboot_noforce) {
4890 for_each_iommu(iommu, drhd)
4891 iommu_disable_protect_mem_regions(iommu);
4892 }
4893
4894 /*
4895 * Make sure the IOMMUs are switched off, even when we
4896 * boot into a kexec kernel and the previous kernel left
4897 * them enabled
4898 */
4899 intel_disable_iommus();
4900 goto out_free_dmar;
4901 }
4902
4903 if (list_empty(&dmar_rmrr_units))
4904 pr_info("No RMRR found\n");
4905
4906 if (list_empty(&dmar_atsr_units))
4907 pr_info("No ATSR found\n");
4908
4909 if (dmar_init_reserved_ranges()) {
4910 if (force_on)
4911 panic("tboot: Failed to reserve iommu ranges\n");
4912 goto out_free_reserved_range;
4913 }
4914
4915 if (dmar_map_gfx)
4916 intel_iommu_gfx_mapped = 1;
4917
4918 init_no_remapping_devices();
4919
4920 ret = init_dmars();
4921 if (ret) {
4922 if (force_on)
4923 panic("tboot: Failed to initialize DMARs\n");
4924 pr_err("Initialization failed\n");
4925 goto out_free_reserved_range;
4926 }
4927 up_write(&dmar_global_lock);
4928
4929 init_iommu_pm_ops();
4930
4931 down_read(&dmar_global_lock);
4932 for_each_active_iommu(iommu, drhd) {
4933 iommu_device_sysfs_add(&iommu->iommu, NULL,
4934 intel_iommu_groups,
4935 "%s", iommu->name);
4936 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937 iommu_device_register(&iommu->iommu);
4938 }
4939 up_read(&dmar_global_lock);
4940
4941 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942 if (si_domain && !hw_pass_through)
4943 register_memory_notifier(&intel_iommu_memory_nb);
4944 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945 intel_iommu_cpu_dead);
4946
4947 down_read(&dmar_global_lock);
4948 if (probe_acpi_namespace_devices())
4949 pr_warn("ACPI name space devices didn't probe correctly\n");
4950
4951 /* Finally, we enable the DMA remapping hardware. */
4952 for_each_iommu(iommu, drhd) {
4953 if (!drhd->ignored && !translation_pre_enabled(iommu))
4954 iommu_enable_translation(iommu);
4955
4956 iommu_disable_protect_mem_regions(iommu);
4957 }
4958 up_read(&dmar_global_lock);
4959
4960 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961
4962 intel_iommu_enabled = 1;
4963
4964 return 0;
4965
4966out_free_reserved_range:
4967 put_iova_domain(&reserved_iova_list);
4968out_free_dmar:
4969 intel_iommu_free_dmars();
4970 up_write(&dmar_global_lock);
4971 iommu_exit_mempool();
4972 return ret;
4973}
4974
4975static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976{
4977 struct intel_iommu *iommu = opaque;
4978
4979 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980 return 0;
4981}
4982
4983/*
4984 * NB - intel-iommu lacks any sort of reference counting for the users of
4985 * dependent devices. If multiple endpoints have intersecting dependent
4986 * devices, unbinding the driver from any one of them will possibly leave
4987 * the others unable to operate.
4988 */
4989static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990{
4991 if (!iommu || !dev || !dev_is_pci(dev))
4992 return;
4993
4994 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995}
4996
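/*
 * Tear down one device's attachment: clear its PASID and context entries,
 * disable the device IOTLB, detach the domain from the IOMMU and free the
 * per-device info. Caller must hold device_domain_lock.
 */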
4997static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998{
4999 struct dmar_domain *domain;
5000 struct intel_iommu *iommu;
5001 unsigned long flags;
5002
5003 assert_spin_locked(&device_domain_lock);
5004
5005 if (WARN_ON(!info))
5006 return;
5007
5008 iommu = info->iommu;
5009 domain = info->domain;
5010
5011 if (info->dev) {
5012 if (dev_is_pci(info->dev) && sm_supported(iommu))
5013 intel_pasid_tear_down_entry(iommu, info->dev,
5014 PASID_RID2PASID, false);
5015
5016 iommu_disable_dev_iotlb(info);
5017 if (!dev_is_real_dma_subdevice(info->dev))
5018 domain_context_clear(iommu, info->dev);
5019 intel_pasid_free_table(info->dev);
5020 }
5021
5022 unlink_domain_info(info);
5023
5024 spin_lock_irqsave(&iommu->lock, flags);
5025 domain_detach_iommu(domain, iommu);
5026 spin_unlock_irqrestore(&iommu->lock, flags);
5027
5028 free_devinfo_mem(info);
5029}
5030
5031static void dmar_remove_one_dev_info(struct device *dev)
5032{
5033 struct device_domain_info *info;
5034 unsigned long flags;
5035
5036 spin_lock_irqsave(&device_domain_lock, flags);
5037 info = get_domain_info(dev);
5038 if (info)
5039 __dmar_remove_one_dev_info(info);
5040 spin_unlock_irqrestore(&device_domain_lock, flags);
5041}
5042
5043static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044{
5045 int adjust_width;
5046
5047 /* calculate AGAW */
5048 domain->gaw = guest_width;
5049 adjust_width = guestwidth_to_adjustwidth(guest_width);
5050 domain->agaw = width_to_agaw(adjust_width);
5051
5052 domain->iommu_coherency = 0;
5053 domain->iommu_snooping = 0;
5054 domain->iommu_superpage = 0;
5055 domain->max_addr = 0;
5056
5057 /* always allocate the top pgd */
5058 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059 if (!domain->pgd)
5060 return -ENOMEM;
5061 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062 return 0;
5063}
5064
5065static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066{
5067 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069
5070 if (!intel_iommu_strict &&
5071 init_iova_flush_queue(&dmar_domain->iovad,
5072 iommu_flush_iova, iova_entry_free))
5073 pr_info("iova flush queue initialization failed\n");
5074}
5075
5076static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077{
5078 struct dmar_domain *dmar_domain;
5079 struct iommu_domain *domain;
5080
5081 switch (type) {
5082 case IOMMU_DOMAIN_DMA:
5083 case IOMMU_DOMAIN_UNMANAGED:
5084 dmar_domain = alloc_domain(0);
5085 if (!dmar_domain) {
5086 pr_err("Can't allocate dmar_domain\n");
5087 return NULL;
5088 }
5089 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090 pr_err("Domain initialization failed\n");
5091 domain_exit(dmar_domain);
5092 return NULL;
5093 }
5094
5095 if (type == IOMMU_DOMAIN_DMA)
5096 intel_init_iova_domain(dmar_domain);
5097
5098 domain_update_iommu_cap(dmar_domain);
5099
5100 domain = &dmar_domain->domain;
5101 domain->geometry.aperture_start = 0;
5102 domain->geometry.aperture_end =
5103 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104 domain->geometry.force_aperture = true;
5105
5106 return domain;
5107 case IOMMU_DOMAIN_IDENTITY:
5108 return &si_domain->domain;
5109 default:
5110 return NULL;
5111 }
5112
5113 return NULL;
5114}
5115
5116static void intel_iommu_domain_free(struct iommu_domain *domain)
5117{
5118 if (domain != &si_domain->domain)
5119 domain_exit(to_dmar_domain(domain));
5120}
5121
5122/*
5123 * Check whether a @domain could be attached to the @dev through the
5124 * aux-domain attach/detach APIs.
5125 */
5126static inline bool
5127is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128{
5129 struct device_domain_info *info = get_domain_info(dev);
5130
5131 return info && info->auxd_enabled &&
5132 domain->type == IOMMU_DOMAIN_UNMANAGED;
5133}
5134
5135static void auxiliary_link_device(struct dmar_domain *domain,
5136 struct device *dev)
5137{
5138 struct device_domain_info *info = get_domain_info(dev);
5139
5140 assert_spin_locked(&device_domain_lock);
5141 if (WARN_ON(!info))
5142 return;
5143
5144 domain->auxd_refcnt++;
5145 list_add(&domain->auxd, &info->auxiliary_domains);
5146}
5147
5148static void auxiliary_unlink_device(struct dmar_domain *domain,
5149 struct device *dev)
5150{
5151 struct device_domain_info *info = get_domain_info(dev);
5152
5153 assert_spin_locked(&device_domain_lock);
5154 if (WARN_ON(!info))
5155 return;
5156
5157 list_del(&domain->auxd);
5158 domain->auxd_refcnt--;
5159
5160 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161 ioasid_free(domain->default_pasid);
5162}
5163
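/*
 * Attach @dev to an auxiliary domain: allocate the domain's default PASID
 * on first use, attach the domain to the device's IOMMU and install a
 * first- or second-level PASID table entry for that PASID.
 */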
5164static int aux_domain_add_dev(struct dmar_domain *domain,
5165 struct device *dev)
5166{
5167 int ret;
5168 unsigned long flags;
5169 struct intel_iommu *iommu;
5170
5171 iommu = device_to_iommu(dev, NULL, NULL);
5172 if (!iommu)
5173 return -ENODEV;
5174
5175 if (domain->default_pasid <= 0) {
5176 int pasid;
5177
5178 /* No private data needed for the default pasid */
5179 pasid = ioasid_alloc(NULL, PASID_MIN,
5180 pci_max_pasids(to_pci_dev(dev)) - 1,
5181 NULL);
5182 if (pasid == INVALID_IOASID) {
5183 pr_err("Can't allocate default pasid\n");
5184 return -ENODEV;
5185 }
5186 domain->default_pasid = pasid;
5187 }
5188
5189 spin_lock_irqsave(&device_domain_lock, flags);
5190 /*
5191 * iommu->lock must be held to attach domain to iommu and setup the
5192 * pasid entry for second level translation.
5193 */
5194 spin_lock(&iommu->lock);
5195 ret = domain_attach_iommu(domain, iommu);
5196 if (ret)
5197 goto attach_failed;
5198
5199 /* Setup the PASID entry for mediated devices: */
5200 if (domain_use_first_level(domain))
5201 ret = domain_setup_first_level(iommu, domain, dev,
5202 domain->default_pasid);
5203 else
5204 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205 domain->default_pasid);
5206 if (ret)
5207 goto table_failed;
5208 spin_unlock(&iommu->lock);
5209
5210 auxiliary_link_device(domain, dev);
5211
5212 spin_unlock_irqrestore(&device_domain_lock, flags);
5213
5214 return 0;
5215
5216table_failed:
5217 domain_detach_iommu(domain, iommu);
5218attach_failed:
5219 spin_unlock(&iommu->lock);
5220 spin_unlock_irqrestore(&device_domain_lock, flags);
5221 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222 ioasid_free(domain->default_pasid);
5223
5224 return ret;
5225}
5226
5227static void aux_domain_remove_dev(struct dmar_domain *domain,
5228 struct device *dev)
5229{
5230 struct device_domain_info *info;
5231 struct intel_iommu *iommu;
5232 unsigned long flags;
5233
5234 if (!is_aux_domain(dev, &domain->domain))
5235 return;
5236
5237 spin_lock_irqsave(&device_domain_lock, flags);
5238 info = get_domain_info(dev);
5239 iommu = info->iommu;
5240
5241 auxiliary_unlink_device(domain, dev);
5242
5243 spin_lock(&iommu->lock);
5244 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245 domain_detach_iommu(domain, iommu);
5246 spin_unlock(&iommu->lock);
5247
5248 spin_unlock_irqrestore(&device_domain_lock, flags);
5249}
5250
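/*
 * Validate that the IOMMU behind @dev can address everything already mapped
 * in the domain, and trim surplus page-table levels if the domain's AGAW
 * exceeds what this IOMMU supports.
 */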
5251static int prepare_domain_attach_device(struct iommu_domain *domain,
5252 struct device *dev)
5253{
5254 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255 struct intel_iommu *iommu;
5256 int addr_width;
5257
5258 iommu = device_to_iommu(dev, NULL, NULL);
5259 if (!iommu)
5260 return -ENODEV;
5261
5262 /* check if this iommu agaw is sufficient for max mapped address */
5263 addr_width = agaw_to_width(iommu->agaw);
5264 if (addr_width > cap_mgaw(iommu->cap))
5265 addr_width = cap_mgaw(iommu->cap);
5266
5267 if (dmar_domain->max_addr > (1LL << addr_width)) {
5268 dev_err(dev, "%s: iommu width (%d) is not "
5269 "sufficient for the mapped address (%llx)\n",
5270 __func__, addr_width, dmar_domain->max_addr);
5271 return -EFAULT;
5272 }
5273 dmar_domain->gaw = addr_width;
5274
5275 /*
5276 * Knock out extra levels of page tables if necessary
5277 */
5278 while (iommu->agaw < dmar_domain->agaw) {
5279 struct dma_pte *pte;
5280
5281 pte = dmar_domain->pgd;
5282 if (dma_pte_present(pte)) {
5283 dmar_domain->pgd = (struct dma_pte *)
5284 phys_to_virt(dma_pte_addr(pte));
5285 free_pgtable_page(pte);
5286 }
5287 dmar_domain->agaw--;
5288 }
5289
5290 return 0;
5291}
5292
5293static int intel_iommu_attach_device(struct iommu_domain *domain,
5294 struct device *dev)
5295{
5296 int ret;
5297
5298 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299 device_is_rmrr_locked(dev)) {
5300 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5301 return -EPERM;
5302 }
5303
5304 if (is_aux_domain(dev, domain))
5305 return -EPERM;
5306
5307 /* normally dev is not mapped */
5308 if (unlikely(domain_context_mapped(dev))) {
5309 struct dmar_domain *old_domain;
5310
5311 old_domain = find_domain(dev);
5312 if (old_domain)
5313 dmar_remove_one_dev_info(dev);
5314 }
5315
5316 ret = prepare_domain_attach_device(domain, dev);
5317 if (ret)
5318 return ret;
5319
5320 return domain_add_dev_info(to_dmar_domain(domain), dev);
5321}
5322
5323static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324 struct device *dev)
5325{
5326 int ret;
5327
5328 if (!is_aux_domain(dev, domain))
5329 return -EPERM;
5330
5331 ret = prepare_domain_attach_device(domain, dev);
5332 if (ret)
5333 return ret;
5334
5335 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336}
5337
5338static void intel_iommu_detach_device(struct iommu_domain *domain,
5339 struct device *dev)
5340{
5341 dmar_remove_one_dev_info(dev);
5342}
5343
5344static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345 struct device *dev)
5346{
5347 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348}
5349
5350/*
5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352 * VT-d granularity. Invalidation is typically included in the unmap operation
5353 * as a result of a DMA or VFIO unmap. However, for assigned devices the guest
5354 * owns the first-level page tables. Invalidations of translation caches in the
5355 * guest are trapped and passed down to the host.
5356 *
5357 * The vIOMMU in the guest will only expose first-level page tables, therefore
5358 * we do not support IOTLB granularity for requests without PASID (second level).
5359 *
5360 * For example, to find the VT-d granularity encoding for IOTLB
5361 * type and page selective granularity within PASID:
5362 * X: indexed by iommu cache type
5363 * Y: indexed by enum iommu_inv_granularity
5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365 */
5366
5367static const int
5368inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369 /*
5370 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371 * page selective (address granularity)
5372 */
5373 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374 /* PASID based dev TLBs */
5375 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376 /* PASID cache */
5377 {-EINVAL, -EINVAL, -EINVAL}
5378};
5379
5380static inline int to_vtd_granularity(int type, int granu)
5381{
5382 return inv_type_granu_table[type][granu];
5383}
5384
5385static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386{
5387 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388
5389	/* VT-d size is encoded as 2^size of 4K pages, 0 for 4KiB, 9 for 2MiB, etc.
5390	 * The IOMMU cache invalidate API passes granu_size in bytes and the
5391	 * number of granules that are contiguous in memory.
5392 */
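	/* e.g. granu_size = 4KiB and nr_granules = 512 -> nr_pages = 512,
	 * so the returned VT-d size order is 9 (one 2MiB range).
	 */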
5393 return order_base_2(nr_pages);
5394}
5395
5396#ifdef CONFIG_INTEL_IOMMU_SVM
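/*
 * Translate a cache invalidation request passed down from a guest vIOMMU
 * into the corresponding VT-d queued-invalidation operations (PASID-based
 * IOTLB and/or device-TLB flushes) for the nested domain.
 */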
5397static int
5398intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399 struct iommu_cache_invalidate_info *inv_info)
5400{
5401 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402 struct device_domain_info *info;
5403 struct intel_iommu *iommu;
5404 unsigned long flags;
5405 int cache_type;
5406 u8 bus, devfn;
5407 u16 did, sid;
5408 int ret = 0;
5409 u64 size = 0;
5410
5411 if (!inv_info || !dmar_domain ||
5412 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413 return -EINVAL;
5414
5415 if (!dev || !dev_is_pci(dev))
5416 return -ENODEV;
5417
5418 iommu = device_to_iommu(dev, &bus, &devfn);
5419 if (!iommu)
5420 return -ENODEV;
5421
5422 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423 return -EINVAL;
5424
5425 spin_lock_irqsave(&device_domain_lock, flags);
5426 spin_lock(&iommu->lock);
5427 info = get_domain_info(dev);
5428 if (!info) {
5429 ret = -EINVAL;
5430 goto out_unlock;
5431 }
5432 did = dmar_domain->iommu_did[iommu->seq_id];
5433 sid = PCI_DEVID(bus, devfn);
5434
5435 /* Size is only valid in address selective invalidation */
5436 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437 size = to_vtd_size(inv_info->addr_info.granule_size,
5438 inv_info->addr_info.nb_granules);
5439
5440 for_each_set_bit(cache_type,
5441 (unsigned long *)&inv_info->cache,
5442 IOMMU_CACHE_INV_TYPE_NR) {
5443 int granu = 0;
5444 u64 pasid = 0;
5445 u64 addr = 0;
5446
5447 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448 if (granu == -EINVAL) {
5449 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450 cache_type, inv_info->granularity);
5451 break;
5452 }
5453
5454 /*
5455 * PASID is stored in different locations based on the
5456 * granularity.
5457 */
5458 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460 pasid = inv_info->pasid_info.pasid;
5461 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463 pasid = inv_info->addr_info.pasid;
5464
5465 switch (BIT(cache_type)) {
5466 case IOMMU_CACHE_INV_TYPE_IOTLB:
5467 /* HW will ignore LSB bits based on address mask */
5468 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469 size &&
5470 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472 inv_info->addr_info.addr, size);
5473 }
5474
5475 /*
5476 * If granu is PASID-selective, address is ignored.
5477 * We use npages = -1 to indicate that.
5478 */
5479 qi_flush_piotlb(iommu, did, pasid,
5480 mm_to_dma_pfn(inv_info->addr_info.addr),
5481 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483
5484 if (!info->ats_enabled)
5485 break;
5486 /*
5487 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488 * in the guest may assume IOTLB flush is inclusive,
5489 * which is more efficient.
5490 */
5491 fallthrough;
5492 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493 /*
5494 * PASID based device TLB invalidation does not support
5495 * IOMMU_INV_GRANU_PASID granularity but only supports
5496 * IOMMU_INV_GRANU_ADDR.
5497			 * The equivalent here is to set the size to cover the
5498			 * entire 64-bit address range. The user only provides PASID
5499			 * info without address info, so we set addr to 0.
5500 */
5501 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502 size = 64 - VTD_PAGE_SHIFT;
5503 addr = 0;
5504 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505 addr = inv_info->addr_info.addr;
5506 }
5507
5508 if (info->ats_enabled)
5509 qi_flush_dev_iotlb_pasid(iommu, sid,
5510 info->pfsid, pasid,
5511 info->ats_qdep, addr,
5512 size);
5513 else
5514 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515 break;
5516 default:
5517 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518 cache_type);
5519 ret = -EINVAL;
5520 }
5521 }
5522out_unlock:
5523 spin_unlock(&iommu->lock);
5524 spin_unlock_irqrestore(&device_domain_lock, flags);
5525
5526 return ret;
5527}
5528#endif
5529
5530static int intel_iommu_map(struct iommu_domain *domain,
5531 unsigned long iova, phys_addr_t hpa,
5532 size_t size, int iommu_prot, gfp_t gfp)
5533{
5534 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535 u64 max_addr;
5536 int prot = 0;
5537 int ret;
5538
5539 if (iommu_prot & IOMMU_READ)
5540 prot |= DMA_PTE_READ;
5541 if (iommu_prot & IOMMU_WRITE)
5542 prot |= DMA_PTE_WRITE;
5543 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544 prot |= DMA_PTE_SNP;
5545
5546 max_addr = iova + size;
5547 if (dmar_domain->max_addr < max_addr) {
5548 u64 end;
5549
5550 /* check if minimum agaw is sufficient for mapped address */
5551 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552 if (end < max_addr) {
5553 pr_err("%s: iommu width (%d) is not "
5554 "sufficient for the mapped address (%llx)\n",
5555 __func__, dmar_domain->gaw, max_addr);
5556 return -EFAULT;
5557 }
5558 dmar_domain->max_addr = max_addr;
5559 }
5560 /* Round up size to next multiple of PAGE_SIZE, if it and
5561 the low bits of hpa would take us onto the next page */
5562 size = aligned_nrpages(hpa, size);
5563 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564 hpa >> VTD_PAGE_SHIFT, size, prot);
5565 return ret;
5566}
5567
5568static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569 unsigned long iova, size_t size,
5570 struct iommu_iotlb_gather *gather)
5571{
5572 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 struct page *freelist = NULL;
5574 unsigned long start_pfn, last_pfn;
5575 unsigned int npages;
5576 int iommu_id, level = 0;
5577
5578 /* Cope with horrid API which requires us to unmap more than the
5579 size argument if it happens to be a large-page mapping. */
5580 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581
5582 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584
5585 start_pfn = iova >> VTD_PAGE_SHIFT;
5586 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587
5588 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589
5590 npages = last_pfn - start_pfn + 1;
5591
5592 for_each_domain_iommu(iommu_id, dmar_domain)
5593 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594 start_pfn, npages, !freelist, 0);
5595
5596 dma_free_pagelist(freelist);
5597
5598 if (dmar_domain->max_addr == iova + size)
5599 dmar_domain->max_addr = iova;
5600
5601 return size;
5602}
5603
5604static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605 dma_addr_t iova)
5606{
5607 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608 struct dma_pte *pte;
5609 int level = 0;
5610 u64 phys = 0;
5611
5612 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613 if (pte && dma_pte_present(pte))
5614 phys = dma_pte_addr(pte) +
5615 (iova & (BIT_MASK(level_to_offset_bits(level) +
5616 VTD_PAGE_SHIFT) - 1));
5617
5618 return phys;
5619}
5620
5621static inline bool scalable_mode_support(void)
5622{
5623 struct dmar_drhd_unit *drhd;
5624 struct intel_iommu *iommu;
5625 bool ret = true;
5626
5627 rcu_read_lock();
5628 for_each_active_iommu(iommu, drhd) {
5629 if (!sm_supported(iommu)) {
5630 ret = false;
5631 break;
5632 }
5633 }
5634 rcu_read_unlock();
5635
5636 return ret;
5637}
5638
5639static inline bool iommu_pasid_support(void)
5640{
5641 struct dmar_drhd_unit *drhd;
5642 struct intel_iommu *iommu;
5643 bool ret = true;
5644
5645 rcu_read_lock();
5646 for_each_active_iommu(iommu, drhd) {
5647 if (!pasid_supported(iommu)) {
5648 ret = false;
5649 break;
5650 }
5651 }
5652 rcu_read_unlock();
5653
5654 return ret;
5655}
5656
5657static inline bool nested_mode_support(void)
5658{
5659 struct dmar_drhd_unit *drhd;
5660 struct intel_iommu *iommu;
5661 bool ret = true;
5662
5663 rcu_read_lock();
5664 for_each_active_iommu(iommu, drhd) {
5665 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666 ret = false;
5667 break;
5668 }
5669 }
5670 rcu_read_unlock();
5671
5672 return ret;
5673}
5674
5675static bool intel_iommu_capable(enum iommu_cap cap)
5676{
5677 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678 return domain_update_iommu_snooping(NULL) == 1;
5679 if (cap == IOMMU_CAP_INTR_REMAP)
5680 return irq_remapping_enabled == 1;
5681
5682 return false;
5683}
5684
5685static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686{
5687 struct intel_iommu *iommu;
5688
5689 iommu = device_to_iommu(dev, NULL, NULL);
5690 if (!iommu)
5691 return ERR_PTR(-ENODEV);
5692
5693 if (translation_pre_enabled(iommu))
5694 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695
5696 return &iommu->iommu;
5697}
5698
5699static void intel_iommu_release_device(struct device *dev)
5700{
5701 struct intel_iommu *iommu;
5702
5703 iommu = device_to_iommu(dev, NULL, NULL);
5704 if (!iommu)
5705 return;
5706
5707 dmar_remove_one_dev_info(dev);
5708
5709 set_dma_ops(dev, NULL);
5710}
5711
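/*
 * Pick the DMA ops for a newly probed device: untrusted devices get the
 * swiotlb bounce ops, devices in a DMA-API domain get the IOMMU-backed
 * intel_dma_ops, and everything else falls back to direct DMA.
 */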
5712static void intel_iommu_probe_finalize(struct device *dev)
5713{
5714 struct iommu_domain *domain;
5715
5716 domain = iommu_get_domain_for_dev(dev);
5717 if (device_needs_bounce(dev))
5718 set_dma_ops(dev, &bounce_dma_ops);
5719 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720 set_dma_ops(dev, &intel_dma_ops);
5721 else
5722 set_dma_ops(dev, NULL);
5723}
5724
5725static void intel_iommu_get_resv_regions(struct device *device,
5726 struct list_head *head)
5727{
5728 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729 struct iommu_resv_region *reg;
5730 struct dmar_rmrr_unit *rmrr;
5731 struct device *i_dev;
5732 int i;
5733
5734 down_read(&dmar_global_lock);
5735 for_each_rmrr_units(rmrr) {
5736 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737 i, i_dev) {
5738 struct iommu_resv_region *resv;
5739 enum iommu_resv_type type;
5740 size_t length;
5741
5742 if (i_dev != device &&
5743 !is_downstream_to_pci_bridge(device, i_dev))
5744 continue;
5745
5746 length = rmrr->end_address - rmrr->base_address + 1;
5747
5748 type = device_rmrr_is_relaxable(device) ?
5749 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750
5751 resv = iommu_alloc_resv_region(rmrr->base_address,
5752 length, prot, type);
5753 if (!resv)
5754 break;
5755
5756 list_add_tail(&resv->list, head);
5757 }
5758 }
5759 up_read(&dmar_global_lock);
5760
5761#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762 if (dev_is_pci(device)) {
5763 struct pci_dev *pdev = to_pci_dev(device);
5764
5765 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767 IOMMU_RESV_DIRECT_RELAXABLE);
5768 if (reg)
5769				list_add_tail(&reg->list, head);
5770 }
5771 }
5772#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773
5774 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776 0, IOMMU_RESV_MSI);
5777 if (!reg)
5778 return;
5779	list_add_tail(&reg->list, head);
5780}
5781
5782int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783{
5784 struct device_domain_info *info;
5785 struct context_entry *context;
5786 struct dmar_domain *domain;
5787 unsigned long flags;
5788 u64 ctx_lo;
5789 int ret;
5790
5791 domain = find_domain(dev);
5792 if (!domain)
5793 return -EINVAL;
5794
5795 spin_lock_irqsave(&device_domain_lock, flags);
5796 spin_lock(&iommu->lock);
5797
5798 ret = -EINVAL;
5799 info = get_domain_info(dev);
5800 if (!info || !info->pasid_supported)
5801 goto out;
5802
5803 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804 if (WARN_ON(!context))
5805 goto out;
5806
5807 ctx_lo = context[0].lo;
5808
5809 if (!(ctx_lo & CONTEXT_PASIDE)) {
5810 ctx_lo |= CONTEXT_PASIDE;
5811 context[0].lo = ctx_lo;
5812 wmb();
5813 iommu->flush.flush_context(iommu,
5814 domain->iommu_did[iommu->seq_id],
5815 PCI_DEVID(info->bus, info->devfn),
5816 DMA_CCMD_MASK_NOBIT,
5817 DMA_CCMD_DEVICE_INVL);
5818 }
5819
5820 /* Enable PASID support in the device, if it wasn't already */
5821 if (!info->pasid_enabled)
5822 iommu_enable_dev_iotlb(info);
5823
5824 ret = 0;
5825
5826 out:
5827 spin_unlock(&iommu->lock);
5828 spin_unlock_irqrestore(&device_domain_lock, flags);
5829
5830 return ret;
5831}
5832
5833static void intel_iommu_apply_resv_region(struct device *dev,
5834 struct iommu_domain *domain,
5835 struct iommu_resv_region *region)
5836{
5837 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838 unsigned long start, end;
5839
5840 start = IOVA_PFN(region->start);
5841 end = IOVA_PFN(region->start + region->length - 1);
5842
5843 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844}
5845
5846static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847{
5848 if (dev_is_pci(dev))
5849 return pci_device_group(dev);
5850 return generic_device_group(dev);
5851}
5852
5853static int intel_iommu_enable_auxd(struct device *dev)
5854{
5855 struct device_domain_info *info;
5856 struct intel_iommu *iommu;
5857 unsigned long flags;
5858 int ret;
5859
5860 iommu = device_to_iommu(dev, NULL, NULL);
5861 if (!iommu || dmar_disabled)
5862 return -EINVAL;
5863
5864 if (!sm_supported(iommu) || !pasid_supported(iommu))
5865 return -EINVAL;
5866
5867 ret = intel_iommu_enable_pasid(iommu, dev);
5868 if (ret)
5869 return -ENODEV;
5870
5871 spin_lock_irqsave(&device_domain_lock, flags);
5872 info = get_domain_info(dev);
5873 info->auxd_enabled = 1;
5874 spin_unlock_irqrestore(&device_domain_lock, flags);
5875
5876 return 0;
5877}
5878
5879static int intel_iommu_disable_auxd(struct device *dev)
5880{
5881 struct device_domain_info *info;
5882 unsigned long flags;
5883
5884 spin_lock_irqsave(&device_domain_lock, flags);
5885 info = get_domain_info(dev);
5886 if (!WARN_ON(!info))
5887 info->auxd_enabled = 0;
5888 spin_unlock_irqrestore(&device_domain_lock, flags);
5889
5890 return 0;
5891}
5892
5893/*
5894 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5895 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5896 * spec so that system software and tools can detect endpoint devices
5897 * supporting Intel Scalable I/O Virtualization without a host driver dependency.
5898 *
5899 * Returns the address of the matching extended capability structure within
5900 * the device's PCI configuration space or 0 if the device does not support
5901 * it.
5902 */
5903static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904{
5905 int pos;
5906 u16 vendor, id;
5907
5908 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909 while (pos) {
5910 pci_read_config_word(pdev, pos + 4, &vendor);
5911 pci_read_config_word(pdev, pos + 8, &id);
5912 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913 return pos;
5914
5915 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916 }
5917
5918 return 0;
5919}
5920
5921static bool
5922intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923{
5924 if (feat == IOMMU_DEV_FEAT_AUX) {
5925 int ret;
5926
5927 if (!dev_is_pci(dev) || dmar_disabled ||
5928 !scalable_mode_support() || !iommu_pasid_support())
5929 return false;
5930
5931 ret = pci_pasid_features(to_pci_dev(dev));
5932 if (ret < 0)
5933 return false;
5934
5935 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936 }
5937
5938 if (feat == IOMMU_DEV_FEAT_SVA) {
5939 struct device_domain_info *info = get_domain_info(dev);
5940
5941 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942 info->pasid_supported && info->pri_supported &&
5943 info->ats_supported;
5944 }
5945
5946 return false;
5947}
5948
5949static int
5950intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951{
5952 if (feat == IOMMU_DEV_FEAT_AUX)
5953 return intel_iommu_enable_auxd(dev);
5954
5955 if (feat == IOMMU_DEV_FEAT_SVA) {
5956 struct device_domain_info *info = get_domain_info(dev);
5957
5958 if (!info)
5959 return -EINVAL;
5960
5961 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962 return 0;
5963 }
5964
5965 return -ENODEV;
5966}
5967
5968static int
5969intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970{
5971 if (feat == IOMMU_DEV_FEAT_AUX)
5972 return intel_iommu_disable_auxd(dev);
5973
5974 return -ENODEV;
5975}
5976
5977static bool
5978intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979{
5980 struct device_domain_info *info = get_domain_info(dev);
5981
5982 if (feat == IOMMU_DEV_FEAT_AUX)
5983 return scalable_mode_support() && info && info->auxd_enabled;
5984
5985 return false;
5986}
5987
5988static int
5989intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990{
5991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992
5993 return dmar_domain->default_pasid > 0 ?
5994 dmar_domain->default_pasid : -EINVAL;
5995}
5996
5997static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998 struct device *dev)
5999{
6000 return attach_deferred(dev);
6001}
6002
6003static int
6004intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005 enum iommu_attr attr, void *data)
6006{
6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 unsigned long flags;
6009 int ret = 0;
6010
6011 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012 return -EINVAL;
6013
	switch (attr) {
	case DOMAIN_ATTR_NESTING:
		spin_lock_irqsave(&device_domain_lock, flags);
		if (nested_mode_support() &&
		    list_empty(&dmar_domain->devices)) {
			dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
			dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
		} else {
			ret = -ENODEV;
		}
		spin_unlock_irqrestore(&device_domain_lock, flags);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
	if (pdev->untrusted) {
		pci_info(pdev,
			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
			 pdev->vendor, pdev->device);
		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
		return true;
	}
	return false;
}

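/*
 * Callback table registered with the IOMMU core for all Intel VT-d
 * units; the SVM-specific callbacks are only provided when
 * CONFIG_INTEL_IOMMU_SVM is enabled.
 */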
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.domain_set_attr	= intel_iommu_domain_set_attr,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.probe_device		= intel_iommu_probe_device,
	.probe_finalize		= intel_iommu_probe_finalize,
	.release_device		= intel_iommu_release_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= generic_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= intel_iommu_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.def_domain_type	= device_def_domain_type,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
	.cache_invalidate	= intel_iommu_sva_invalidate,
	.sva_bind_gpasid	= intel_svm_bind_gpasid,
	.sva_unbind_gpasid	= intel_svm_unbind_gpasid,
	.sva_bind		= intel_svm_bind,
	.sva_unbind		= intel_svm_unbind,
	.sva_get_pasid		= intel_svm_get_pasid,
	.page_response		= intel_svm_page_response,
#endif
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	if (risky_device(dev))
		return;

	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

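/*
 * GGC is the graphics control register in the host bridge's PCI config
 * space on these Ironlake-era (Calpella) platforms; bits 11:8 are
 * assumed to encode how much stolen memory the BIOS reserved for the
 * GTT and whether a VT-d shadow GTT was provisioned.
 */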
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (risky_device(dev))
		return;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

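/*
 * Some integrated graphics devices are assumed to fail to drain
 * in-flight DMA when translation is turned off, which can leave the
 * driver stuck waiting for the TE (translation enable) status change.
 * Keep translation enabled on the graphics-dedicated IOMMU for the
 * affected parts instead of disabling it.
 */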
static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
	unsigned short ver;

	if (!IS_GFX_DEVICE(dev))
		return;

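	/*
	 * The high byte of the PCI device ID distinguishes the graphics
	 * generation; the values below are assumed to cover the affected
	 * integrated graphics families.
	 */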
	ver = (dev->device >> 8) & 0xff;
	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
	    ver != 0x9a)
		return;

	if (risky_device(dev))
		return;

	pci_info(dev, "Skip IOMMU disabling for graphics\n");
	iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (risky_device(pdev)) {
		pci_dev_put(pdev);
		return;
	}

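	/*
	 * Read the isochronous DMAR control register (offset 0x188 in the
	 * System Management Registers device, assumed to be VTISOCHCTRL).
	 */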
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}