1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/intel-svm.h>
20#include <linux/memory.h>
21#include <linux/pci.h>
22#include <linux/pci-ats.h>
23#include <linux/spinlock.h>
24#include <linux/syscore_ops.h>
25#include <linux/tboot.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define MAX_AGAW_WIDTH 64
49#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
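/*
 * For illustration (VTD_PAGE_SHIFT == 12): with gaw == 48,
 * __DOMAIN_MAX_ADDR() is 2^48 - 1 and __DOMAIN_MAX_PFN() is 2^36 - 1.
 * DOMAIN_MAX_PFN() only differs when that value would not fit in an
 * unsigned long (e.g. a 32-bit build), where it is clamped to ULONG_MAX.
 */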
59
60/* IO virtual address start page frame number */
61#define IOVA_START_PFN (1)
62
63#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65/* page table handling */
66#define LEVEL_STRIDE (9)
67#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
69static inline int agaw_to_level(int agaw)
70{
71 return agaw + 2;
72}
73
74static inline int agaw_to_width(int agaw)
75{
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77}
78
79static inline int width_to_agaw(int width)
80{
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82}
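/*
 * Worked example of the AGAW encoding used by these helpers: each paging
 * level resolves LEVEL_STRIDE == 9 address bits above the 12-bit page
 * offset, so AGAW 1 corresponds to a 39-bit width and a 3-level table,
 * AGAW 2 to 48 bits and 4 levels, and AGAW 3 to 57 bits and 5 levels;
 * e.g. width_to_agaw(48) == 2 and agaw_to_level(2) == 4.
 */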
83
84static inline unsigned int level_to_offset_bits(int level)
85{
86 return (level - 1) * LEVEL_STRIDE;
87}
88
89static inline int pfn_level_offset(u64 pfn, int level)
90{
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92}
93
94static inline u64 level_mask(int level)
95{
96 return -1ULL << level_to_offset_bits(level);
97}
98
99static inline u64 level_size(int level)
100{
101 return 1ULL << level_to_offset_bits(level);
102}
103
104static inline u64 align_to_level(u64 pfn, int level)
105{
106 return (pfn + level_size(level) - 1) & level_mask(level);
107}
108
109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110{
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112}
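/*
 * For reference: a leaf PTE at level 1 maps a single 4KiB VT-d page,
 * level 2 maps 512 pages (2MiB) and level 3 maps 262144 pages (1GiB);
 * level_size()/lvl_to_nr_pages() simply grow by 2^LEVEL_STRIDE per level.
 */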
113
114/* VT-d pages must never be larger than MM pages. Otherwise things
115 are never going to work. */
116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117{
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119}
120static inline unsigned long page_to_dma_pfn(struct page *pg)
121{
122 return mm_to_dma_pfn(page_to_pfn(pg));
123}
124static inline unsigned long virt_to_dma_pfn(void *p)
125{
126 return page_to_dma_pfn(virt_to_page(p));
127}
128
129static void __init check_tylersburg_isoch(void);
130static int rwbf_quirk;
131
132/*
133 * Set to 1 to panic the kernel if VT-d can't be successfully enabled
134 * (used when the kernel is launched w/ TXT).
135 */
136static int force_on = 0;
137static int intel_iommu_tboot_noforce;
138static int no_platform_optin;
139
140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142/*
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144 * if marked present.
145 */
146static phys_addr_t root_entry_lctp(struct root_entry *re)
147{
148 if (!(re->lo & 1))
149 return 0;
150
151 return re->lo & VTD_PAGE_MASK;
152}
153
154/*
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156 * if marked present.
157 */
158static phys_addr_t root_entry_uctp(struct root_entry *re)
159{
160 if (!(re->hi & 1))
161 return 0;
162
163 return re->hi & VTD_PAGE_MASK;
164}
165
166static inline void context_set_present(struct context_entry *context)
167{
168 context->lo |= 1;
169}
170
171static inline void context_set_fault_enable(struct context_entry *context)
172{
173 context->lo &= (((u64)-1) << 2) | 1;
174}
175
176static inline void context_set_translation_type(struct context_entry *context,
177 unsigned long value)
178{
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
181}
182
183static inline void context_set_address_root(struct context_entry *context,
184 unsigned long value)
185{
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
188}
189
190static inline void context_set_address_width(struct context_entry *context,
191 unsigned long value)
192{
193 context->hi |= value & 7;
194}
195
196static inline void context_set_domain_id(struct context_entry *context,
197 unsigned long value)
198{
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
200}
201
202static inline void context_set_pasid(struct context_entry *context)
203{
204 context->lo |= CONTEXT_PASIDE;
205}
206
207static inline int context_domain_id(struct context_entry *c)
208{
209 return((c->hi >> 8) & 0xffff);
210}
211
212static inline void context_clear_entry(struct context_entry *context)
213{
214 context->lo = 0;
215 context->hi = 0;
216}
217
218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219{
220 if (!iommu->copied_tables)
221 return false;
222
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224}
225
226static inline void
227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228{
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230}
231
232static inline void
233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234{
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236}
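/*
 * The copied_tables bitmap is indexed by the 16-bit source-id, i.e.
 * (bus << 8) | devfn; for example bus 0x12, devfn 0x34 maps to bit 0x1234.
 */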
237
238/*
239 * This domain is a static identity mapping domain.
240 * 1. This domain creates a static 1:1 mapping to all usable memory.
241 * 2. It maps to each iommu if successful.
242 * 3. Each iommu maps to this domain if successful.
243 */
244static struct dmar_domain *si_domain;
245static int hw_pass_through = 1;
246
247struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
254};
255
256struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
262};
263
264struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
271};
272
273static LIST_HEAD(dmar_atsr_units);
274static LIST_HEAD(dmar_rmrr_units);
275static LIST_HEAD(dmar_satc_units);
276
277#define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280static void device_block_translation(struct device *dev);
281static void intel_iommu_domain_free(struct iommu_domain *domain);
282
283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285
286int intel_iommu_enabled = 0;
287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288
289static int dmar_map_gfx = 1;
290static int intel_iommu_superpage = 1;
291static int iommu_identity_mapping;
292static int iommu_skip_te_disable;
293
294#define IDENTMAP_GFX 2
295#define IDENTMAP_AZALIA 4
296
297const struct iommu_ops intel_iommu_ops;
298
299static bool translation_pre_enabled(struct intel_iommu *iommu)
300{
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302}
303
304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305{
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307}
308
309static void init_translation_status(struct intel_iommu *iommu)
310{
311 u32 gsts;
312
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316}
317
318static int __init intel_iommu_setup(char *str)
319{
320 if (!str)
321 return -EINVAL;
322
323 while (*str) {
324 if (!strncmp(str, "on", 2)) {
325 dmar_disabled = 0;
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
328 dmar_disabled = 1;
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
332 dmar_map_gfx = 0;
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
345 intel_iommu_sm = 1;
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
348 intel_iommu_sm = 0;
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
352 } else {
353 pr_notice("Unknown option - '%s'\n", str);
354 }
355
356 str += strcspn(str, ",");
357 while (*str == ',')
358 str++;
359 }
360
361 return 1;
362}
363__setup("intel_iommu=", intel_iommu_setup);
364
365void *alloc_pgtable_page(int node)
366{
367 struct page *page;
368 void *vaddr = NULL;
369
370 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
371 if (page)
372 vaddr = page_address(page);
373 return vaddr;
374}
375
376void free_pgtable_page(void *vaddr)
377{
378 free_page((unsigned long)vaddr);
379}
380
381static inline int domain_type_is_si(struct dmar_domain *domain)
382{
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384}
385
386static inline int domain_pfn_supported(struct dmar_domain *domain,
387 unsigned long pfn)
388{
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392}
393
394/*
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
398 */
399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400{
401 unsigned long fl_sagaw, sl_sagaw;
402
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
405
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 return sl_sagaw;
409
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
412 return fl_sagaw;
413
414 return fl_sagaw & sl_sagaw;
415}
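/*
 * Illustrative example: a scalable-mode IOMMU supporting both translation
 * types that reports SAGAW == BIT(2) (4-level) and supports 5-level
 * first-level paging gives fl_sagaw == BIT(2) | BIT(3) and
 * sl_sagaw == BIT(2); the intersection BIT(2) limits domains to 4-level
 * (48-bit) page tables usable by both first- and second-level translation.
 */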
416
417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418{
419 unsigned long sagaw;
420 int agaw;
421
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
425 break;
426 }
427
428 return agaw;
429}
430
431/*
432 * Calculate max SAGAW for each iommu.
433 */
434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435{
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437}
438
439/*
440 * Calculate the agaw for each iommu.
441 * "SAGAW" may be different across iommus, so use a default agaw and fall
442 * back to a smaller supported agaw for iommus that don't support it.
443 */
444int iommu_calculate_agaw(struct intel_iommu *iommu)
445{
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447}
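/*
 * E.g. with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57 the search starts at
 * AGAW 3 (5-level, 57-bit) and walks down through AGAW 2 and lower until
 * a bit set in the calculated SAGAW is found; -1 is returned if none of
 * them is supported.
 */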
448
449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450{
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453}
454
455static void domain_update_iommu_coherency(struct dmar_domain *domain)
456{
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
460 bool found = false;
461 unsigned long i;
462
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
465 found = true;
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
468 break;
469 }
470 }
471 if (found)
472 return;
473
474 /* No hardware attached; use lowest common denominator */
475 rcu_read_lock();
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
479 break;
480 }
481 }
482 rcu_read_unlock();
483}
484
485static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
487{
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
490 int mask = 0x3;
491
492 if (!intel_iommu_superpage)
493 return 0;
494
495 /* set iommu_superpage to the smallest common denominator */
496 rcu_read_lock();
497 for_each_active_iommu(iommu, drhd) {
498 if (iommu != skip) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
501 mask = 0x1;
502 } else {
503 mask &= cap_super_page_val(iommu->cap);
504 }
505
506 if (!mask)
507 break;
508 }
509 }
510 rcu_read_unlock();
511
512 return fls(mask);
513}
514
515static int domain_update_device_node(struct dmar_domain *domain)
516{
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
519 unsigned long flags;
520
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
523 /*
524 * There could possibly be multiple device numa nodes, as devices
525 * within the same domain may sit behind different IOMMUs. There
526 * isn't a perfect answer in such a situation, so we select a
527 * first-come, first-served policy.
528 */
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
531 break;
532 }
533 spin_unlock_irqrestore(&domain->lock, flags);
534
535 return nid;
536}
537
538static void domain_update_iotlb(struct dmar_domain *domain);
539
540/* Return the super pagesize bitmap if supported. */
541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542{
543 unsigned long bitmap = 0;
544
545 /*
546 * 1-level super page supports page size of 2MiB, 2-level super page
547 * supports page size of both 2MiB and 1GiB.
548 */
549 if (domain->iommu_superpage == 1)
550 bitmap |= SZ_2M;
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
553
554 return bitmap;
555}
556
557/* Some capabilities may be different across iommus */
558static void domain_update_iommu_cap(struct dmar_domain *domain)
559{
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562
563 /*
564 * If RHSA is missing, we should default to the device numa domain
565 * as a fallback.
566 */
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
569
570 /*
571 * First-level translation restricts the input-address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48-bits with 4-level
574 * paging and 57-bits with 5-level paging). Hence, skip bit
575 * [N-1].
576 */
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 else
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
584}
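/*
 * Example of the aperture adjustment above: for a 48-bit domain gaw the
 * aperture ends at 2^48 - 1 with second-level translation, but at
 * 2^47 - 1 with first-level translation, since bit 47 is the canonical
 * (sign) bit for 4-level paging.
 */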
585
586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 u8 devfn, int alloc)
588{
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
591 u64 *entry;
592
593 /*
594 * Unless the caller requested to allocate a new entry,
595 * returning a copied context entry makes no sense.
596 */
597 if (!alloc && context_copied(iommu, bus, devfn))
598 return NULL;
599
600 entry = &root->lo;
601 if (sm_supported(iommu)) {
602 if (devfn >= 0x80) {
603 devfn -= 0x80;
604 entry = &root->hi;
605 }
606 devfn *= 2;
607 }
608 if (*entry & 1)
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 else {
611 unsigned long phy_addr;
612 if (!alloc)
613 return NULL;
614
615 context = alloc_pgtable_page(iommu->node);
616 if (!context)
617 return NULL;
618
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
623 }
624 return &context[devfn];
625}
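/*
 * In scalable mode a root entry provides two context-table pointers:
 * root->lo covers devfn 0x00-0x7f and root->hi covers devfn 0x80-0xff,
 * and a scalable-mode context entry is twice the size of a legacy one,
 * hence the devfn *= 2 above. E.g. devfn 0x85 selects the upper table
 * and lands at index 0x0a of the context_entry array.
 */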
626
627/**
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
632 *
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634 */
635static bool
636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637{
638 struct pci_dev *pdev, *pbridge;
639
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 return false;
642
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
645
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 return true;
650
651 return false;
652}
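/*
 * E.g. if the bus range behind @bridge is [0x04, 0x07]
 * (subordinate->number == 0x04, busn_res.end == 0x07), a @dev sitting on
 * bus 0x05 is reported as downstream of it.
 */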
653
654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655{
656 struct dmar_drhd_unit *drhd;
657 u32 vtbar;
658 int rc;
659
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
664 */
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 if (rc) {
667 /* "can't" happen */
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 return false;
670 }
671 vtbar &= 0xffff0000;
672
673 /* we know that this iommu should be at offset 0xa000 from vtbar */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 return true;
679 }
680
681 return false;
682}
683
684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685{
686 if (!iommu || iommu->drhd->ignored)
687 return true;
688
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
691
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
695 return true;
696 }
697
698 return false;
699}
700
701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702{
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
706 struct device *tmp;
707 u16 segment = 0;
708 int i;
709
710 if (!dev)
711 return NULL;
712
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
715
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
717
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
721 dev = &pf_pdev->dev;
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
725
726 rcu_read_lock();
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
729 continue;
730
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
733 if (tmp == dev) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
739 goto got_pdev;
740
741 if (bus && devfn) {
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
744 }
745 goto out;
746 }
747
748 if (is_downstream_to_pci_bridge(dev, tmp))
749 goto got_pdev;
750 }
751
752 if (pdev && drhd->include_all) {
753got_pdev:
754 if (bus && devfn) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
757 }
758 goto out;
759 }
760 }
761 iommu = NULL;
762out:
763 if (iommu_is_dummy(iommu, dev))
764 iommu = NULL;
765
766 rcu_read_unlock();
767
768 return iommu;
769}
770
771static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
773{
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
776}
777
778static void free_context_table(struct intel_iommu *iommu)
779{
780 struct context_entry *context;
781 int i;
782
783 if (!iommu->root_entry)
784 return;
785
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
788 if (context)
789 free_pgtable_page(context);
790
791 if (!sm_supported(iommu))
792 continue;
793
794 context = iommu_context_addr(iommu, i, 0x80, 0);
795 if (context)
796 free_pgtable_page(context);
797 }
798
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
801}
802
803#ifdef CONFIG_DMAR_DEBUG
804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806{
807 struct dma_pte *pte;
808 int offset;
809
810 while (1) {
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
815 break;
816 }
817
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819
820 if (level == 1)
821 break;
822
823 parent = phys_to_virt(dma_pte_addr(pte));
824 level--;
825 }
826}
827
828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
830{
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
839
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
844 if (!rt_entry) {
845 pr_info("root table entry is not present\n");
846 return;
847 }
848
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
852 else
853 pr_info("root entry: 0x%016llx", rt_entry->lo);
854
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 if (!ctx_entry) {
858 pr_info("context table entry is not present\n");
859 return;
860 }
861
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
864
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 goto pgtable_walk;
870 }
871
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 if (!dir) {
875 pr_info("pasid directory entry is not present\n");
876 return;
877 }
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
881
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
888 if (!entries) {
889 pr_info("pasid table entry is not present\n");
890 return;
891 }
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 } else {
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 }
904
905pgtable_walk:
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907}
908#endif
909
910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912{
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain->use_first_level)
946 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947
948 if (cmpxchg64(&pte->val, 0ULL, pteval))
949 /* Someone else set it while we were thinking; use theirs. */
950 free_pgtable_page(tmp_page);
951 else
952 domain_flush_cache(domain, pte, sizeof(*pte));
953 }
954 if (level == 1)
955 break;
956
957 parent = phys_to_virt(dma_pte_addr(pte));
958 level--;
959 }
960
961 if (!*target_level)
962 *target_level = level;
963
964 return pte;
965}
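/*
 * Note on the walk above: a *target_level of 0 means "lookup only"; the
 * walk stops at the first superpage or non-present entry and reports the
 * level it reached back through *target_level. A non-zero *target_level
 * requests (allocating intermediate tables as needed) the PTE at exactly
 * that level, e.g. for installing 2MiB/1GiB superpage mappings.
 */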
966
967/* return address's pte at specific level */
968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 unsigned long pfn,
970 int level, int *large_page)
971{
972 struct dma_pte *parent, *pte;
973 int total = agaw_to_level(domain->agaw);
974 int offset;
975
976 parent = domain->pgd;
977 while (level <= total) {
978 offset = pfn_level_offset(pfn, total);
979 pte = &parent[offset];
980 if (level == total)
981 return pte;
982
983 if (!dma_pte_present(pte)) {
984 *large_page = total;
985 break;
986 }
987
988 if (dma_pte_superpage(pte)) {
989 *large_page = total;
990 return pte;
991 }
992
993 parent = phys_to_virt(dma_pte_addr(pte));
994 total--;
995 }
996 return NULL;
997}
998
999/* clear last level pte, a tlb flush should follow */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001 unsigned long start_pfn,
1002 unsigned long last_pfn)
1003{
1004 unsigned int large_page;
1005 struct dma_pte *first_pte, *pte;
1006
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1010
1011 /* we don't need lock here; nobody else touches the iova range */
1012 do {
1013 large_page = 1;
1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 if (!pte) {
1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 continue;
1018 }
1019 do {
1020 dma_clear_pte(pte);
1021 start_pfn += lvl_to_nr_pages(large_page);
1022 pte++;
1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025 domain_flush_cache(domain, first_pte,
1026 (void *)pte - (void *)first_pte);
1027
1028 } while (start_pfn && start_pfn <= last_pfn);
1029}
1030
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 int retain_level, struct dma_pte *pte,
1033 unsigned long pfn, unsigned long start_pfn,
1034 unsigned long last_pfn)
1035{
1036 pfn = max(start_pfn, pfn);
1037 pte = &pte[pfn_level_offset(pfn, level)];
1038
1039 do {
1040 unsigned long level_pfn;
1041 struct dma_pte *level_pte;
1042
1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 goto next;
1045
1046 level_pfn = pfn & level_mask(level);
1047 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049 if (level > 2) {
1050 dma_pte_free_level(domain, level - 1, retain_level,
1051 level_pte, level_pfn, start_pfn,
1052 last_pfn);
1053 }
1054
1055 /*
1056 * Free the page table if we're below the level we want to
1057 * retain and the range covers the entire table.
1058 */
1059 if (level < retain_level && !(start_pfn > level_pfn ||
1060 last_pfn < level_pfn + level_size(level) - 1)) {
1061 dma_clear_pte(pte);
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 free_pgtable_page(level_pte);
1064 }
1065next:
1066 pfn += level_size(level);
1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn,
1077 int retain_level)
1078{
1079 dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081 /* We don't need lock here; nobody else touches the iova range */
1082 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 domain->pgd, 0, start_pfn, last_pfn);
1084
1085 /* free pgd */
1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 free_pgtable_page(domain->pgd);
1088 domain->pgd = NULL;
1089 }
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093 need to *modify* it at all. All we need to do is make a list of all the
1094 pages which can be freed just as soon as we've flushed the IOTLB and we
1095 know the hardware page-walk will no longer touch them.
1096 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097 be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 int level, struct dma_pte *pte,
1100 struct list_head *freelist)
1101{
1102 struct page *pg;
1103
1104 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 list_add_tail(&pg->lru, freelist);
1106
1107 if (level == 1)
1108 return;
1109
1110 pte = page_address(pg);
1111 do {
1112 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 pte++;
1115 } while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 struct dma_pte *pte, unsigned long pfn,
1120 unsigned long start_pfn, unsigned long last_pfn,
1121 struct list_head *freelist)
1122{
1123 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125 pfn = max(start_pfn, pfn);
1126 pte = &pte[pfn_level_offset(pfn, level)];
1127
1128 do {
1129 unsigned long level_pfn = pfn & level_mask(level);
1130
1131 if (!dma_pte_present(pte))
1132 goto next;
1133
1134 /* If range covers entire pagetable, free it */
1135 if (start_pfn <= level_pfn &&
1136 last_pfn >= level_pfn + level_size(level) - 1) {
1137 /* These subordinate page tables are going away entirely. Don't
1138 bother to clear them; we're just going to *free* them. */
1139 if (level > 1 && !dma_pte_superpage(pte))
1140 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142 dma_clear_pte(pte);
1143 if (!first_pte)
1144 first_pte = pte;
1145 last_pte = pte;
1146 } else if (level > 1) {
1147 /* Recurse down into a level that isn't *entirely* obsolete */
1148 dma_pte_clear_level(domain, level - 1,
1149 phys_to_virt(dma_pte_addr(pte)),
1150 level_pfn, start_pfn, last_pfn,
1151 freelist);
1152 }
1153next:
1154 pfn = level_pfn + level_size(level);
1155 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157 if (first_pte)
1158 domain_flush_cache(domain, first_pte,
1159 (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163 the page tables, and may have cached the intermediate levels. The
1164 pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 unsigned long last_pfn, struct list_head *freelist)
1167{
1168 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170 BUG_ON(start_pfn > last_pfn);
1171
1172 /* we don't need lock here; nobody else touches the iova range */
1173 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176 /* free pgd */
1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 struct page *pgd_page = virt_to_page(domain->pgd);
1179 list_add_tail(&pgd_page->lru, freelist);
1180 domain->pgd = NULL;
1181 }
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187 struct root_entry *root;
1188
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 if (!root) {
1191 pr_err("Allocating root entry for %s failed\n",
1192 iommu->name);
1193 return -ENOMEM;
1194 }
1195
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 iommu->root_entry = root;
1198
1199 return 0;
1200}
1201
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204 u64 addr;
1205 u32 sts;
1206 unsigned long flag;
1207
1208 addr = virt_to_phys(iommu->root_entry);
1209 if (sm_supported(iommu))
1210 addr |= DMA_RTADDR_SMT;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
1217 /* Make sure hardware complete it */
1218 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223 /*
1224 * Hardware invalidates all DMA remapping hardware translation
1225 * caches as part of SRTP flow.
1226 */
1227 if (cap_esrtps(iommu->cap))
1228 return;
1229
1230 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 if (sm_supported(iommu))
1232 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238 u32 val;
1239 unsigned long flag;
1240
1241 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
1247 /* Make sure hardware complete it */
1248 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
1254/* return value determines if we need a write buffer flush */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256 u16 did, u16 source_id, u8 function_mask,
1257 u64 type)
1258{
1259 u64 val = 0;
1260 unsigned long flag;
1261
1262 switch (type) {
1263 case DMA_CCMD_GLOBAL_INVL:
1264 val = DMA_CCMD_GLOBAL_INVL;
1265 break;
1266 case DMA_CCMD_DOMAIN_INVL:
1267 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 break;
1269 case DMA_CCMD_DEVICE_INVL:
1270 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 break;
1273 default:
1274 BUG();
1275 }
1276 val |= DMA_CCMD_ICC;
1277
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
1281 /* Make sure hardware complete it */
1282 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
1288/* return value determines if we need a write buffer flush */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290 u64 addr, unsigned int size_order, u64 type)
1291{
1292 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293 u64 val = 0, val_iva = 0;
1294 unsigned long flag;
1295
1296 switch (type) {
1297 case DMA_TLB_GLOBAL_FLUSH:
1298 /* global flush doesn't need to set IVA_REG */
1299 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300 break;
1301 case DMA_TLB_DSI_FLUSH:
1302 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 break;
1304 case DMA_TLB_PSI_FLUSH:
1305 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 /* IH bit is passed in as part of address */
1307 val_iva = size_order | addr;
1308 break;
1309 default:
1310 BUG();
1311 }
1312 /* Note: set drain read/write */
1313#if 0
1314 /*
1315 * This is probably here to be extra safe. It looks like we can
1316 * ignore it without any impact.
1317 */
1318 if (cap_read_drain(iommu->cap))
1319 val |= DMA_TLB_READ_DRAIN;
1320#endif
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330 /* Make sure hardware complete it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349 struct device_domain_info *info;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1357 return info;
1358 }
1359 }
1360 spin_unlock_irqrestore(&domain->lock, flags);
1361
1362 return NULL;
1363}
1364
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378 domain->has_iotlb_device = has_iotlb_device;
1379 spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392 return false;
1393
1394 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395 return false;
1396
1397 return true;
1398}
1399
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402 struct pci_dev *pdev;
1403
1404 if (!dev_is_pci(info->dev))
1405 return;
1406
1407 pdev = to_pci_dev(info->dev);
1408 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1409 * the PFSID to the invalidation descriptor of a VF so that IOMMU HW can
1410 * gauge queue depth at the PF level. If DIT is not set, PFSID is treated
1411 * as reserved and should be set to 0.
1412 */
1413 if (!ecap_dit(info->iommu->ecap))
1414 info->pfsid = 0;
1415 else {
1416 struct pci_dev *pf_pdev;
1417
1418 /* pdev will be returned if device is not a vf */
1419 pf_pdev = pci_physfn(pdev);
1420 info->pfsid = pci_dev_id(pf_pdev);
1421 }
1422
1423 /* The PCIe spec, in its wisdom, declares that the behaviour of the
1424 device is undefined if you enable PASID support after ATS support.
1425 So always enable PASID support on devices which
1426 have it, even if we can't yet know if we're ever going to
1427 use it. */
1428 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429 info->pasid_enabled = 1;
1430
1431 if (info->pri_supported &&
1432 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1433 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434 info->pri_enabled = 1;
1435
1436 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438 info->ats_enabled = 1;
1439 domain_update_iotlb(info->domain);
1440 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 }
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446 struct pci_dev *pdev;
1447
1448 if (!dev_is_pci(info->dev))
1449 return;
1450
1451 pdev = to_pci_dev(info->dev);
1452
1453 if (info->ats_enabled) {
1454 pci_disable_ats(pdev);
1455 info->ats_enabled = 0;
1456 domain_update_iotlb(info->domain);
1457 }
1458
1459 if (info->pri_enabled) {
1460 pci_disable_pri(pdev);
1461 info->pri_enabled = 0;
1462 }
1463
1464 if (info->pasid_enabled) {
1465 pci_disable_pasid(pdev);
1466 info->pasid_enabled = 0;
1467 }
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471 u64 addr, unsigned int mask)
1472{
1473 u16 sid, qdep;
1474
1475 if (!info || !info->ats_enabled)
1476 return;
1477
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1482 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 u64 addr, unsigned mask)
1487{
1488 struct device_domain_info *info;
1489 unsigned long flags;
1490
1491 if (!domain->has_iotlb_device)
1492 return;
1493
1494 spin_lock_irqsave(&domain->lock, flags);
1495 list_for_each_entry(info, &domain->devices, link)
1496 __iommu_flush_dev_iotlb(info, addr, mask);
1497 spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 struct dmar_domain *domain,
1502 unsigned long pfn, unsigned int pages,
1503 int ih, int map)
1504{
1505 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 unsigned int mask = ilog2(aligned_pages);
1507 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 u16 did = domain_id_iommu(domain, iommu);
1509
1510 BUG_ON(pages == 0);
1511
1512 if (ih)
1513 ih = 1 << 6;
1514
1515 if (domain->use_first_level) {
1516 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 } else {
1518 unsigned long bitmask = aligned_pages - 1;
1519
1520 /*
1521 * PSI masks the low order bits of the base address. If the
1522 * address isn't aligned to the mask, then compute a mask value
1523 * needed to ensure the target range is flushed.
1524 */
1525 if (unlikely(bitmask & pfn)) {
1526 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528 /*
1529 * Since end_pfn <= pfn + bitmask, the only way bits
1530 * higher than bitmask can differ in pfn and end_pfn is
1531 * by carrying. This means after masking out bitmask,
1532 * high bits starting with the first set bit in
1533 * shared_bits are all equal in both pfn and end_pfn.
1534 */
1535 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 }
1538
1539 /*
1540 * Fall back to domain selective flush if no PSI support or
1541 * the size is too big.
1542 */
1543 if (!cap_pgsel_inv(iommu->cap) ||
1544 mask > cap_max_amask_val(iommu->cap))
1545 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 DMA_TLB_DSI_FLUSH);
1547 else
1548 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 DMA_TLB_PSI_FLUSH);
1550 }
1551
1552 /*
1553 * In caching mode, changes of pages from non-present to present require
1554 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1555 */
1556 if (!cap_caching_mode(iommu->cap) || !map)
1557 iommu_flush_dev_iotlb(domain, addr, mask);
1558}
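/*
 * Worked example of the mask adjustment above: pfn == 0x3 and pages == 2
 * give aligned_pages == 2 and bitmask == 1. The base is not 2-page
 * aligned, so end_pfn == 0x4, pfn ^ end_pfn == 0x7, shared_bits ==
 * ~0x7 & ~0x1, and __ffs() yields mask == 3: the PSI then covers the
 * naturally aligned 8-page region pfn 0x0-0x7, the smallest such region
 * containing pfns 0x3-0x4.
 */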
1559
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 struct dmar_domain *domain,
1563 unsigned long pfn, unsigned int pages)
1564{
1565 /*
1566 * It's a non-present to present mapping. Only flush if caching mode
1567 * and second level.
1568 */
1569 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 else
1572 iommu_flush_write_buffer(iommu);
1573}
1574
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 struct iommu_domain_info *info;
1579 unsigned long idx;
1580
1581 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 struct intel_iommu *iommu = info->iommu;
1583 u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585 if (dmar_domain->use_first_level)
1586 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 else
1588 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 DMA_TLB_DSI_FLUSH);
1590
1591 if (!cap_caching_mode(iommu->cap))
1592 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 }
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598 u32 pmen;
1599 unsigned long flags;
1600
1601 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 return;
1603
1604 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 pmen &= ~DMA_PMEN_EPM;
1607 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609 /* wait for the protected region status bit to clear */
1610 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618 u32 sts;
1619 unsigned long flags;
1620
1621 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 iommu->gcmd |= DMA_GCMD_TE;
1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625 /* Make sure hardware complete it */
1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 readl, (sts & DMA_GSTS_TES), sts);
1628
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634 u32 sts;
1635 unsigned long flag;
1636
1637 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 return;
1640
1641 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 iommu->gcmd &= ~DMA_GCMD_TE;
1643 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645 /* Make sure hardware complete it */
1646 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654 u32 ndomains;
1655
1656 ndomains = cap_ndoms(iommu->cap);
1657 pr_debug("%s: Number of Domains supported <%d>\n",
1658 iommu->name, ndomains);
1659
1660 spin_lock_init(&iommu->lock);
1661
1662 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 if (!iommu->domain_ids)
1664 return -ENOMEM;
1665
1666 /*
1667 * If Caching mode is set, then invalid translations are tagged
1668 * with domain-id 0, hence we need to pre-allocate it. We also
1669 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 * make sure it is not used for a real domain.
1671 */
1672 set_bit(0, iommu->domain_ids);
1673
1674 /*
1675 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1676 * entry for first-level or pass-through translation modes be
1677 * programmed with a domain id different from those used for
1678 * second-level or nested translation. We reserve a domain id for
1679 * this purpose.
1680 */
1681 if (sm_supported(iommu))
1682 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684 return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689 if (!iommu->domain_ids)
1690 return;
1691
1692 /*
1693 * All iommu domains must have been detached from the devices,
1694 * hence there should be no domain IDs in use.
1695 */
1696 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 > NUM_RESERVED_DID))
1698 return;
1699
1700 if (iommu->gcmd & DMA_GCMD_TE)
1701 iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706 if (iommu->domain_ids) {
1707 bitmap_free(iommu->domain_ids);
1708 iommu->domain_ids = NULL;
1709 }
1710
1711 if (iommu->copied_tables) {
1712 bitmap_free(iommu->copied_tables);
1713 iommu->copied_tables = NULL;
1714 }
1715
1716 /* free context mapping */
1717 free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720 if (pasid_supported(iommu)) {
1721 if (ecap_prs(iommu->ecap))
1722 intel_svm_finish_prq(iommu);
1723 }
1724 if (vccap_pasid(iommu->vccap))
1725 ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736 /* Only SL is available in legacy mode */
1737 if (!scalable_mode_support())
1738 return false;
1739
1740 /* Only one level (either FL or SL) is available, just use it */
1741 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 return intel_cap_flts_sanity();
1743
1744 /* Both levels are available, decide it based on domain type */
1745 return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750 struct dmar_domain *domain;
1751
1752 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 if (!domain)
1754 return NULL;
1755
1756 domain->nid = NUMA_NO_NODE;
1757 if (first_level_by_default(type))
1758 domain->use_first_level = true;
1759 domain->has_iotlb_device = false;
1760 INIT_LIST_HEAD(&domain->devices);
1761 spin_lock_init(&domain->lock);
1762 xa_init(&domain->iommu_array);
1763
1764 return domain;
1765}
1766
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768 struct intel_iommu *iommu)
1769{
1770 struct iommu_domain_info *info, *curr;
1771 unsigned long ndomains;
1772 int num, ret = -ENOSPC;
1773
1774 info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 if (!info)
1776 return -ENOMEM;
1777
1778 spin_lock(&iommu->lock);
1779 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 if (curr) {
1781 curr->refcnt++;
1782 spin_unlock(&iommu->lock);
1783 kfree(info);
1784 return 0;
1785 }
1786
1787 ndomains = cap_ndoms(iommu->cap);
1788 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 if (num >= ndomains) {
1790 pr_err("%s: No free domain ids\n", iommu->name);
1791 goto err_unlock;
1792 }
1793
1794 set_bit(num, iommu->domain_ids);
1795 info->refcnt = 1;
1796 info->did = num;
1797 info->iommu = iommu;
1798 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 NULL, info, GFP_ATOMIC);
1800 if (curr) {
1801 ret = xa_err(curr) ? : -EBUSY;
1802 goto err_clear;
1803 }
1804 domain_update_iommu_cap(domain);
1805
1806 spin_unlock(&iommu->lock);
1807 return 0;
1808
1809err_clear:
1810 clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812 spin_unlock(&iommu->lock);
1813 kfree(info);
1814 return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818 struct intel_iommu *iommu)
1819{
1820 struct iommu_domain_info *info;
1821
1822 spin_lock(&iommu->lock);
1823 info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 if (--info->refcnt == 0) {
1825 clear_bit(info->did, iommu->domain_ids);
1826 xa_erase(&domain->iommu_array, iommu->seq_id);
1827 domain->nid = NUMA_NO_NODE;
1828 domain_update_iommu_cap(domain);
1829 kfree(info);
1830 }
1831 spin_unlock(&iommu->lock);
1832}
1833
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836 int agaw;
1837 int r = (gaw - 12) % 9;
1838
1839 if (r == 0)
1840 agaw = gaw;
1841 else
1842 agaw = gaw + 9 - r;
1843 if (agaw > 64)
1844 agaw = 64;
1845 return agaw;
1846}
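/*
 * E.g. a guest width of 48 already sits on a 9-bit level boundary and is
 * returned unchanged, while 50 is rounded up to the next supported
 * width, 57; anything that would round past 64 is clamped to 64.
 */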
1847
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850 if (domain->pgd) {
1851 LIST_HEAD(freelist);
1852
1853 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 put_pages_list(&freelist);
1855 }
1856
1857 if (WARN_ON(!list_empty(&domain->devices)))
1858 return;
1859
1860 kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for a scalable mode context entry.
1865 * A value of X in the PDTS field of a scalable mode context entry
1866 * indicates a PASID directory with 2^(X + 7) entries.
1867 */
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870 unsigned long pds, max_pde;
1871
1872 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 if (pds < 7)
1875 return 0;
1876
1877 return pds - 7;
1878}
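/*
 * For example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
 * directory entry): a table covering the full 20-bit PASID space has
 * max_pasid == 1 << 20, so max_pde == 1 << 14 and pds == 14 - 7 == 7,
 * encoding a PASID directory with 2^(7 + 7) == 16384 entries.
 */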
1879
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888 context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897 context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906 context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds) (((pds) & 0x7) << 9)
1911
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913 struct intel_iommu *iommu,
1914 struct pasid_table *table,
1915 u8 bus, u8 devfn)
1916{
1917 struct device_domain_info *info =
1918 domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 u16 did = domain_id_iommu(domain, iommu);
1920 int translation = CONTEXT_TT_MULTI_LEVEL;
1921 struct context_entry *context;
1922 int ret;
1923
1924 WARN_ON(did == 0);
1925
1926 if (hw_pass_through && domain_type_is_si(domain))
1927 translation = CONTEXT_TT_PASS_THROUGH;
1928
1929 pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932 BUG_ON(!domain->pgd);
1933
1934 spin_lock(&iommu->lock);
1935 ret = -ENOMEM;
1936 context = iommu_context_addr(iommu, bus, devfn, 1);
1937 if (!context)
1938 goto out_unlock;
1939
1940 ret = 0;
1941 if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 goto out_unlock;
1943
1944 /*
1945 * For kdump cases, old valid entries may be cached due to the
1946 * in-flight DMA and copied pgtable, but there is no unmapping
1947 * behaviour for them, thus we need an explicit cache flush for
1948 * the newly-mapped device. For kdump, at this point, the device
1949 * is supposed to have finished reset at its driver probe stage, so no
1950 * in-flight DMA will exist, and we don't need to worry about it
1951 * hereafter.
1952 */
1953 if (context_copied(iommu, bus, devfn)) {
1954 u16 did_old = context_domain_id(context);
1955
1956 if (did_old < cap_ndoms(iommu->cap)) {
1957 iommu->flush.flush_context(iommu, did_old,
1958 (((u16)bus) << 8) | devfn,
1959 DMA_CCMD_MASK_NOBIT,
1960 DMA_CCMD_DEVICE_INVL);
1961 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 DMA_TLB_DSI_FLUSH);
1963 }
1964
1965 clear_context_copied(iommu, bus, devfn);
1966 }
1967
1968 context_clear_entry(context);
1969
1970 if (sm_supported(iommu)) {
1971 unsigned long pds;
1972
1973 WARN_ON(!table);
1974
1975 /* Setup the PASID DIR pointer: */
1976 pds = context_get_sm_pds(table);
1977 context->lo = (u64)virt_to_phys(table->table) |
1978 context_pdts(pds);
1979
1980 /* Setup the RID_PASID field: */
1981 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983 /*
1984 * Setup the Device-TLB enable bit and Page request
1985 * Enable bit:
1986 */
1987 if (info && info->ats_supported)
1988 context_set_sm_dte(context);
1989 if (info && info->pri_supported)
1990 context_set_sm_pre(context);
1991 if (info && info->pasid_supported)
1992 context_set_pasid(context);
1993 } else {
1994 struct dma_pte *pgd = domain->pgd;
1995 int agaw;
1996
1997 context_set_domain_id(context, did);
1998
1999 if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 /*
2001			 * Skip top levels of page tables for an IOMMU which has
2002			 * a smaller agaw than the default. Unnecessary for PT mode.
2003 */
2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 ret = -ENOMEM;
2006 pgd = phys_to_virt(dma_pte_addr(pgd));
2007 if (!dma_pte_present(pgd))
2008 goto out_unlock;
2009 }
2010
2011 if (info && info->ats_supported)
2012 translation = CONTEXT_TT_DEV_IOTLB;
2013 else
2014 translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016 context_set_address_root(context, virt_to_phys(pgd));
2017 context_set_address_width(context, agaw);
2018 } else {
2019 /*
2020 * In pass through mode, AW must be programmed to
2021 * indicate the largest AGAW value supported by
2022 * hardware. And ASR is ignored by hardware.
2023 */
2024 context_set_address_width(context, iommu->msagaw);
2025 }
2026
2027 context_set_translation_type(context, translation);
2028 }
2029
2030 context_set_fault_enable(context);
2031 context_set_present(context);
2032 if (!ecap_coherent(iommu->ecap))
2033 clflush_cache_range(context, sizeof(*context));
2034
2035 /*
2036 * It's a non-present to present mapping. If hardware doesn't cache
2037	 * non-present entries we only need to flush the write-buffer. If it
2038	 * _does_ cache non-present entries, then it does so in the special
2039 * domain #0, which we have to flush:
2040 */
2041 if (cap_caching_mode(iommu->cap)) {
2042 iommu->flush.flush_context(iommu, 0,
2043 (((u16)bus) << 8) | devfn,
2044 DMA_CCMD_MASK_NOBIT,
2045 DMA_CCMD_DEVICE_INVL);
2046 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 } else {
2048 iommu_flush_write_buffer(iommu);
2049 }
2050
2051 ret = 0;
2052
2053out_unlock:
2054 spin_unlock(&iommu->lock);
2055
2056 return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060 struct dmar_domain *domain;
2061 struct intel_iommu *iommu;
2062 struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066 u16 alias, void *opaque)
2067{
2068 struct domain_context_mapping_data *data = opaque;
2069
2070 return domain_context_mapping_one(data->domain, data->iommu,
2071 data->table, PCI_BUS_NUM(alias),
2072 alias & 0xff);
2073}
2074
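/* Set up context entries for @dev and all of its DMA aliases. */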
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078 struct domain_context_mapping_data data;
2079 struct pasid_table *table;
2080 struct intel_iommu *iommu;
2081 u8 bus, devfn;
2082
2083 iommu = device_to_iommu(dev, &bus, &devfn);
2084 if (!iommu)
2085 return -ENODEV;
2086
2087 table = intel_pasid_get_table(dev);
2088
2089 if (!dev_is_pci(dev))
2090 return domain_context_mapping_one(domain, iommu, table,
2091 bus, devfn);
2092
2093 data.domain = domain;
2094 data.iommu = iommu;
2095 data.table = table;
2096
2097 return pci_for_each_dma_alias(to_pci_dev(dev),
2098 &domain_context_mapping_cb, &data);
2099}
2100
2101/* Return the number of VT-d pages, but aligned to the MM page size */
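/*
 * A worked example (a sketch assuming 4KiB MM pages and 4KiB VT-d
 * pages): host_addr = 0x1234 and size = 0x100 leave an in-page offset
 * of 0x234; PAGE_ALIGN(0x334) is 0x1000, so one VT-d page is returned.
 */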
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103 size_t size)
2104{
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
2109/* Return largest possible superpage level for a given mapping */
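/*
 * Level 1 means ordinary 4KiB pages, level 2 means 2MiB superpages and
 * level 3 means 1GiB superpages, bounded by the number of superpage
 * levels reported in domain->iommu_superpage.
 */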
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2114{
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2117
2118 support = domain->iommu_superpage;
2119
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2125
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2128 if (!pages)
2129 break;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
2131 level++;
2132 support--;
2133 }
2134 return level;
2135}
2136
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143 unsigned long start_pfn,
2144 unsigned long end_pfn, int level)
2145{
2146 unsigned long lvl_pages = lvl_to_nr_pages(level);
2147 struct iommu_domain_info *info;
2148 struct dma_pte *pte = NULL;
2149 unsigned long i;
2150
2151 while (start_pfn <= end_pfn) {
2152 if (!pte)
2153 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155 if (dma_pte_present(pte)) {
2156 dma_pte_free_pagetable(domain, start_pfn,
2157 start_pfn + lvl_pages - 1,
2158 level + 1);
2159
2160 xa_for_each(&domain->iommu_array, i, info)
2161 iommu_flush_iotlb_psi(info->iommu, domain,
2162 start_pfn, lvl_pages,
2163 0, 0);
2164 }
2165
2166 pte++;
2167 start_pfn += lvl_pages;
2168 if (first_pte_in_page(pte))
2169 pte = NULL;
2170 }
2171}
2172
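/*
 * Map @nr_pages pages starting at @iov_pfn to @phys_pfn in @domain,
 * using the largest superpage size that the hardware and the alignment
 * of both address ranges allow, and flushing the written PTEs from the
 * CPU cache as required.
 */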
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177 struct dma_pte *first_pte = NULL, *pte = NULL;
2178 unsigned int largepage_lvl = 0;
2179 unsigned long lvl_pages = 0;
2180 phys_addr_t pteval;
2181 u64 attr;
2182
2183 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186 return -EINVAL;
2187
2188 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189 attr |= DMA_FL_PTE_PRESENT;
2190 if (domain->use_first_level) {
2191 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192 if (prot & DMA_PTE_WRITE)
2193 attr |= DMA_FL_PTE_DIRTY;
2194 }
2195
2196 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198 while (nr_pages > 0) {
2199 uint64_t tmp;
2200
2201 if (!pte) {
2202 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203 phys_pfn, nr_pages);
2204
2205 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206 if (!pte)
2207 return -ENOMEM;
2208 first_pte = pte;
2209
2210 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
2212			/* It is a large page */
2213 if (largepage_lvl > 1) {
2214 unsigned long end_pfn;
2215 unsigned long pages_to_remove;
2216
2217 pteval |= DMA_PTE_LARGE_PAGE;
2218 pages_to_remove = min_t(unsigned long, nr_pages,
2219 nr_pte_to_next_page(pte) * lvl_pages);
2220 end_pfn = iov_pfn + pages_to_remove - 1;
2221 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222 } else {
2223 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224 }
2225
2226 }
2227		/* We don't need a lock here; nobody else
2228		 * touches the iova range.
2229 */
2230 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231 if (tmp) {
2232 static int dumps = 5;
2233 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234 iov_pfn, tmp, (unsigned long long)pteval);
2235 if (dumps) {
2236 dumps--;
2237 debug_dma_dump_mappings(NULL);
2238 }
2239 WARN_ON(1);
2240 }
2241
2242 nr_pages -= lvl_pages;
2243 iov_pfn += lvl_pages;
2244 phys_pfn += lvl_pages;
2245 pteval += lvl_pages * VTD_PAGE_SIZE;
2246
2247 /* If the next PTE would be the first in a new page, then we
2248 * need to flush the cache on the entries we've just written.
2249 * And then we'll need to recalculate 'pte', so clear it and
2250 * let it get set again in the if (!pte) block above.
2251 *
2252 * If we're done (!nr_pages) we need to flush the cache too.
2253 *
2254 * Also if we've been setting superpages, we may need to
2255 * recalculate 'pte' and switch back to smaller pages for the
2256 * end of the mapping, if the trailing size is not enough to
2257 * use another superpage (i.e. nr_pages < lvl_pages).
2258 */
2259 pte++;
2260 if (!nr_pages || first_pte_in_page(pte) ||
2261 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262 domain_flush_cache(domain, first_pte,
2263 (void *)pte - (void *)first_pte);
2264 pte = NULL;
2265 }
2266 }
2267
2268 return 0;
2269}
2270
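/*
 * Clear the context entry for (bus, devfn) and invalidate the context,
 * PASID (in scalable mode), IOTLB and device-TLB caches that may still
 * reference it.
 */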
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273 struct intel_iommu *iommu = info->iommu;
2274 struct context_entry *context;
2275 u16 did_old;
2276
2277 if (!iommu)
2278 return;
2279
2280 spin_lock(&iommu->lock);
2281 context = iommu_context_addr(iommu, bus, devfn, 0);
2282 if (!context) {
2283 spin_unlock(&iommu->lock);
2284 return;
2285 }
2286
2287 if (sm_supported(iommu)) {
2288 if (hw_pass_through && domain_type_is_si(info->domain))
2289 did_old = FLPT_DEFAULT_DID;
2290 else
2291 did_old = domain_id_iommu(info->domain, iommu);
2292 } else {
2293 did_old = context_domain_id(context);
2294 }
2295
2296 context_clear_entry(context);
2297 __iommu_flush_cache(iommu, context, sizeof(*context));
2298 spin_unlock(&iommu->lock);
2299 iommu->flush.flush_context(iommu,
2300 did_old,
2301 (((u16)bus) << 8) | devfn,
2302 DMA_CCMD_MASK_NOBIT,
2303 DMA_CCMD_DEVICE_INVL);
2304
2305 if (sm_supported(iommu))
2306 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308 iommu->flush.flush_iotlb(iommu,
2309 did_old,
2310 0,
2311 0,
2312 DMA_TLB_DSI_FLUSH);
2313
2314 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
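/*
 * Set up a first-level (scalable-mode) PASID entry so that @dev's
 * requests with @pasid are translated through @domain's page table,
 * after skipping down to the agaw supported by @iommu. Only 4- and
 * 5-level tables are valid for first-level translation.
 */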
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318 struct dmar_domain *domain,
2319 struct device *dev,
2320 u32 pasid)
2321{
2322 struct dma_pte *pgd = domain->pgd;
2323 int agaw, level;
2324 int flags = 0;
2325
2326 /*
2327	 * Skip top levels of page tables for an IOMMU which has
2328	 * a smaller agaw than the default. Unnecessary for PT mode.
2329 */
2330 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331 pgd = phys_to_virt(dma_pte_addr(pgd));
2332 if (!dma_pte_present(pgd))
2333 return -ENOMEM;
2334 }
2335
2336 level = agaw_to_level(agaw);
2337 if (level != 4 && level != 5)
2338 return -EINVAL;
2339
2340 if (pasid != PASID_RID2PASID)
2341 flags |= PASID_FLAG_SUPERVISOR_MODE;
2342 if (level == 5)
2343 flags |= PASID_FLAG_FL5LP;
2344
2345 if (domain->force_snooping)
2346 flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349 domain_id_iommu(domain, iommu),
2350 flags);
2351}
2352
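/*
 * A "real DMA subdevice" is a device whose DMA requests are actually
 * issued by a different physical PCI device, i.e. pci_real_dma_dev()
 * resolves to something other than the device itself.
 */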
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355 return dev && dev_is_pci(dev) &&
2356 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360 unsigned long first_vpfn,
2361 unsigned long last_vpfn)
2362{
2363 /*
2364	 * The RMRR range might overlap with the physical memory range;
2365	 * clear it first.
2366 */
2367 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369 return __domain_mapping(domain, first_vpfn,
2370 first_vpfn, last_vpfn - first_vpfn + 1,
2371 DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
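/*
 * Build the static identity (si) domain: identity-map all online memory
 * and all RMRR ranges so that devices attached to it can use DMA
 * addresses equal to physical addresses. With hardware pass-through
 * (@hw) no page tables are needed, so the mapping steps are skipped.
 */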
2376static int __init si_domain_init(int hw)
2377{
2378 struct dmar_rmrr_unit *rmrr;
2379 struct device *dev;
2380 int i, nid, ret;
2381
2382 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383 if (!si_domain)
2384 return -EFAULT;
2385
2386 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387 domain_exit(si_domain);
2388 si_domain = NULL;
2389 return -EFAULT;
2390 }
2391
2392 if (hw)
2393 return 0;
2394
2395 for_each_online_node(nid) {
2396 unsigned long start_pfn, end_pfn;
2397 int i;
2398
2399 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400 ret = iommu_domain_identity_map(si_domain,
2401 mm_to_dma_pfn(start_pfn),
2402 mm_to_dma_pfn(end_pfn));
2403 if (ret)
2404 return ret;
2405 }
2406 }
2407
2408 /*
2409	 * Identity map the RMRRs so that devices with RMRRs can also use
2410 * the si_domain.
2411 */
2412 for_each_rmrr_units(rmrr) {
2413 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414 i, dev) {
2415 unsigned long long start = rmrr->base_address;
2416 unsigned long long end = rmrr->end_address;
2417
2418 if (WARN_ON(end < start ||
2419 end >> agaw_to_width(si_domain->agaw)))
2420 continue;
2421
2422 ret = iommu_domain_identity_map(si_domain,
2423 mm_to_dma_pfn(start >> PAGE_SHIFT),
2424 mm_to_dma_pfn(end >> PAGE_SHIFT));
2425 if (ret)
2426 return ret;
2427 }
2428 }
2429
2430 return 0;
2431}
2432
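/*
 * Attach @dev to @domain: allocate a domain id on the device's IOMMU,
 * set up the RID2PASID entry in scalable mode, program the context
 * entry and finally enable the PCI capabilities (ATS/PRI/PASID) that
 * the device supports.
 */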
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434 struct device *dev)
2435{
2436 struct device_domain_info *info = dev_iommu_priv_get(dev);
2437 struct intel_iommu *iommu;
2438 unsigned long flags;
2439 u8 bus, devfn;
2440 int ret;
2441
2442 iommu = device_to_iommu(dev, &bus, &devfn);
2443 if (!iommu)
2444 return -ENODEV;
2445
2446 ret = domain_attach_iommu(domain, iommu);
2447 if (ret)
2448 return ret;
2449 info->domain = domain;
2450 spin_lock_irqsave(&domain->lock, flags);
2451 list_add(&info->link, &domain->devices);
2452 spin_unlock_irqrestore(&domain->lock, flags);
2453
2454 /* PASID table is mandatory for a PCI device in scalable mode. */
2455 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456 /* Setup the PASID entry for requests without PASID: */
2457 if (hw_pass_through && domain_type_is_si(domain))
2458 ret = intel_pasid_setup_pass_through(iommu, domain,
2459 dev, PASID_RID2PASID);
2460 else if (domain->use_first_level)
2461 ret = domain_setup_first_level(iommu, domain, dev,
2462 PASID_RID2PASID);
2463 else
2464 ret = intel_pasid_setup_second_level(iommu, domain,
2465 dev, PASID_RID2PASID);
2466 if (ret) {
2467 dev_err(dev, "Setup RID2PASID failed\n");
2468 device_block_translation(dev);
2469 return ret;
2470 }
2471 }
2472
2473 ret = domain_context_mapping(domain, dev);
2474 if (ret) {
2475 dev_err(dev, "Domain context map failed\n");
2476 device_block_translation(dev);
2477 return ret;
2478 }
2479
2480 iommu_enable_pci_caps(info);
2481
2482 return 0;
2483}
2484
2485static bool device_has_rmrr(struct device *dev)
2486{
2487 struct dmar_rmrr_unit *rmrr;
2488 struct device *tmp;
2489 int i;
2490
2491 rcu_read_lock();
2492 for_each_rmrr_units(rmrr) {
2493 /*
2494 * Return TRUE if this RMRR contains the device that
2495 * is passed in.
2496 */
2497 for_each_active_dev_scope(rmrr->devices,
2498 rmrr->devices_cnt, i, tmp)
2499 if (tmp == dev ||
2500 is_downstream_to_pci_bridge(dev, tmp)) {
2501 rcu_read_unlock();
2502 return true;
2503 }
2504 }
2505 rcu_read_unlock();
2506 return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511 * is relaxable (ie. is allowed to be not enforced under some conditions)
2512 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot. This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526 struct pci_dev *pdev;
2527
2528 if (!dev_is_pci(dev))
2529 return false;
2530
2531 pdev = to_pci_dev(dev);
2532 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533 return true;
2534 else
2535 return false;
2536}
2537
2538/*
2539 * There are a couple of cases where we need to restrict the functionality of
2540 * devices associated with RMRRs. The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost. This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API. This interface
2545 * expects to have full control of the IOVA space for the device. We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space. We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557 if (!device_has_rmrr(dev))
2558 return false;
2559
2560 if (device_rmrr_is_relaxable(dev))
2561 return false;
2562
2563 return true;
2564}
2565
2566/*
2567 * Return the required default domain type for a specific device.
2568 *
2569 * @dev: the device in query
2571 *
2572 * Returns:
2573 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575 * - 0: both identity and dynamic domains work for this device
2576 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579 if (dev_is_pci(dev)) {
2580 struct pci_dev *pdev = to_pci_dev(dev);
2581
2582 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583 return IOMMU_DOMAIN_IDENTITY;
2584
2585 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586 return IOMMU_DOMAIN_IDENTITY;
2587 }
2588
2589 return 0;
2590}
2591
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594 /*
2595	 * Start from a sane IOMMU hardware state.
2596 * If the queued invalidation is already initialized by us
2597 * (for example, while enabling interrupt-remapping) then
2598 * we got the things already rolling from a sane state.
2599 */
2600 if (!iommu->qi) {
2601 /*
2602 * Clear any previous faults.
2603 */
2604 dmar_fault(-1, iommu);
2605 /*
2606 * Disable queued invalidation if supported and already enabled
2607 * before OS handover.
2608 */
2609 dmar_disable_qi(iommu);
2610 }
2611
2612 if (dmar_enable_qi(iommu)) {
2613 /*
2614		 * Queued invalidation is not enabled; use register-based invalidation.
2615 */
2616 iommu->flush.flush_context = __iommu_flush_context;
2617 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618 pr_info("%s: Using Register based invalidation\n",
2619 iommu->name);
2620 } else {
2621 iommu->flush.flush_context = qi_flush_context;
2622 iommu->flush.flush_iotlb = qi_flush_iotlb;
2623 pr_info("%s: Using Queued invalidation\n", iommu->name);
2624 }
2625}
2626
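/*
 * Copy one bus's context table from the previous (kdump'd) kernel. In
 * scalable mode (SMT) each bus has a lower and an upper context table,
 * hence the doubled indexing. Copied entries have their domain ids
 * reserved in iommu->domain_ids and are marked as copied so that they
 * get flushed before being reprogrammed.
 */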
2627static int copy_context_table(struct intel_iommu *iommu,
2628 struct root_entry *old_re,
2629 struct context_entry **tbl,
2630 int bus, bool ext)
2631{
2632 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633 struct context_entry *new_ce = NULL, ce;
2634 struct context_entry *old_ce = NULL;
2635 struct root_entry re;
2636 phys_addr_t old_ce_phys;
2637
2638 tbl_idx = ext ? bus * 2 : bus;
2639 memcpy(&re, old_re, sizeof(re));
2640
2641 for (devfn = 0; devfn < 256; devfn++) {
2642 /* First calculate the correct index */
2643 idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645 if (idx == 0) {
2646 /* First save what we may have and clean up */
2647 if (new_ce) {
2648 tbl[tbl_idx] = new_ce;
2649 __iommu_flush_cache(iommu, new_ce,
2650 VTD_PAGE_SIZE);
2651 pos = 1;
2652 }
2653
2654 if (old_ce)
2655 memunmap(old_ce);
2656
2657 ret = 0;
2658 if (devfn < 0x80)
2659 old_ce_phys = root_entry_lctp(&re);
2660 else
2661 old_ce_phys = root_entry_uctp(&re);
2662
2663 if (!old_ce_phys) {
2664 if (ext && devfn == 0) {
2665 /* No LCTP, try UCTP */
2666 devfn = 0x7f;
2667 continue;
2668 } else {
2669 goto out;
2670 }
2671 }
2672
2673 ret = -ENOMEM;
2674 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675 MEMREMAP_WB);
2676 if (!old_ce)
2677 goto out;
2678
2679 new_ce = alloc_pgtable_page(iommu->node);
2680 if (!new_ce)
2681 goto out_unmap;
2682
2683 ret = 0;
2684 }
2685
2686 /* Now copy the context entry */
2687 memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689 if (!context_present(&ce))
2690 continue;
2691
2692 did = context_domain_id(&ce);
2693 if (did >= 0 && did < cap_ndoms(iommu->cap))
2694 set_bit(did, iommu->domain_ids);
2695
2696 set_context_copied(iommu, bus, devfn);
2697 new_ce[idx] = ce;
2698 }
2699
2700 tbl[tbl_idx + pos] = new_ce;
2701
2702 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705 memunmap(old_ce);
2706
2707out:
2708 return ret;
2709}
2710
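/*
 * Copy the root and context tables programmed by the previous kernel
 * (the kdump case) into freshly allocated tables and install them in
 * the new root entry table, so that in-flight DMA keeps working while
 * this kernel takes over.
 */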
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713 struct context_entry **ctxt_tbls;
2714 struct root_entry *old_rt;
2715 phys_addr_t old_rt_phys;
2716 int ctxt_table_entries;
2717 u64 rtaddr_reg;
2718 int bus, ret;
2719 bool new_ext, ext;
2720
2721 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723 new_ext = !!sm_supported(iommu);
2724
2725 /*
2726 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation means opening a window for data
2728 * corruption. So bail out and don't copy anything if we would
2729 * have to change the bit.
2730 */
2731 if (new_ext != ext)
2732 return -EINVAL;
2733
2734 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735 if (!iommu->copied_tables)
2736 return -ENOMEM;
2737
2738 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739 if (!old_rt_phys)
2740 return -EINVAL;
2741
2742 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743 if (!old_rt)
2744 return -ENOMEM;
2745
2746 /* This is too big for the stack - allocate it from slab */
2747 ctxt_table_entries = ext ? 512 : 256;
2748 ret = -ENOMEM;
2749 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750 if (!ctxt_tbls)
2751 goto out_unmap;
2752
2753 for (bus = 0; bus < 256; bus++) {
2754 ret = copy_context_table(iommu, &old_rt[bus],
2755 ctxt_tbls, bus, ext);
2756 if (ret) {
2757 pr_err("%s: Failed to copy context table for bus %d\n",
2758 iommu->name, bus);
2759 continue;
2760 }
2761 }
2762
2763 spin_lock(&iommu->lock);
2764
2765 /* Context tables are copied, now write them to the root_entry table */
2766 for (bus = 0; bus < 256; bus++) {
2767 int idx = ext ? bus * 2 : bus;
2768 u64 val;
2769
2770 if (ctxt_tbls[idx]) {
2771 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772 iommu->root_entry[bus].lo = val;
2773 }
2774
2775 if (!ext || !ctxt_tbls[idx + 1])
2776 continue;
2777
2778 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779 iommu->root_entry[bus].hi = val;
2780 }
2781
2782 spin_unlock(&iommu->lock);
2783
2784 kfree(ctxt_tbls);
2785
2786 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788 ret = 0;
2789
2790out_unmap:
2791 memunmap(old_rt);
2792
2793 return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799 struct intel_iommu *iommu = data;
2800 ioasid_t ioasid;
2801
2802 if (!iommu)
2803 return INVALID_IOASID;
2804 /*
2805	 * The VT-d virtual command interface always uses the full 20-bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policy, but this is out of the guest's control.
2808 */
2809 if (min < PASID_MIN || max > intel_pasid_max_id)
2810 return INVALID_IOASID;
2811
2812 if (vcmd_alloc_pasid(iommu, &ioasid))
2813 return INVALID_IOASID;
2814
2815 return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820 struct intel_iommu *iommu = data;
2821
2822 if (!iommu)
2823 return;
2824 /*
2825	 * The sanity check of the ioasid owner is done at the upper layer,
2826	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2827 */
2828 if (ioasid_find(NULL, ioasid, NULL)) {
2829 pr_alert("Cannot free active IOASID %d\n", ioasid);
2830 return;
2831 }
2832 vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837 /*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator because PASIDs are allocated system-wide by the host.
2840 */
2841 if (!cap_caching_mode(iommu->cap))
2842 return;
2843
2844 if (!sm_supported(iommu)) {
2845 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846 return;
2847 }
2848
2849 /*
2850	 * Register a custom PASID allocator if we are running in a guest;
2851	 * guest PASIDs must be obtained via the virtual command interface.
2852	 * There can be multiple vIOMMUs in each guest, but only one allocator
2853	 * is active. All vIOMMU allocators eventually call the same
2854 * host allocator.
2855 */
2856 if (!vccap_pasid(iommu->vccap))
2857 return;
2858
2859 pr_info("Register custom PASID allocator\n");
2860 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862 iommu->pasid_allocator.pdata = (void *)iommu;
2863 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865 /*
2866 * Disable scalable mode on this IOMMU if there
2867		 * is no custom allocator. Mixing SM-capable and
2868		 * non-SM vIOMMUs is not supported.
2869 */
2870 intel_iommu_sm = 0;
2871 }
2872}
2873#endif
2874
2875static int __init init_dmars(void)
2876{
2877 struct dmar_drhd_unit *drhd;
2878 struct intel_iommu *iommu;
2879 int ret;
2880
2881 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882 if (ret)
2883 goto free_iommu;
2884
2885 for_each_iommu(iommu, drhd) {
2886 if (drhd->ignored) {
2887 iommu_disable_translation(iommu);
2888 continue;
2889 }
2890
2891 /*
2892		 * Find the max PASID size of all IOMMUs in the system.
2893		 * We need to ensure the system PASID table is no bigger
2894		 * than the smallest supported size.
2895 */
2896 if (pasid_supported(iommu)) {
2897 u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899 intel_pasid_max_id = min_t(u32, temp,
2900 intel_pasid_max_id);
2901 }
2902
2903 intel_iommu_init_qi(iommu);
2904
2905 ret = iommu_init_domains(iommu);
2906 if (ret)
2907 goto free_iommu;
2908
2909 init_translation_status(iommu);
2910
2911 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912 iommu_disable_translation(iommu);
2913 clear_translation_pre_enabled(iommu);
2914 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915 iommu->name);
2916 }
2917
2918 /*
2919 * TBD:
2920		 * we could share the same root & context tables
2921		 * among all IOMMUs. Needs to be split out later.
2922 */
2923 ret = iommu_alloc_root_entry(iommu);
2924 if (ret)
2925 goto free_iommu;
2926
2927 if (translation_pre_enabled(iommu)) {
2928 pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930 ret = copy_translation_tables(iommu);
2931 if (ret) {
2932 /*
2933 * We found the IOMMU with translation
2934 * enabled - but failed to copy over the
2935 * old root-entry table. Try to proceed
2936 * by disabling translation now and
2937 * allocating a clean root-entry table.
2938 * This might cause DMAR faults, but
2939 * probably the dump will still succeed.
2940 */
2941 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942 iommu->name);
2943 iommu_disable_translation(iommu);
2944 clear_translation_pre_enabled(iommu);
2945 } else {
2946 pr_info("Copied translation tables from previous kernel for %s\n",
2947 iommu->name);
2948 }
2949 }
2950
2951 if (!ecap_pass_through(iommu->ecap))
2952 hw_pass_through = 0;
2953 intel_svm_check(iommu);
2954 }
2955
2956 /*
2957 * Now that qi is enabled on all iommus, set the root entry and flush
2958 * caches. This is required on some Intel X58 chipsets, otherwise the
2959 * flush_context function will loop forever and the boot hangs.
2960 */
2961 for_each_active_iommu(iommu, drhd) {
2962 iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964 register_pasid_allocator(iommu);
2965#endif
2966 iommu_set_root_entry(iommu);
2967 }
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970 dmar_map_gfx = 0;
2971#endif
2972
2973 if (!dmar_map_gfx)
2974 iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976 check_tylersburg_isoch();
2977
2978 ret = si_domain_init(hw_pass_through);
2979 if (ret)
2980 goto free_iommu;
2981
2982 /*
2983 * for each drhd
2984 * enable fault log
2985 * global invalidate context cache
2986 * global invalidate iotlb
2987 * enable translation
2988 */
2989 for_each_iommu(iommu, drhd) {
2990 if (drhd->ignored) {
2991 /*
2992 * we always have to disable PMRs or DMA may fail on
2993 * this device
2994 */
2995 if (force_on)
2996 iommu_disable_protect_mem_regions(iommu);
2997 continue;
2998 }
2999
3000 iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004 /*
3005			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3006			 * could cause a lock race condition, so drop the lock here.
3007 */
3008 up_write(&dmar_global_lock);
3009 ret = intel_svm_enable_prq(iommu);
3010 down_write(&dmar_global_lock);
3011 if (ret)
3012 goto free_iommu;
3013 }
3014#endif
3015 ret = dmar_set_interrupt(iommu);
3016 if (ret)
3017 goto free_iommu;
3018 }
3019
3020 return 0;
3021
3022free_iommu:
3023 for_each_active_iommu(iommu, drhd) {
3024 disable_dmar_iommu(iommu);
3025 free_dmar_iommu(iommu);
3026 }
3027 if (si_domain) {
3028 domain_exit(si_domain);
3029 si_domain = NULL;
3030 }
3031
3032 return ret;
3033}
3034
3035static void __init init_no_remapping_devices(void)
3036{
3037 struct dmar_drhd_unit *drhd;
3038 struct device *dev;
3039 int i;
3040
3041 for_each_drhd_unit(drhd) {
3042 if (!drhd->include_all) {
3043 for_each_active_dev_scope(drhd->devices,
3044 drhd->devices_cnt, i, dev)
3045 break;
3046 /* ignore DMAR unit if no devices exist */
3047 if (i == drhd->devices_cnt)
3048 drhd->ignored = 1;
3049 }
3050 }
3051
3052 for_each_active_drhd_unit(drhd) {
3053 if (drhd->include_all)
3054 continue;
3055
3056 for_each_active_dev_scope(drhd->devices,
3057 drhd->devices_cnt, i, dev)
3058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059 break;
3060 if (i < drhd->devices_cnt)
3061 continue;
3062
3063 /* This IOMMU has *only* gfx devices. Either bypass it or
3064		   set the gfx_dedicated flag, as appropriate. */
3065 drhd->gfx_dedicated = 1;
3066 if (!dmar_map_gfx)
3067 drhd->ignored = 1;
3068 }
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074 struct dmar_drhd_unit *drhd;
3075 struct intel_iommu *iommu = NULL;
3076
3077 for_each_active_iommu(iommu, drhd)
3078 if (iommu->qi)
3079 dmar_reenable_qi(iommu);
3080
3081 for_each_iommu(iommu, drhd) {
3082 if (drhd->ignored) {
3083 /*
3084 * we always have to disable PMRs or DMA may fail on
3085 * this device
3086 */
3087 if (force_on)
3088 iommu_disable_protect_mem_regions(iommu);
3089 continue;
3090 }
3091
3092 iommu_flush_write_buffer(iommu);
3093 iommu_set_root_entry(iommu);
3094 iommu_enable_translation(iommu);
3095 iommu_disable_protect_mem_regions(iommu);
3096 }
3097
3098 return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103 struct dmar_drhd_unit *drhd;
3104 struct intel_iommu *iommu;
3105
3106 for_each_active_iommu(iommu, drhd) {
3107 iommu->flush.flush_context(iommu, 0, 0, 0,
3108 DMA_CCMD_GLOBAL_INVL);
3109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110 DMA_TLB_GLOBAL_FLUSH);
3111 }
3112}
3113
3114static int iommu_suspend(void)
3115{
3116 struct dmar_drhd_unit *drhd;
3117 struct intel_iommu *iommu = NULL;
3118 unsigned long flag;
3119
3120 for_each_active_iommu(iommu, drhd) {
3121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122 GFP_KERNEL);
3123 if (!iommu->iommu_state)
3124 goto nomem;
3125 }
3126
3127 iommu_flush_all();
3128
3129 for_each_active_iommu(iommu, drhd) {
3130 iommu_disable_translation(iommu);
3131
3132 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135 readl(iommu->reg + DMAR_FECTL_REG);
3136 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137 readl(iommu->reg + DMAR_FEDATA_REG);
3138 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139 readl(iommu->reg + DMAR_FEADDR_REG);
3140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141 readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144 }
3145 return 0;
3146
3147nomem:
3148 for_each_active_iommu(iommu, drhd)
3149 kfree(iommu->iommu_state);
3150
3151 return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156 struct dmar_drhd_unit *drhd;
3157 struct intel_iommu *iommu = NULL;
3158 unsigned long flag;
3159
3160 if (init_iommu_hw()) {
3161 if (force_on)
3162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163 else
3164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165 return;
3166 }
3167
3168 for_each_active_iommu(iommu, drhd) {
3169
3170 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173 iommu->reg + DMAR_FECTL_REG);
3174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175 iommu->reg + DMAR_FEDATA_REG);
3176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177 iommu->reg + DMAR_FEADDR_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179 iommu->reg + DMAR_FEUADDR_REG);
3180
3181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182 }
3183
3184 for_each_active_iommu(iommu, drhd)
3185 kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189 .resume = iommu_resume,
3190 .suspend = iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195 register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200#endif /* CONFIG_SUSPEND */
3201
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206 rmrr->end_address <= rmrr->base_address ||
3207 arch_rmrr_sanity_check(rmrr))
3208 return -EINVAL;
3209
3210 return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215 struct acpi_dmar_reserved_memory *rmrr;
3216 struct dmar_rmrr_unit *rmrru;
3217
3218 rmrr = (struct acpi_dmar_reserved_memory *)header;
3219 if (rmrr_sanity_check(rmrr)) {
3220 pr_warn(FW_BUG
3221 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223 rmrr->base_address, rmrr->end_address,
3224 dmi_get_system_info(DMI_BIOS_VENDOR),
3225 dmi_get_system_info(DMI_BIOS_VERSION),
3226 dmi_get_system_info(DMI_PRODUCT_VERSION));
3227 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228 }
3229
3230 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231 if (!rmrru)
3232 goto out;
3233
3234 rmrru->hdr = header;
3235
3236 rmrru->base_address = rmrr->base_address;
3237 rmrru->end_address = rmrr->end_address;
3238
3239 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240 ((void *)rmrr) + rmrr->header.length,
3241 &rmrru->devices_cnt);
3242 if (rmrru->devices_cnt && rmrru->devices == NULL)
3243 goto free_rmrru;
3244
3245 list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247 return 0;
3248free_rmrru:
3249 kfree(rmrru);
3250out:
3251 return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256 struct dmar_atsr_unit *atsru;
3257 struct acpi_dmar_atsr *tmp;
3258
3259 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260 dmar_rcu_check()) {
3261 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262 if (atsr->segment != tmp->segment)
3263 continue;
3264 if (atsr->header.length != tmp->header.length)
3265 continue;
3266 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267 return atsru;
3268 }
3269
3270 return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275 struct acpi_dmar_atsr *atsr;
3276 struct dmar_atsr_unit *atsru;
3277
3278 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279 return 0;
3280
3281 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282 atsru = dmar_find_atsr(atsr);
3283 if (atsru)
3284 return 0;
3285
3286 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287 if (!atsru)
3288 return -ENOMEM;
3289
3290 /*
3291 * If memory is allocated from slab by ACPI _DSM method, we need to
3292 * copy the memory content because the memory buffer will be freed
3293 * on return.
3294 */
3295 atsru->hdr = (void *)(atsru + 1);
3296 memcpy(atsru->hdr, hdr, hdr->length);
3297 atsru->include_all = atsr->flags & 0x1;
3298 if (!atsru->include_all) {
3299 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300 (void *)atsr + atsr->header.length,
3301 &atsru->devices_cnt);
3302 if (atsru->devices_cnt && atsru->devices == NULL) {
3303 kfree(atsru);
3304 return -ENOMEM;
3305 }
3306 }
3307
3308 list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310 return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316 kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321 struct acpi_dmar_atsr *atsr;
3322 struct dmar_atsr_unit *atsru;
3323
3324 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325 atsru = dmar_find_atsr(atsr);
3326 if (atsru) {
3327 list_del_rcu(&atsru->list);
3328 synchronize_rcu();
3329 intel_iommu_free_atsr(atsru);
3330 }
3331
3332 return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337 int i;
3338 struct device *dev;
3339 struct acpi_dmar_atsr *atsr;
3340 struct dmar_atsr_unit *atsru;
3341
3342 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343 atsru = dmar_find_atsr(atsr);
3344 if (!atsru)
3345 return 0;
3346
3347 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349 i, dev)
3350 return -EBUSY;
3351 }
3352
3353 return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358 struct dmar_satc_unit *satcu;
3359 struct acpi_dmar_satc *tmp;
3360
3361 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362 dmar_rcu_check()) {
3363 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364 if (satc->segment != tmp->segment)
3365 continue;
3366 if (satc->header.length != tmp->header.length)
3367 continue;
3368 if (memcmp(satc, tmp, satc->header.length) == 0)
3369 return satcu;
3370 }
3371
3372 return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377 struct acpi_dmar_satc *satc;
3378 struct dmar_satc_unit *satcu;
3379
3380 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381 return 0;
3382
3383 satc = container_of(hdr, struct acpi_dmar_satc, header);
3384 satcu = dmar_find_satc(satc);
3385 if (satcu)
3386 return 0;
3387
3388 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389 if (!satcu)
3390 return -ENOMEM;
3391
3392 satcu->hdr = (void *)(satcu + 1);
3393 memcpy(satcu->hdr, hdr, hdr->length);
3394 satcu->atc_required = satc->flags & 0x1;
3395 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396 (void *)satc + satc->header.length,
3397 &satcu->devices_cnt);
3398 if (satcu->devices_cnt && !satcu->devices) {
3399 kfree(satcu);
3400 return -ENOMEM;
3401 }
3402 list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404 return 0;
3405}
3406
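/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate
 * domain ids and a root entry and, unless the unit is ignored, enable
 * queued invalidation, the fault interrupt and translation.
 */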
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409 int sp, ret;
3410 struct intel_iommu *iommu = dmaru->iommu;
3411
3412 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413 if (ret)
3414 goto out;
3415
3416 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417 pr_warn("%s: Doesn't support hardware pass through.\n",
3418 iommu->name);
3419 return -ENXIO;
3420 }
3421
3422 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424 pr_warn("%s: Doesn't support large page.\n",
3425 iommu->name);
3426 return -ENXIO;
3427 }
3428
3429 /*
3430 * Disable translation if already enabled prior to OS handover.
3431 */
3432 if (iommu->gcmd & DMA_GCMD_TE)
3433 iommu_disable_translation(iommu);
3434
3435 ret = iommu_init_domains(iommu);
3436 if (ret == 0)
3437 ret = iommu_alloc_root_entry(iommu);
3438 if (ret)
3439 goto out;
3440
3441 intel_svm_check(iommu);
3442
3443 if (dmaru->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on this device
3446 */
3447 if (force_on)
3448 iommu_disable_protect_mem_regions(iommu);
3449 return 0;
3450 }
3451
3452 intel_iommu_init_qi(iommu);
3453 iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 ret = intel_svm_enable_prq(iommu);
3458 if (ret)
3459 goto disable_iommu;
3460 }
3461#endif
3462 ret = dmar_set_interrupt(iommu);
3463 if (ret)
3464 goto disable_iommu;
3465
3466 iommu_set_root_entry(iommu);
3467 iommu_enable_translation(iommu);
3468
3469 iommu_disable_protect_mem_regions(iommu);
3470 return 0;
3471
3472disable_iommu:
3473 disable_dmar_iommu(iommu);
3474out:
3475 free_dmar_iommu(iommu);
3476 return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481 int ret = 0;
3482 struct intel_iommu *iommu = dmaru->iommu;
3483
3484 if (!intel_iommu_enabled)
3485 return 0;
3486 if (iommu == NULL)
3487 return -EINVAL;
3488
3489 if (insert) {
3490 ret = intel_iommu_add(dmaru);
3491 } else {
3492 disable_dmar_iommu(iommu);
3493 free_dmar_iommu(iommu);
3494 }
3495
3496 return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502 struct dmar_atsr_unit *atsru, *atsr_n;
3503 struct dmar_satc_unit *satcu, *satc_n;
3504
3505 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506 list_del(&rmrru->list);
3507 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508 kfree(rmrru);
3509 }
3510
3511 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512 list_del(&atsru->list);
3513 intel_iommu_free_atsr(atsru);
3514 }
3515 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516 list_del(&satcu->list);
3517 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518 kfree(satcu);
3519 }
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524 struct dmar_satc_unit *satcu;
3525 struct acpi_dmar_satc *satc;
3526 struct device *tmp;
3527 int i;
3528
3529 dev = pci_physfn(dev);
3530 rcu_read_lock();
3531
3532 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 if (satc->segment != pci_domain_nr(dev->bus))
3535 continue;
3536 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537 if (to_pci_dev(tmp) == dev)
3538 goto out;
3539 }
3540 satcu = NULL;
3541out:
3542 rcu_read_unlock();
3543 return satcu;
3544}
3545
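/*
 * Decide whether ATS may be used for @dev: consult the SATC table first
 * if the device is listed there, otherwise walk up to the root port and
 * match it against the ATSR units of the device's segment.
 */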
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548 int i, ret = 1;
3549 struct pci_bus *bus;
3550 struct pci_dev *bridge = NULL;
3551 struct device *tmp;
3552 struct acpi_dmar_atsr *atsr;
3553 struct dmar_atsr_unit *atsru;
3554 struct dmar_satc_unit *satcu;
3555
3556 dev = pci_physfn(dev);
3557 satcu = dmar_find_matched_satc_unit(dev);
3558 if (satcu)
3559 /*
3560		 * This device supports ATS, as it is listed in the SATC table.
3561		 * When the IOMMU is in legacy mode, enabling ATS is done
3562		 * automatically by the hardware for devices that require
3563		 * ATS, so the OS should not enable ATS on this device,
3564		 * to avoid duplicated TLB invalidations.
3565 */
3566 return !(satcu->atc_required && !sm_supported(iommu));
3567
3568 for (bus = dev->bus; bus; bus = bus->parent) {
3569 bridge = bus->self;
3570 /* If it's an integrated device, allow ATS */
3571 if (!bridge)
3572 return 1;
3573 /* Connected via non-PCIe: no ATS */
3574 if (!pci_is_pcie(bridge) ||
3575 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576 return 0;
3577 /* If we found the root port, look it up in the ATSR */
3578 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579 break;
3580 }
3581
3582 rcu_read_lock();
3583 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585 if (atsr->segment != pci_domain_nr(dev->bus))
3586 continue;
3587
3588 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589 if (tmp == &bridge->dev)
3590 goto out;
3591
3592 if (atsru->include_all)
3593 goto out;
3594 }
3595 ret = 0;
3596out:
3597 rcu_read_unlock();
3598
3599 return ret;
3600}
3601
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604 int ret;
3605 struct dmar_rmrr_unit *rmrru;
3606 struct dmar_atsr_unit *atsru;
3607 struct dmar_satc_unit *satcu;
3608 struct acpi_dmar_atsr *atsr;
3609 struct acpi_dmar_reserved_memory *rmrr;
3610 struct acpi_dmar_satc *satc;
3611
3612 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613 return 0;
3614
3615 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616 rmrr = container_of(rmrru->hdr,
3617 struct acpi_dmar_reserved_memory, header);
3618 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620 ((void *)rmrr) + rmrr->header.length,
3621 rmrr->segment, rmrru->devices,
3622 rmrru->devices_cnt);
3623 if (ret < 0)
3624 return ret;
3625 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626 dmar_remove_dev_scope(info, rmrr->segment,
3627 rmrru->devices, rmrru->devices_cnt);
3628 }
3629 }
3630
3631 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632 if (atsru->include_all)
3633 continue;
3634
3635 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638 (void *)atsr + atsr->header.length,
3639 atsr->segment, atsru->devices,
3640 atsru->devices_cnt);
3641 if (ret > 0)
3642 break;
3643 else if (ret < 0)
3644 return ret;
3645 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646 if (dmar_remove_dev_scope(info, atsr->segment,
3647 atsru->devices, atsru->devices_cnt))
3648 break;
3649 }
3650 }
3651 list_for_each_entry(satcu, &dmar_satc_units, list) {
3652 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655 (void *)satc + satc->header.length,
3656 satc->segment, satcu->devices,
3657 satcu->devices_cnt);
3658 if (ret > 0)
3659 break;
3660 else if (ret < 0)
3661 return ret;
3662 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663 if (dmar_remove_dev_scope(info, satc->segment,
3664 satcu->devices, satcu->devices_cnt))
3665 break;
3666 }
3667 }
3668
3669 return 0;
3670}
3671
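/*
 * Keep the si_domain identity map in sync with memory hotplug: map
 * ranges as they come online and unmap (and flush) them when they go
 * offline or fail to come online.
 */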
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673 unsigned long val, void *v)
3674{
3675 struct memory_notify *mhp = v;
3676 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678 mhp->nr_pages - 1);
3679
3680 switch (val) {
3681 case MEM_GOING_ONLINE:
3682 if (iommu_domain_identity_map(si_domain,
3683 start_vpfn, last_vpfn)) {
3684 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685 start_vpfn, last_vpfn);
3686 return NOTIFY_BAD;
3687 }
3688 break;
3689
3690 case MEM_OFFLINE:
3691 case MEM_CANCEL_ONLINE:
3692 {
3693 struct dmar_drhd_unit *drhd;
3694 struct intel_iommu *iommu;
3695 LIST_HEAD(freelist);
3696
3697 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698
3699 rcu_read_lock();
3700 for_each_active_iommu(iommu, drhd)
3701 iommu_flush_iotlb_psi(iommu, si_domain,
3702 start_vpfn, mhp->nr_pages,
3703 list_empty(&freelist), 0);
3704 rcu_read_unlock();
3705 put_pages_list(&freelist);
3706 }
3707 break;
3708 }
3709
3710 return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714 .notifier_call = intel_iommu_memory_notifier,
3715 .priority = 0
3716};
3717
3718static void intel_disable_iommus(void)
3719{
3720 struct intel_iommu *iommu = NULL;
3721 struct dmar_drhd_unit *drhd;
3722
3723 for_each_iommu(iommu, drhd)
3724 iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729 struct dmar_drhd_unit *drhd;
3730 struct intel_iommu *iommu = NULL;
3731
3732 if (no_iommu || dmar_disabled)
3733 return;
3734
3735 down_write(&dmar_global_lock);
3736
3737 /* Disable PMRs explicitly here. */
3738 for_each_iommu(iommu, drhd)
3739 iommu_disable_protect_mem_regions(iommu);
3740
3741 /* Make sure the IOMMUs are switched off */
3742 intel_disable_iommus();
3743
3744 up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751 return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755 struct device_attribute *attr, char *buf)
3756{
3757 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759 return sprintf(buf, "%d:%d\n",
3760 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765 struct device_attribute *attr, char *buf)
3766{
3767 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768 return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773 struct device_attribute *attr, char *buf)
3774{
3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781 struct device_attribute *attr, char *buf)
3782{
3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789 struct device_attribute *attr, char *buf)
3790{
3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797 struct device_attribute *attr, char *buf)
3798{
3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801 cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806 &dev_attr_version.attr,
3807 &dev_attr_address.attr,
3808 &dev_attr_cap.attr,
3809 &dev_attr_ecap.attr,
3810 &dev_attr_domains_supported.attr,
3811 &dev_attr_domains_used.attr,
3812 NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816 .name = "intel-iommu",
3817 .attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821 &intel_iommu_group,
3822 NULL,
3823};
3824
3825static inline bool has_external_pci(void)
3826{
3827 struct pci_dev *pdev = NULL;
3828
3829 for_each_pci_dev(pdev)
3830 if (pdev->external_facing) {
3831 pci_dev_put(pdev);
3832 return true;
3833 }
3834
3835 return false;
3836}
3837
3838static int __init platform_optin_force_iommu(void)
3839{
3840 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841 return 0;
3842
3843 if (no_iommu || dmar_disabled)
3844 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846 /*
3847 * If Intel-IOMMU is disabled by default, we will apply identity
3848 * map for all devices except those marked as being untrusted.
3849 */
3850 if (dmar_disabled)
3851 iommu_set_default_passthrough(false);
3852
3853 dmar_disabled = 0;
3854 no_iommu = 0;
3855
3856 return 1;
3857}
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861 struct dmar_drhd_unit *drhd;
3862 /* To avoid a -Wunused-but-set-variable warning. */
3863 struct intel_iommu *iommu __maybe_unused;
3864 struct device *dev;
3865 int i, ret = 0;
3866
3867 for_each_active_iommu(iommu, drhd) {
3868 for_each_active_dev_scope(drhd->devices,
3869 drhd->devices_cnt, i, dev) {
3870 struct acpi_device_physical_node *pn;
3871 struct iommu_group *group;
3872 struct acpi_device *adev;
3873
3874 if (dev->bus != &acpi_bus_type)
3875 continue;
3876
3877 adev = to_acpi_device(dev);
3878 mutex_lock(&adev->physical_node_lock);
3879 list_for_each_entry(pn,
3880 &adev->physical_node_list, node) {
3881 group = iommu_group_get(pn->dev);
3882 if (group) {
3883 iommu_group_put(group);
3884 continue;
3885 }
3886
3887 ret = iommu_probe_device(pn->dev);
3888 if (ret)
3889 break;
3890 }
3891 mutex_unlock(&adev->physical_node_lock);
3892
3893 if (ret)
3894 return ret;
3895 }
3896 }
3897
3898 return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903 if (!tboot_enabled())
3904 return 0;
3905
3906 if (no_iommu || dmar_disabled)
3907 pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909 dmar_disabled = 0;
3910 no_iommu = 0;
3911
3912 return 1;
3913}
3914
3915int __init intel_iommu_init(void)
3916{
3917 int ret = -ENODEV;
3918 struct dmar_drhd_unit *drhd;
3919 struct intel_iommu *iommu;
3920
3921 /*
3922 * Intel IOMMU is required for a TXT/tboot launch or platform
3923 * opt in, so enforce that.
3924 */
3925 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926 platform_optin_force_iommu();
3927
3928 down_write(&dmar_global_lock);
3929 if (dmar_table_init()) {
3930 if (force_on)
3931 panic("tboot: Failed to initialize DMAR table\n");
3932 goto out_free_dmar;
3933 }
3934
3935 if (dmar_dev_scope_init() < 0) {
3936 if (force_on)
3937 panic("tboot: Failed to initialize DMAR device scope\n");
3938 goto out_free_dmar;
3939 }
3940
3941 up_write(&dmar_global_lock);
3942
3943 /*
3944 * The bus notifier takes the dmar_global_lock, so lockdep will
3945 * complain later when we register it under the lock.
3946 */
3947 dmar_register_bus_notifier();
3948
3949 down_write(&dmar_global_lock);
3950
3951 if (!no_iommu)
3952 intel_iommu_debugfs_init();
3953
3954 if (no_iommu || dmar_disabled) {
3955 /*
3956 * We exit the function here to ensure IOMMU's remapping and
3957 * mempool aren't setup, which means that the IOMMU's PMRs
3958 * won't be disabled via the call to init_dmars(). So disable
3959 * it explicitly here. The PMRs were setup by tboot prior to
3960 * calling SENTER, but the kernel is expected to reset/tear
3961 * down the PMRs.
3962 */
3963 if (intel_iommu_tboot_noforce) {
3964 for_each_iommu(iommu, drhd)
3965 iommu_disable_protect_mem_regions(iommu);
3966 }
3967
3968 /*
3969 * Make sure the IOMMUs are switched off, even when we
3970 * boot into a kexec kernel and the previous kernel left
3971 * them enabled
3972 */
3973 intel_disable_iommus();
3974 goto out_free_dmar;
3975 }
3976
3977 if (list_empty(&dmar_rmrr_units))
3978 pr_info("No RMRR found\n");
3979
3980 if (list_empty(&dmar_atsr_units))
3981 pr_info("No ATSR found\n");
3982
3983 if (list_empty(&dmar_satc_units))
3984 pr_info("No SATC found\n");
3985
3986 init_no_remapping_devices();
3987
3988 ret = init_dmars();
3989 if (ret) {
3990 if (force_on)
3991 panic("tboot: Failed to initialize DMARs\n");
3992 pr_err("Initialization failed\n");
3993 goto out_free_dmar;
3994 }
3995 up_write(&dmar_global_lock);
3996
3997 init_iommu_pm_ops();
3998
3999 down_read(&dmar_global_lock);
4000 for_each_active_iommu(iommu, drhd) {
4001 /*
4002 * The flush queue implementation does not perform
4003 * page-selective invalidations that are required for efficient
4004 * TLB flushes in virtual environments. The benefit of batching
4005 * is likely to be much lower than the overhead of synchronizing
4006 * the virtual and physical IOMMU page-tables.
4007 */
4008 if (cap_caching_mode(iommu->cap)) {
4009 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010 iommu_set_dma_strict();
4011 }
4012 iommu_device_sysfs_add(&iommu->iommu, NULL,
4013 intel_iommu_groups,
4014 "%s", iommu->name);
4015 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016 }
4017 up_read(&dmar_global_lock);
4018
4019 if (si_domain && !hw_pass_through)
4020 register_memory_notifier(&intel_iommu_memory_nb);
4021
4022 down_read(&dmar_global_lock);
4023 if (probe_acpi_namespace_devices())
4024 pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026 /* Finally, we enable the DMA remapping hardware. */
4027 for_each_iommu(iommu, drhd) {
4028 if (!drhd->ignored && !translation_pre_enabled(iommu))
4029 iommu_enable_translation(iommu);
4030
4031 iommu_disable_protect_mem_regions(iommu);
4032 }
4033 up_read(&dmar_global_lock);
4034
4035 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037 intel_iommu_enabled = 1;
4038
4039 return 0;
4040
4041out_free_dmar:
4042 intel_iommu_free_dmars();
4043 up_write(&dmar_global_lock);
4044 return ret;
4045}
4046
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049 struct device_domain_info *info = opaque;
4050
4051 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052 return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices. If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them may leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064 return;
4065
4066 pci_for_each_dma_alias(to_pci_dev(info->dev),
4067 &domain_context_clear_one_cb, info);
4068}
4069
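/*
 * Detach @dev from its current domain: unless it is a real DMA
 * sub-device, tear down its RID2PASID entry (scalable mode) and context
 * mapping, then unlink it from the domain's device list and drop the
 * domain's reference on the IOMMU.
 */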
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072 struct device_domain_info *info = dev_iommu_priv_get(dev);
4073 struct dmar_domain *domain = info->domain;
4074 struct intel_iommu *iommu = info->iommu;
4075 unsigned long flags;
4076
4077 if (!dev_is_real_dma_subdevice(info->dev)) {
4078 if (dev_is_pci(info->dev) && sm_supported(iommu))
4079 intel_pasid_tear_down_entry(iommu, info->dev,
4080 PASID_RID2PASID, false);
4081
4082 iommu_disable_pci_caps(info);
4083 domain_context_clear(info);
4084 }
4085
4086 spin_lock_irqsave(&domain->lock, flags);
4087 list_del(&info->link);
4088 spin_unlock_irqrestore(&domain->lock, flags);
4089
4090 domain_detach_iommu(domain, iommu);
4091 info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101 struct device_domain_info *info = dev_iommu_priv_get(dev);
4102 struct intel_iommu *iommu = info->iommu;
4103 unsigned long flags;
4104
4105 iommu_disable_pci_caps(info);
4106 if (!dev_is_real_dma_subdevice(dev)) {
4107 if (sm_supported(iommu))
4108 intel_pasid_tear_down_entry(iommu, dev,
4109 PASID_RID2PASID, false);
4110 else
4111 domain_context_clear(info);
4112 }
4113
4114 if (!info->domain)
4115 return;
4116
4117 spin_lock_irqsave(&info->domain->lock, flags);
4118 list_del(&info->link);
4119 spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121 domain_detach_iommu(info->domain, iommu);
4122 info->domain = NULL;
4123}
4124
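/*
 * Initialize the address-width related fields of a freshly allocated
 * domain and allocate its top-level page directory.
 */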
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127 int adjust_width;
4128
4129 /* calculate AGAW */
4130 domain->gaw = guest_width;
4131 adjust_width = guestwidth_to_adjustwidth(guest_width);
4132 domain->agaw = width_to_agaw(adjust_width);
4133
4134 domain->iommu_coherency = false;
4135 domain->iommu_superpage = 0;
4136 domain->max_addr = 0;
4137
4138 /* always allocate the top pgd */
4139 domain->pgd = alloc_pgtable_page(domain->nid);
4140 if (!domain->pgd)
4141 return -ENOMEM;
4142 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143 return 0;
4144}
4145
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147 struct device *dev)
4148{
4149 device_block_translation(dev);
4150 return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154 .ops = &(const struct iommu_domain_ops) {
4155 .attach_dev = blocking_domain_attach_dev,
4156 .free = intel_iommu_domain_free
4157 }
4158};
4159
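/*
 * domain_alloc callback: hand out the static blocking and identity
 * domains, allocate and initialize a new DMA/unmanaged domain, or
 * defer to the SVA code for IOMMU_DOMAIN_SVA.
 */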
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162 struct dmar_domain *dmar_domain;
4163 struct iommu_domain *domain;
4164
4165 switch (type) {
4166 case IOMMU_DOMAIN_BLOCKED:
4167 return &blocking_domain;
4168 case IOMMU_DOMAIN_DMA:
4169 case IOMMU_DOMAIN_DMA_FQ:
4170 case IOMMU_DOMAIN_UNMANAGED:
4171 dmar_domain = alloc_domain(type);
4172 if (!dmar_domain) {
4173 pr_err("Can't allocate dmar_domain\n");
4174 return NULL;
4175 }
4176 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 pr_err("Domain initialization failed\n");
4178 domain_exit(dmar_domain);
4179 return NULL;
4180 }
4181
4182 domain = &dmar_domain->domain;
4183 domain->geometry.aperture_start = 0;
4184 domain->geometry.aperture_end =
4185 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 domain->geometry.force_aperture = true;
4187
4188 return domain;
4189 case IOMMU_DOMAIN_IDENTITY:
4190 return &si_domain->domain;
4191 case IOMMU_DOMAIN_SVA:
4192 return intel_svm_domain_alloc();
4193 default:
4194 return NULL;
4195 }
4196
4197 return NULL;
4198}
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202 if (domain != &si_domain->domain && domain != &blocking_domain)
4203 domain_exit(to_dmar_domain(domain));
4204}
4205
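/*
 * Check that @dev's IOMMU is compatible with @domain (snoop control and
 * address width) and trim extra page-table levels if the domain was
 * built wider than this IOMMU supports.
 */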
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207 struct device *dev)
4208{
4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 struct intel_iommu *iommu;
4211 int addr_width;
4212
4213 iommu = device_to_iommu(dev, NULL, NULL);
4214 if (!iommu)
4215 return -ENODEV;
4216
4217 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218 return -EINVAL;
4219
4220 /* check if this iommu agaw is sufficient for max mapped address */
4221 addr_width = agaw_to_width(iommu->agaw);
4222 if (addr_width > cap_mgaw(iommu->cap))
4223 addr_width = cap_mgaw(iommu->cap);
4224
4225 if (dmar_domain->max_addr > (1LL << addr_width))
4226 return -EINVAL;
4227 dmar_domain->gaw = addr_width;
4228
4229 /*
4230 * Knock out extra levels of page tables if necessary
4231 */
4232 while (iommu->agaw < dmar_domain->agaw) {
4233 struct dma_pte *pte;
4234
4235 pte = dmar_domain->pgd;
4236 if (dma_pte_present(pte)) {
4237 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238 free_pgtable_page(pte);
4239 }
4240 dmar_domain->agaw--;
4241 }
4242
4243 return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247 struct device *dev)
4248{
4249 struct device_domain_info *info = dev_iommu_priv_get(dev);
4250 int ret;
4251
4252 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253 device_is_rmrr_locked(dev)) {
4254 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4255 return -EPERM;
4256 }
4257
4258 if (info->domain)
4259 device_block_translation(dev);
4260
4261 ret = prepare_domain_attach_device(domain, dev);
4262 if (ret)
4263 return ret;
4264
4265 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
4267
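/*
 * Map [iova, iova + size) to @hpa with the requested permissions,
 * rejecting ranges that exceed the domain's address width and tracking
 * the highest mapped address in max_addr.
 */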
4268static int intel_iommu_map(struct iommu_domain *domain,
4269 unsigned long iova, phys_addr_t hpa,
4270 size_t size, int iommu_prot, gfp_t gfp)
4271{
4272 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273 u64 max_addr;
4274 int prot = 0;
4275
4276 if (iommu_prot & IOMMU_READ)
4277 prot |= DMA_PTE_READ;
4278 if (iommu_prot & IOMMU_WRITE)
4279 prot |= DMA_PTE_WRITE;
4280 if (dmar_domain->set_pte_snp)
4281 prot |= DMA_PTE_SNP;
4282
4283 max_addr = iova + size;
4284 if (dmar_domain->max_addr < max_addr) {
4285 u64 end;
4286
4287 /* check if minimum agaw is sufficient for mapped address */
4288 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289 if (end < max_addr) {
4290 pr_err("%s: iommu width (%d) is not "
4291 "sufficient for the mapped address (%llx)\n",
4292 __func__, dmar_domain->gaw, max_addr);
4293 return -EFAULT;
4294 }
4295 dmar_domain->max_addr = max_addr;
4296 }
4297 /* Round up size to next multiple of PAGE_SIZE, if it and
4298 the low bits of hpa would take us onto the next page */
4299 size = aligned_nrpages(hpa, size);
4300 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301 hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305 unsigned long iova, phys_addr_t paddr,
4306 size_t pgsize, size_t pgcount,
4307 int prot, gfp_t gfp, size_t *mapped)
4308{
4309 unsigned long pgshift = __ffs(pgsize);
4310 size_t size = pgcount << pgshift;
4311 int ret;
4312
4313 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314 return -EINVAL;
4315
4316 if (!IS_ALIGNED(iova | paddr, pgsize))
4317 return -EINVAL;
4318
4319 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320 if (!ret && mapped)
4321 *mapped = size;
4322
4323 return ret;
4324}
4325
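/*
 * Unmap [iova, iova + size); a large-page mapping widens the range to
 * cover the whole superpage. Freed page-table pages are queued on the
 * gather freelist and released in intel_iommu_tlb_sync().
 */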
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327 unsigned long iova, size_t size,
4328 struct iommu_iotlb_gather *gather)
4329{
4330 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331 unsigned long start_pfn, last_pfn;
4332 int level = 0;
4333
4334 /* Cope with horrid API which requires us to unmap more than the
4335 size argument if it happens to be a large-page mapping. */
4336 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341 start_pfn = iova >> VTD_PAGE_SHIFT;
4342 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346 if (dmar_domain->max_addr == iova + size)
4347 dmar_domain->max_addr = iova;
4348
4349 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351 return size;
4352}
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355 unsigned long iova,
4356 size_t pgsize, size_t pgcount,
4357 struct iommu_iotlb_gather *gather)
4358{
4359 unsigned long pgshift = __ffs(pgsize);
4360 size_t size = pgcount << pgshift;
4361
4362 return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
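/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU
 * this domain is attached to, then free the queued page-table pages.
 */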
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366 struct iommu_iotlb_gather *gather)
4367{
4368 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369 unsigned long iova_pfn = IOVA_PFN(gather->start);
4370 size_t size = gather->end - gather->start;
4371 struct iommu_domain_info *info;
4372 unsigned long start_pfn;
4373 unsigned long nrpages;
4374 unsigned long i;
4375
4376 nrpages = aligned_nrpages(gather->start, size);
4377 start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379 xa_for_each(&dmar_domain->iommu_array, i, info)
4380 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381 start_pfn, nrpages,
4382 list_empty(&gather->freelist), 0);
4383
4384 put_pages_list(&gather->freelist);
4385}
4386
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388 dma_addr_t iova)
4389{
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct dma_pte *pte;
4392 int level = 0;
4393 u64 phys = 0;
4394
4395 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396 if (pte && dma_pte_present(pte))
4397 phys = dma_pte_addr(pte) +
4398 (iova & (BIT_MASK(level_to_offset_bits(level) +
4399 VTD_PAGE_SHIFT) - 1));
4400
4401 return phys;
4402}
4403
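/*
 * Return true if every IOMMU with a device in this domain implements
 * snoop control (ECAP.SC). Called with domain->lock held.
 */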
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406 struct device_domain_info *info;
4407 bool support = true;
4408
4409 assert_spin_locked(&domain->lock);
4410 list_for_each_entry(info, &domain->devices, link) {
4411 if (!ecap_sc_support(info->iommu->ecap)) {
4412 support = false;
4413 break;
4414 }
4415 }
4416
4417 return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422 struct device_domain_info *info;
4423
4424 assert_spin_locked(&domain->lock);
4425 /*
4426	 * The second-level page table supports per-PTE snoop control. The
4427	 * iommu_map() interface will handle this by setting the SNP bit.
4428 */
4429 if (!domain->use_first_level) {
4430 domain->set_pte_snp = true;
4431 return;
4432 }
4433
4434 list_for_each_entry(info, &domain->devices, link)
4435 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436 PASID_RID2PASID);
4437}
4438
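/*
 * enforce_cache_coherency callback: make all current and future
 * mappings of this domain snooped, provided every attached IOMMU
 * supports snoop control.
 */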
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442 unsigned long flags;
4443
4444 if (dmar_domain->force_snooping)
4445 return true;
4446
4447 spin_lock_irqsave(&dmar_domain->lock, flags);
4448 if (!domain_support_force_snooping(dmar_domain)) {
4449 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450 return false;
4451 }
4452
4453 domain_set_force_snooping(dmar_domain);
4454 dmar_domain->force_snooping = true;
4455 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457 return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462 struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464 switch (cap) {
4465 case IOMMU_CAP_CACHE_COHERENCY:
4466 return true;
4467 case IOMMU_CAP_INTR_REMAP:
4468 return irq_remapping_enabled == 1;
4469 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470 return dmar_platform_optin();
4471 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472 return ecap_sc_support(info->iommu->ecap);
4473 default:
4474 return false;
4475 }
4476}
4477
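/*
 * probe_device callback: look up the IOMMU for @dev, allocate and fill
 * in the per-device device_domain_info (ATS/PASID/PRI capabilities)
 * and, in scalable mode, allocate the device's PASID table.
 */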
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481 struct device_domain_info *info;
4482 struct intel_iommu *iommu;
4483 u8 bus, devfn;
4484 int ret;
4485
4486 iommu = device_to_iommu(dev, &bus, &devfn);
4487 if (!iommu || !iommu->iommu.ops)
4488 return ERR_PTR(-ENODEV);
4489
4490 info = kzalloc(sizeof(*info), GFP_KERNEL);
4491 if (!info)
4492 return ERR_PTR(-ENOMEM);
4493
4494 if (dev_is_real_dma_subdevice(dev)) {
4495 info->bus = pdev->bus->number;
4496 info->devfn = pdev->devfn;
4497 info->segment = pci_domain_nr(pdev->bus);
4498 } else {
4499 info->bus = bus;
4500 info->devfn = devfn;
4501 info->segment = iommu->segment;
4502 }
4503
4504 info->dev = dev;
4505 info->iommu = iommu;
4506 if (dev_is_pci(dev)) {
4507 if (ecap_dev_iotlb_support(iommu->ecap) &&
4508 pci_ats_supported(pdev) &&
4509 dmar_ats_supported(pdev, iommu)) {
4510 info->ats_supported = 1;
4511 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512 }
4513 if (sm_supported(iommu)) {
4514 if (pasid_supported(iommu)) {
4515 int features = pci_pasid_features(pdev);
4516
4517 if (features >= 0)
4518 info->pasid_supported = features | 1;
4519 }
4520
4521 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522 pci_pri_supported(pdev))
4523 info->pri_supported = 1;
4524 }
4525 }
4526
4527 dev_iommu_priv_set(dev, info);
4528
4529 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530 ret = intel_pasid_alloc_table(dev);
4531 if (ret) {
4532 dev_err(dev, "PASID table allocation failed\n");
4533 dev_iommu_priv_set(dev, NULL);
4534 kfree(info);
4535 return ERR_PTR(ret);
4536 }
4537 }
4538
4539 return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544 struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546 dmar_remove_one_dev_info(dev);
4547 intel_pasid_free_table(dev);
4548 dev_iommu_priv_set(dev, NULL);
4549 kfree(info);
4550 set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555 set_dma_ops(dev, NULL);
4556 iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
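/*
 * Report reserved regions for @device: RMRRs that target it, a 16MB
 * direct-mapped window for ISA bridges (the floppy workaround, if
 * configured), and the IOAPIC/MSI window.
 */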
4559static void intel_iommu_get_resv_regions(struct device *device,
4560 struct list_head *head)
4561{
4562 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563 struct iommu_resv_region *reg;
4564 struct dmar_rmrr_unit *rmrr;
4565 struct device *i_dev;
4566 int i;
4567
4568 rcu_read_lock();
4569 for_each_rmrr_units(rmrr) {
4570 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571 i, i_dev) {
4572 struct iommu_resv_region *resv;
4573 enum iommu_resv_type type;
4574 size_t length;
4575
4576 if (i_dev != device &&
4577 !is_downstream_to_pci_bridge(device, i_dev))
4578 continue;
4579
4580 length = rmrr->end_address - rmrr->base_address + 1;
4581
4582 type = device_rmrr_is_relaxable(device) ?
4583 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585 resv = iommu_alloc_resv_region(rmrr->base_address,
4586 length, prot, type,
4587 GFP_ATOMIC);
4588 if (!resv)
4589 break;
4590
4591 list_add_tail(&resv->list, head);
4592 }
4593 }
4594 rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597 if (dev_is_pci(device)) {
4598 struct pci_dev *pdev = to_pci_dev(device);
4599
4600 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602 IOMMU_RESV_DIRECT_RELAXABLE,
4603 GFP_KERNEL);
4604 if (reg)
4605				list_add_tail(&reg->list, head);
4606 }
4607 }
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612 0, IOMMU_RESV_MSI, GFP_KERNEL);
4613 if (!reg)
4614 return;
4615	list_add_tail(&reg->list, head);
4616}
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620 if (dev_is_pci(dev))
4621 return pci_device_group(dev);
4622 return generic_device_group(dev);
4623}
4624
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627 struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 struct intel_iommu *iommu;
4629 int ret;
4630
4631 if (!info || dmar_disabled)
4632 return -EINVAL;
4633
4634 iommu = info->iommu;
4635 if (!iommu)
4636 return -EINVAL;
4637
4638 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639 return -ENODEV;
4640
4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 return -EINVAL;
4643
4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 if (!ret)
4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648 return ret;
4649}
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653 struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 struct intel_iommu *iommu = info->iommu;
4655 int ret;
4656
4657 ret = iommu_unregister_device_fault_handler(dev);
4658 if (!ret)
4659 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661 return ret;
4662}
4663
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666 struct device_domain_info *info = dev_iommu_priv_get(dev);
4667
4668 if (info && info->pri_supported)
4669 return 0;
4670
4671 return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677 switch (feat) {
4678 case IOMMU_DEV_FEAT_IOPF:
4679 return intel_iommu_enable_iopf(dev);
4680
4681 case IOMMU_DEV_FEAT_SVA:
4682 return intel_iommu_enable_sva(dev);
4683
4684 default:
4685 return -ENODEV;
4686 }
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692 switch (feat) {
4693 case IOMMU_DEV_FEAT_IOPF:
4694 return 0;
4695
4696 case IOMMU_DEV_FEAT_SVA:
4697 return intel_iommu_disable_sva(dev);
4698
4699 default:
4700 return -ENODEV;
4701 }
4702}
4703
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706 struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708 return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718 if (pdev->untrusted) {
4719 pci_info(pdev,
4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 pdev->vendor, pdev->device);
4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 return true;
4724 }
4725 return false;
4726}
4727
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 unsigned long iova, size_t size)
4730{
4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 unsigned long pages = aligned_nrpages(iova, size);
4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 struct iommu_domain_info *info;
4735 unsigned long i;
4736
4737 xa_for_each(&dmar_domain->iommu_array, i, info)
4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
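/*
 * Detach whatever domain is attached to @pasid of @dev, performing any
 * domain-type specific cleanup, and tear down the PASID table entry.
 */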
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744 struct iommu_domain *domain;
4745
4746 /* Domain type specific cleanup: */
4747 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748 if (domain) {
4749 switch (domain->type) {
4750 case IOMMU_DOMAIN_SVA:
4751 intel_svm_remove_dev_pasid(dev, pasid);
4752 break;
4753 default:
4754 /* should never reach here */
4755 WARN_ON(1);
4756 break;
4757 }
4758 }
4759
4760 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
4763const struct iommu_ops intel_iommu_ops = {
4764 .capable = intel_iommu_capable,
4765 .domain_alloc = intel_iommu_domain_alloc,
4766 .probe_device = intel_iommu_probe_device,
4767 .probe_finalize = intel_iommu_probe_finalize,
4768 .release_device = intel_iommu_release_device,
4769 .get_resv_regions = intel_iommu_get_resv_regions,
4770 .device_group = intel_iommu_device_group,
4771 .dev_enable_feat = intel_iommu_dev_enable_feat,
4772 .dev_disable_feat = intel_iommu_dev_disable_feat,
4773 .is_attach_deferred = intel_iommu_is_attach_deferred,
4774 .def_domain_type = device_def_domain_type,
4775 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4776 .pgsize_bitmap = SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778 .page_response = intel_svm_page_response,
4779#endif
4780 .default_domain_ops = &(const struct iommu_domain_ops) {
4781 .attach_dev = intel_iommu_attach_device,
4782 .map_pages = intel_iommu_map_pages,
4783 .unmap_pages = intel_iommu_unmap_pages,
4784 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4785 .flush_iotlb_all = intel_flush_iotlb_all,
4786 .iotlb_sync = intel_iommu_tlb_sync,
4787 .iova_to_phys = intel_iommu_iova_to_phys,
4788 .free = intel_iommu_domain_free,
4789 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790 }
4791};
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795 if (risky_device(dev))
4796 return;
4797
4798 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799 dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839 if (risky_device(dev))
4840 return;
4841
4842 /*
4843 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844 * but needs it. Same seems to hold for the desktop versions.
4845 */
4846 pci_info(dev, "Forcing write-buffer flush capability\n");
4847 rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4867
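/*
 * Calpella/Ironlake: if the BIOS left GGC_MEMORY_VT_ENABLED clear, no
 * shadow GTT was allocated and the graphics device cannot be
 * translated, so disable the graphics IOMMU; otherwise fall back to
 * strict IOTLB flushing so the GPU is idle before a flush.
 */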
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870 unsigned short ggc;
4871
4872 if (risky_device(dev))
4873 return;
4874
4875 if (pci_read_config_word(dev, GGC, &ggc))
4876 return;
4877
4878 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880 dmar_map_gfx = 0;
4881 } else if (dmar_map_gfx) {
4882 /* we have to ensure the gfx device is idle before we flush */
4883 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884 iommu_set_dma_strict();
4885 }
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894 unsigned short ver;
4895
4896 if (!IS_GFX_DEVICE(dev))
4897 return;
4898
4899 ver = (dev->device >> 8) & 0xff;
4900 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902 ver != 0x9a && ver != 0xa7)
4903 return;
4904
4905 if (risky_device(dev))
4906 return;
4907
4908 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909 iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914 ISOCH DMAR unit for the Azalia sound device, but not give it any
4915 TLB entries, which causes it to deadlock. Check for that. We do
4916 this in a function called from init_dmars(), instead of in a PCI
4917 quirk, because we don't want to print the obnoxious "BIOS broken"
4918 message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922 struct pci_dev *pdev;
4923 uint32_t vtisochctrl;
4924
4925 /* If there's no Azalia in the system anyway, forget it. */
4926 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927 if (!pdev)
4928 return;
4929
4930 if (risky_device(pdev)) {
4931 pci_dev_put(pdev);
4932 return;
4933 }
4934
4935 pci_dev_put(pdev);
4936
4937 /* System Management Registers. Might be hidden, in which case
4938 we can't do the sanity check. But that's OK, because the
4939 known-broken BIOSes _don't_ actually hide it, so far. */
4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941 if (!pdev)
4942 return;
4943
4944 if (risky_device(pdev)) {
4945 pci_dev_put(pdev);
4946 return;
4947 }
4948
4949 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950 pci_dev_put(pdev);
4951 return;
4952 }
4953
4954 pci_dev_put(pdev);
4955
4956 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957 if (vtisochctrl & 1)
4958 return;
4959
4960 /* Drop all bits other than the number of TLB entries */
4961 vtisochctrl &= 0x1c;
4962
4963 /* If we have the recommended number of TLB entries (16), fine. */
4964 if (vtisochctrl == 0x10)
4965 return;
4966
4967 /* Zero TLB entries? You get to ride the short bus to school. */
4968 if (!vtisochctrl) {
4969 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971 dmi_get_system_info(DMI_BIOS_VENDOR),
4972 dmi_get_system_info(DMI_BIOS_VERSION),
4973 dmi_get_system_info(DMI_PRODUCT_VERSION));
4974 iommu_identity_mapping |= IDENTMAP_AZALIA;
4975 return;
4976 }
4977
4978 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979 vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where a device may inadvertently issue ATS
4984 * invalidation completion before posted writes initiated with translated address
4985 * that utilized translations matching the invalidation address range, violating
4986 * the invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4988 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4989 * under the control of the trusted/privileged host device driver must use this
4990 * quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does DMA API unmap IOVA
4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4995 * exit_mmap() due to crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4997 * VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback to
5003 * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
5004 * The dTLB invalidation after PASID cache flush does not need this quirk.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009 unsigned long address, unsigned long mask,
5010 u32 pasid, u16 qdep)
5011{
5012 u16 sid;
5013
5014 if (likely(!info->dtlb_extra_inval))
5015 return;
5016
5017 sid = PCI_DEVID(info->bus, info->devfn);
5018 if (pasid == PASID_RID2PASID) {
5019 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020 qdep, address, mask);
5021 } else {
5022 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023 pasid, qdep, address, mask);
5024 }
5025}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164 return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168 return page_to_dma_pfn(virt_to_page(p));
169}
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
177/*
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
180 */
181static int force_on = 0;
182static int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209}
210
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213 context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218 return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223 context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228 return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233 return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245 context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250 context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255{
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262{
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269{
270 context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275{
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
281 return((c->hi >> 8) & 0xffff);
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286 context->lo = 0;
287 context->hi = 0;
288}
289
290/*
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creats a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu mapps to this domain if successful.
295 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299#define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
302
303struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
310};
311
312struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
318};
319
320struct dmar_satc_unit {
321 struct list_head list; /* list of SATC units */
322 struct acpi_dmar_header *hdr; /* ACPI header */
323 struct dmar_dev_scope *devices; /* target devices */
324 struct intel_iommu *iommu; /* the corresponding iommu */
325 int devices_cnt; /* target device count */
326 u8 atc_required:1; /* ATS is required */
327};
328
329static LIST_HEAD(dmar_atsr_units);
330static LIST_HEAD(dmar_rmrr_units);
331static LIST_HEAD(dmar_satc_units);
332
333#define for_each_rmrr_units(rmrr) \
334 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335
336/* bitmap for indexing intel_iommus */
337static int g_num_of_iommus;
338
339static void domain_exit(struct dmar_domain *domain);
340static void domain_remove_dev_info(struct dmar_domain *domain);
341static void dmar_remove_one_dev_info(struct device *dev);
342static void __dmar_remove_one_dev_info(struct device_domain_info *info);
343static int intel_iommu_attach_device(struct iommu_domain *domain,
344 struct device *dev);
345static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
346 dma_addr_t iova);
347
348#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349int dmar_disabled = 0;
350#else
351int dmar_disabled = 1;
352#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
353
354#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
355int intel_iommu_sm = 1;
356#else
357int intel_iommu_sm;
358#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
359
360int intel_iommu_enabled = 0;
361EXPORT_SYMBOL_GPL(intel_iommu_enabled);
362
363static int dmar_map_gfx = 1;
364static int intel_iommu_strict;
365static int intel_iommu_superpage = 1;
366static int iommu_identity_mapping;
367static int iommu_skip_te_disable;
368
369#define IDENTMAP_GFX 2
370#define IDENTMAP_AZALIA 4
371
372int intel_iommu_gfx_mapped;
373EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
374
375#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
376struct device_domain_info *get_domain_info(struct device *dev)
377{
378 struct device_domain_info *info;
379
380 if (!dev)
381 return NULL;
382
383 info = dev_iommu_priv_get(dev);
384 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
385 return NULL;
386
387 return info;
388}
389
390DEFINE_SPINLOCK(device_domain_lock);
391static LIST_HEAD(device_domain_list);
392
393/*
394 * Iterate over elements in device_domain_list and call the specified
395 * callback @fn against each element.
396 */
397int for_each_device_domain(int (*fn)(struct device_domain_info *info,
398 void *data), void *data)
399{
400 int ret = 0;
401 unsigned long flags;
402 struct device_domain_info *info;
403
404 spin_lock_irqsave(&device_domain_lock, flags);
405 list_for_each_entry(info, &device_domain_list, global) {
406 ret = fn(info, data);
407 if (ret) {
408 spin_unlock_irqrestore(&device_domain_lock, flags);
409 return ret;
410 }
411 }
412 spin_unlock_irqrestore(&device_domain_lock, flags);
413
414 return 0;
415}
416
417const struct iommu_ops intel_iommu_ops;
418
419static bool translation_pre_enabled(struct intel_iommu *iommu)
420{
421 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
422}
423
424static void clear_translation_pre_enabled(struct intel_iommu *iommu)
425{
426 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
427}
428
429static void init_translation_status(struct intel_iommu *iommu)
430{
431 u32 gsts;
432
433 gsts = readl(iommu->reg + DMAR_GSTS_REG);
434 if (gsts & DMA_GSTS_TES)
435 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
436}
437
438static int __init intel_iommu_setup(char *str)
439{
440 if (!str)
441 return -EINVAL;
442 while (*str) {
443 if (!strncmp(str, "on", 2)) {
444 dmar_disabled = 0;
445 pr_info("IOMMU enabled\n");
446 } else if (!strncmp(str, "off", 3)) {
447 dmar_disabled = 1;
448 no_platform_optin = 1;
449 pr_info("IOMMU disabled\n");
450 } else if (!strncmp(str, "igfx_off", 8)) {
451 dmar_map_gfx = 0;
452 pr_info("Disable GFX device mapping\n");
453 } else if (!strncmp(str, "forcedac", 8)) {
454 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
455 iommu_dma_forcedac = true;
456 } else if (!strncmp(str, "strict", 6)) {
457 pr_info("Disable batched IOTLB flush\n");
458 intel_iommu_strict = 1;
459 } else if (!strncmp(str, "sp_off", 6)) {
460 pr_info("Disable supported super page\n");
461 intel_iommu_superpage = 0;
462 } else if (!strncmp(str, "sm_on", 5)) {
463 pr_info("Intel-IOMMU: scalable mode supported\n");
464 intel_iommu_sm = 1;
465 } else if (!strncmp(str, "tboot_noforce", 13)) {
466 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467 intel_iommu_tboot_noforce = 1;
468 }
469
470 str += strcspn(str, ",");
471 while (*str == ',')
472 str++;
473 }
474 return 0;
475}
476__setup("intel_iommu=", intel_iommu_setup);
477
478static struct kmem_cache *iommu_domain_cache;
479static struct kmem_cache *iommu_devinfo_cache;
480
481static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482{
483 struct dmar_domain **domains;
484 int idx = did >> 8;
485
486 domains = iommu->domains[idx];
487 if (!domains)
488 return NULL;
489
490 return domains[did & 0xff];
491}
492
493static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494 struct dmar_domain *domain)
495{
496 struct dmar_domain **domains;
497 int idx = did >> 8;
498
499 if (!iommu->domains[idx]) {
500 size_t size = 256 * sizeof(struct dmar_domain *);
501 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502 }
503
504 domains = iommu->domains[idx];
505 if (WARN_ON(!domains))
506 return;
507 else
508 domains[did & 0xff] = domain;
509}
510
511void *alloc_pgtable_page(int node)
512{
513 struct page *page;
514 void *vaddr = NULL;
515
516 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 if (page)
518 vaddr = page_address(page);
519 return vaddr;
520}
521
522void free_pgtable_page(void *vaddr)
523{
524 free_page((unsigned long)vaddr);
525}
526
527static inline void *alloc_domain_mem(void)
528{
529 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530}
531
532static void free_domain_mem(void *vaddr)
533{
534 kmem_cache_free(iommu_domain_cache, vaddr);
535}
536
537static inline void * alloc_devinfo_mem(void)
538{
539 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540}
541
542static inline void free_devinfo_mem(void *vaddr)
543{
544 kmem_cache_free(iommu_devinfo_cache, vaddr);
545}
546
547static inline int domain_type_is_si(struct dmar_domain *domain)
548{
549 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550}
551
552static inline bool domain_use_first_level(struct dmar_domain *domain)
553{
554 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
555}
556
557static inline int domain_pfn_supported(struct dmar_domain *domain,
558 unsigned long pfn)
559{
560 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
561
562 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
563}
564
565static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
566{
567 unsigned long sagaw;
568 int agaw;
569
570 sagaw = cap_sagaw(iommu->cap);
571 for (agaw = width_to_agaw(max_gaw);
572 agaw >= 0; agaw--) {
573 if (test_bit(agaw, &sagaw))
574 break;
575 }
576
577 return agaw;
578}
579
580/*
581 * Calculate max SAGAW for each iommu.
582 */
583int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
584{
585 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
586}
587
588/*
589 * calculate agaw for each iommu.
590 * "SAGAW" may be different across iommus, use a default agaw, and
591 * get a supported less agaw for iommus that don't support the default agaw.
592 */
593int iommu_calculate_agaw(struct intel_iommu *iommu)
594{
595 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
596}
597
598/* This functionin only returns single iommu in a domain */
599struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
600{
601 int iommu_id;
602
603 /* si_domain and vm domain should not get here. */
604 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
605 return NULL;
606
607 for_each_domain_iommu(iommu_id, domain)
608 break;
609
610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
611 return NULL;
612
613 return g_iommus[iommu_id];
614}
615
616static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
617{
618 return sm_supported(iommu) ?
619 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
620}
621
622static void domain_update_iommu_coherency(struct dmar_domain *domain)
623{
624 struct dmar_drhd_unit *drhd;
625 struct intel_iommu *iommu;
626 bool found = false;
627 int i;
628
629 domain->iommu_coherency = true;
630
631 for_each_domain_iommu(i, domain) {
632 found = true;
633 if (!iommu_paging_structure_coherency(g_iommus[i])) {
634 domain->iommu_coherency = false;
635 break;
636 }
637 }
638 if (found)
639 return;
640
641 /* No hardware attached; use lowest common denominator */
642 rcu_read_lock();
643 for_each_active_iommu(iommu, drhd) {
644 if (!iommu_paging_structure_coherency(iommu)) {
645 domain->iommu_coherency = false;
646 break;
647 }
648 }
649 rcu_read_unlock();
650}
651
652static bool domain_update_iommu_snooping(struct intel_iommu *skip)
653{
654 struct dmar_drhd_unit *drhd;
655 struct intel_iommu *iommu;
656 bool ret = true;
657
658 rcu_read_lock();
659 for_each_active_iommu(iommu, drhd) {
660 if (iommu != skip) {
661 /*
662 * If the hardware is operating in the scalable mode,
663 * the snooping control is always supported since we
664 * always set PASID-table-entry.PGSNP bit if the domain
665 * is managed outside (UNMANAGED).
666 */
667 if (!sm_supported(iommu) &&
668 !ecap_sc_support(iommu->ecap)) {
669 ret = false;
670 break;
671 }
672 }
673 }
674 rcu_read_unlock();
675
676 return ret;
677}
678
679static int domain_update_iommu_superpage(struct dmar_domain *domain,
680 struct intel_iommu *skip)
681{
682 struct dmar_drhd_unit *drhd;
683 struct intel_iommu *iommu;
684 int mask = 0x3;
685
686 if (!intel_iommu_superpage)
687 return 0;
688
689 /* set iommu_superpage to the smallest common denominator */
690 rcu_read_lock();
691 for_each_active_iommu(iommu, drhd) {
692 if (iommu != skip) {
693 if (domain && domain_use_first_level(domain)) {
694 if (!cap_fl1gp_support(iommu->cap))
695 mask = 0x1;
696 } else {
697 mask &= cap_super_page_val(iommu->cap);
698 }
699
700 if (!mask)
701 break;
702 }
703 }
704 rcu_read_unlock();
705
706 return fls(mask);
707}
708
709static int domain_update_device_node(struct dmar_domain *domain)
710{
711 struct device_domain_info *info;
712 int nid = NUMA_NO_NODE;
713
714 assert_spin_locked(&device_domain_lock);
715
716 if (list_empty(&domain->devices))
717 return NUMA_NO_NODE;
718
719 list_for_each_entry(info, &domain->devices, link) {
720 if (!info->dev)
721 continue;
722
723 /*
724 * There could possibly be multiple device numa nodes as devices
725 * within the same domain may sit behind different IOMMUs. There
726 * isn't perfect answer in such situation, so we select first
727 * come first served policy.
728 */
729 nid = dev_to_node(info->dev);
730 if (nid != NUMA_NO_NODE)
731 break;
732 }
733
734 return nid;
735}
736
737static void domain_update_iotlb(struct dmar_domain *domain);
738
739/* Some capabilities may be different across iommus */
740static void domain_update_iommu_cap(struct dmar_domain *domain)
741{
742 domain_update_iommu_coherency(domain);
743 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
744 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
745
746 /*
747 * If RHSA is missing, we should default to the device numa domain
748 * as fall back.
749 */
750 if (domain->nid == NUMA_NO_NODE)
751 domain->nid = domain_update_device_node(domain);
752
753 /*
754 * First-level translation restricts the input-address to a
755 * canonical address (i.e., address bits 63:N have the same
756 * value as address bit [N-1], where N is 48-bits with 4-level
757 * paging and 57-bits with 5-level paging). Hence, skip bit
758 * [N-1].
759 */
760 if (domain_use_first_level(domain))
761 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
762 else
763 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
764
765 domain_update_iotlb(domain);
766}
767
768struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
769 u8 devfn, int alloc)
770{
771 struct root_entry *root = &iommu->root_entry[bus];
772 struct context_entry *context;
773 u64 *entry;
774
775 entry = &root->lo;
776 if (sm_supported(iommu)) {
777 if (devfn >= 0x80) {
778 devfn -= 0x80;
779 entry = &root->hi;
780 }
781 devfn *= 2;
782 }
783 if (*entry & 1)
784 context = phys_to_virt(*entry & VTD_PAGE_MASK);
785 else {
786 unsigned long phy_addr;
787 if (!alloc)
788 return NULL;
789
790 context = alloc_pgtable_page(iommu->node);
791 if (!context)
792 return NULL;
793
794 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
795 phy_addr = virt_to_phys((void *)context);
796 *entry = phy_addr | 1;
797 __iommu_flush_cache(iommu, entry, sizeof(*entry));
798 }
799 return &context[devfn];
800}
801
802static bool attach_deferred(struct device *dev)
803{
804 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
805}
806
807/**
808 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
809 * sub-hierarchy of a candidate PCI-PCI bridge
810 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
811 * @bridge: the candidate PCI-PCI bridge
812 *
813 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
814 */
815static bool
816is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
817{
818 struct pci_dev *pdev, *pbridge;
819
820 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
821 return false;
822
823 pdev = to_pci_dev(dev);
824 pbridge = to_pci_dev(bridge);
825
826 if (pbridge->subordinate &&
827 pbridge->subordinate->number <= pdev->bus->number &&
828 pbridge->subordinate->busn_res.end >= pdev->bus->number)
829 return true;
830
831 return false;
832}
833
834static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
835{
836 struct dmar_drhd_unit *drhd;
837 u32 vtbar;
838 int rc;
839
840 /* We know that this device on this chipset has its own IOMMU.
841 * If we find it under a different IOMMU, then the BIOS is lying
842 * to us. Hope that the IOMMU for this device is actually
843 * disabled, and it needs no translation...
844 */
845 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
846 if (rc) {
847 /* "can't" happen */
848 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
849 return false;
850 }
851 vtbar &= 0xffff0000;
852
853 /* we know that the this iommu should be at offset 0xa000 from vtbar */
854 drhd = dmar_find_matched_drhd_unit(pdev);
855 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
856 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
857 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
858 return true;
859 }
860
861 return false;
862}
863
864static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
865{
866 if (!iommu || iommu->drhd->ignored)
867 return true;
868
869 if (dev_is_pci(dev)) {
870 struct pci_dev *pdev = to_pci_dev(dev);
871
872 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
873 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
874 quirk_ioat_snb_local_iommu(pdev))
875 return true;
876 }
877
878 return false;
879}
880
881struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
882{
883 struct dmar_drhd_unit *drhd = NULL;
884 struct pci_dev *pdev = NULL;
885 struct intel_iommu *iommu;
886 struct device *tmp;
887 u16 segment = 0;
888 int i;
889
890 if (!dev)
891 return NULL;
892
893 if (dev_is_pci(dev)) {
894 struct pci_dev *pf_pdev;
895
896 pdev = pci_real_dma_dev(to_pci_dev(dev));
897
898 /* VFs aren't listed in scope tables; we need to look up
899 * the PF instead to find the IOMMU. */
900 pf_pdev = pci_physfn(pdev);
901 dev = &pf_pdev->dev;
902 segment = pci_domain_nr(pdev->bus);
903 } else if (has_acpi_companion(dev))
904 dev = &ACPI_COMPANION(dev)->dev;
905
906 rcu_read_lock();
907 for_each_iommu(iommu, drhd) {
908 if (pdev && segment != drhd->segment)
909 continue;
910
911 for_each_active_dev_scope(drhd->devices,
912 drhd->devices_cnt, i, tmp) {
913 if (tmp == dev) {
914 /* For a VF use its original BDF# not that of the PF
915 * which we used for the IOMMU lookup. Strictly speaking
916 * we could do this for all PCI devices; we only need to
917 * get the BDF# from the scope table for ACPI matches. */
918 if (pdev && pdev->is_virtfn)
919 goto got_pdev;
920
921 if (bus && devfn) {
922 *bus = drhd->devices[i].bus;
923 *devfn = drhd->devices[i].devfn;
924 }
925 goto out;
926 }
927
928 if (is_downstream_to_pci_bridge(dev, tmp))
929 goto got_pdev;
930 }
931
932 if (pdev && drhd->include_all) {
933 got_pdev:
934 if (bus && devfn) {
935 *bus = pdev->bus->number;
936 *devfn = pdev->devfn;
937 }
938 goto out;
939 }
940 }
941 iommu = NULL;
942 out:
943 if (iommu_is_dummy(iommu, dev))
944 iommu = NULL;
945
946 rcu_read_unlock();
947
948 return iommu;
949}
950
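/* Flush CPU cache for page-table memory if the IOMMU is not coherent. */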
951static void domain_flush_cache(struct dmar_domain *domain,
952 void *addr, int size)
953{
954 if (!domain->iommu_coherency)
955 clflush_cache_range(addr, size);
956}
957
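/* Check whether a present context entry exists for (bus, devfn) on @iommu. */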
958static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
959{
960 struct context_entry *context;
961 int ret = 0;
962 unsigned long flags;
963
964 spin_lock_irqsave(&iommu->lock, flags);
965 context = iommu_context_addr(iommu, bus, devfn, 0);
966 if (context)
967 ret = context_present(context);
968 spin_unlock_irqrestore(&iommu->lock, flags);
969 return ret;
970}
971
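/*
 * Free all context tables referenced from the root table, then the root
 * table itself. In scalable mode each root entry may reference two tables.
 */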
972static void free_context_table(struct intel_iommu *iommu)
973{
974 int i;
975 unsigned long flags;
976 struct context_entry *context;
977
978 spin_lock_irqsave(&iommu->lock, flags);
979 if (!iommu->root_entry)
980 goto out;
982 for (i = 0; i < ROOT_ENTRY_NR; i++) {
983 context = iommu_context_addr(iommu, i, 0, 0);
984 if (context)
985 free_pgtable_page(context);
986
987 if (!sm_supported(iommu))
988 continue;
989
990 context = iommu_context_addr(iommu, i, 0x80, 0);
991 if (context)
992 free_pgtable_page(context);
993
994 }
995 free_pgtable_page(iommu->root_entry);
996 iommu->root_entry = NULL;
997out:
998 spin_unlock_irqrestore(&iommu->lock, flags);
999}
1000
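/*
 * Walk the page table to the PTE mapping @pfn at *target_level, allocating
 * intermediate page-table pages as needed. If *target_level is 0, stop at
 * the first superpage or non-present entry and report the level reached.
 * Returns NULL if @pfn is beyond the domain's reach or allocation fails.
 */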
1001static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
1002 unsigned long pfn, int *target_level)
1003{
1004 struct dma_pte *parent, *pte;
1005 int level = agaw_to_level(domain->agaw);
1006 int offset;
1007
1008 BUG_ON(!domain->pgd);
1009
1010 if (!domain_pfn_supported(domain, pfn))
1011 /* Address beyond IOMMU's addressing capabilities. */
1012 return NULL;
1013
1014 parent = domain->pgd;
1015
1016 while (1) {
1017 void *tmp_page;
1018
1019 offset = pfn_level_offset(pfn, level);
1020 pte = &parent[offset];
1021 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
1022 break;
1023 if (level == *target_level)
1024 break;
1025
1026 if (!dma_pte_present(pte)) {
1027 uint64_t pteval;
1028
1029 tmp_page = alloc_pgtable_page(domain->nid);
1030
1031 if (!tmp_page)
1032 return NULL;
1033
1034 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
1035 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
1036 if (domain_use_first_level(domain)) {
1037 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
1038 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1039 pteval |= DMA_FL_PTE_ACCESS;
1040 }
1041 if (cmpxchg64(&pte->val, 0ULL, pteval))
1042 /* Someone else set it while we were thinking; use theirs. */
1043 free_pgtable_page(tmp_page);
1044 else
1045 domain_flush_cache(domain, pte, sizeof(*pte));
1046 }
1047 if (level == 1)
1048 break;
1049
1050 parent = phys_to_virt(dma_pte_addr(pte));
1051 level--;
1052 }
1053
1054 if (!*target_level)
1055 *target_level = level;
1056
1057 return pte;
1058}
1059
1060/* return the address's pte at a specific level */
1061static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1062 unsigned long pfn,
1063 int level, int *large_page)
1064{
1065 struct dma_pte *parent, *pte;
1066 int total = agaw_to_level(domain->agaw);
1067 int offset;
1068
1069 parent = domain->pgd;
1070 while (level <= total) {
1071 offset = pfn_level_offset(pfn, total);
1072 pte = &parent[offset];
1073 if (level == total)
1074 return pte;
1075
1076 if (!dma_pte_present(pte)) {
1077 *large_page = total;
1078 break;
1079 }
1080
1081 if (dma_pte_superpage(pte)) {
1082 *large_page = total;
1083 return pte;
1084 }
1085
1086 parent = phys_to_virt(dma_pte_addr(pte));
1087 total--;
1088 }
1089 return NULL;
1090}
1091
1092/* clear last level pte; a tlb flush should follow */
1093static void dma_pte_clear_range(struct dmar_domain *domain,
1094 unsigned long start_pfn,
1095 unsigned long last_pfn)
1096{
1097 unsigned int large_page;
1098 struct dma_pte *first_pte, *pte;
1099
1100 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1101 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1102 BUG_ON(start_pfn > last_pfn);
1103
1104 /* we don't need lock here; nobody else touches the iova range */
1105 do {
1106 large_page = 1;
1107 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1108 if (!pte) {
1109 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1110 continue;
1111 }
1112 do {
1113 dma_clear_pte(pte);
1114 start_pfn += lvl_to_nr_pages(large_page);
1115 pte++;
1116 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1117
1118 domain_flush_cache(domain, first_pte,
1119 (void *)pte - (void *)first_pte);
1120
1121 } while (start_pfn && start_pfn <= last_pfn);
1122}
1123
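/*
 * Helper for dma_pte_free_pagetable(): recursively free page-table pages
 * below @retain_level that are entirely covered by [start_pfn, last_pfn].
 */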
1124static void dma_pte_free_level(struct dmar_domain *domain, int level,
1125 int retain_level, struct dma_pte *pte,
1126 unsigned long pfn, unsigned long start_pfn,
1127 unsigned long last_pfn)
1128{
1129 pfn = max(start_pfn, pfn);
1130 pte = &pte[pfn_level_offset(pfn, level)];
1131
1132 do {
1133 unsigned long level_pfn;
1134 struct dma_pte *level_pte;
1135
1136 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1137 goto next;
1138
1139 level_pfn = pfn & level_mask(level);
1140 level_pte = phys_to_virt(dma_pte_addr(pte));
1141
1142 if (level > 2) {
1143 dma_pte_free_level(domain, level - 1, retain_level,
1144 level_pte, level_pfn, start_pfn,
1145 last_pfn);
1146 }
1147
1148 /*
1149 * Free the page table if we're below the level we want to
1150 * retain and the range covers the entire table.
1151 */
1152 if (level < retain_level && !(start_pfn > level_pfn ||
1153 last_pfn < level_pfn + level_size(level) - 1)) {
1154 dma_clear_pte(pte);
1155 domain_flush_cache(domain, pte, sizeof(*pte));
1156 free_pgtable_page(level_pte);
1157 }
1158next:
1159 pfn += level_size(level);
1160 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161}
1162
1163/*
1164 * clear last level (leaf) ptes and free page table pages below the
1165 * level we wish to keep intact.
1166 */
1167static void dma_pte_free_pagetable(struct dmar_domain *domain,
1168 unsigned long start_pfn,
1169 unsigned long last_pfn,
1170 int retain_level)
1171{
1172 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1173 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1174 BUG_ON(start_pfn > last_pfn);
1175
1176 dma_pte_clear_range(domain, start_pfn, last_pfn);
1177
1178 /* We don't need lock here; nobody else touches the iova range */
1179 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1180 domain->pgd, 0, start_pfn, last_pfn);
1181
1182 /* free pgd */
1183 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1184 free_pgtable_page(domain->pgd);
1185 domain->pgd = NULL;
1186 }
1187}
1188
1189/* When a page at a given level is being unlinked from its parent, we don't
1190 need to *modify* it at all. All we need to do is make a list of all the
1191 pages which can be freed just as soon as we've flushed the IOTLB and we
1192 know the hardware page-walk will no longer touch them.
1193 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1194 be freed. */
1195static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1196 int level, struct dma_pte *pte,
1197 struct page *freelist)
1198{
1199 struct page *pg;
1200
1201 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1202 pg->freelist = freelist;
1203 freelist = pg;
1204
1205 if (level == 1)
1206 return freelist;
1207
1208 pte = page_address(pg);
1209 do {
1210 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1211 freelist = dma_pte_list_pagetables(domain, level - 1,
1212 pte, freelist);
1213 pte++;
1214 } while (!first_pte_in_page(pte));
1215
1216 return freelist;
1217}
1218
1219static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1220 struct dma_pte *pte, unsigned long pfn,
1221 unsigned long start_pfn,
1222 unsigned long last_pfn,
1223 struct page *freelist)
1224{
1225 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1226
1227 pfn = max(start_pfn, pfn);
1228 pte = &pte[pfn_level_offset(pfn, level)];
1229
1230 do {
1231 unsigned long level_pfn;
1232
1233 if (!dma_pte_present(pte))
1234 goto next;
1235
1236 level_pfn = pfn & level_mask(level);
1237
1238 /* If range covers entire pagetable, free it */
1239 if (start_pfn <= level_pfn &&
1240 last_pfn >= level_pfn + level_size(level) - 1) {
1241 /* These subordinate page tables are going away entirely. Don't
1242 bother to clear them; we're just going to *free* them. */
1243 if (level > 1 && !dma_pte_superpage(pte))
1244 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1245
1246 dma_clear_pte(pte);
1247 if (!first_pte)
1248 first_pte = pte;
1249 last_pte = pte;
1250 } else if (level > 1) {
1251 /* Recurse down into a level that isn't *entirely* obsolete */
1252 freelist = dma_pte_clear_level(domain, level - 1,
1253 phys_to_virt(dma_pte_addr(pte)),
1254 level_pfn, start_pfn, last_pfn,
1255 freelist);
1256 }
1257next:
1258 pfn += level_size(level);
1259 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1260
1261 if (first_pte)
1262 domain_flush_cache(domain, first_pte,
1263 (void *)++last_pte - (void *)first_pte);
1264
1265 return freelist;
1266}
1267
1268/* We can't just free the pages because the IOMMU may still be walking
1269 the page tables, and may have cached the intermediate levels. The
1270 pages can only be freed after the IOTLB flush has been done. */
1271static struct page *domain_unmap(struct dmar_domain *domain,
1272 unsigned long start_pfn,
1273 unsigned long last_pfn,
1274 struct page *freelist)
1275{
1276 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1277 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1278 BUG_ON(start_pfn > last_pfn);
1279
1280 /* we don't need lock here; nobody else touches the iova range */
1281 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1282 domain->pgd, 0, start_pfn, last_pfn,
1283 freelist);
1284
1285 /* free pgd */
1286 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1287 struct page *pgd_page = virt_to_page(domain->pgd);
1288 pgd_page->freelist = freelist;
1289 freelist = pgd_page;
1290
1291 domain->pgd = NULL;
1292 }
1293
1294 return freelist;
1295}
1296
1297static void dma_free_pagelist(struct page *freelist)
1298{
1299 struct page *pg;
1300
1301 while ((pg = freelist)) {
1302 freelist = pg->freelist;
1303 free_pgtable_page(page_address(pg));
1304 }
1305}
1306
1307/* iommu handling */
1308static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1309{
1310 struct root_entry *root;
1311 unsigned long flags;
1312
1313 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1314 if (!root) {
1315 pr_err("Allocating root entry for %s failed\n",
1316 iommu->name);
1317 return -ENOMEM;
1318 }
1319
1320 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1321
1322 spin_lock_irqsave(&iommu->lock, flags);
1323 iommu->root_entry = root;
1324 spin_unlock_irqrestore(&iommu->lock, flags);
1325
1326 return 0;
1327}
1328
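/*
 * Program the root table address (with the scalable-mode flag if supported),
 * issue a Set Root Table Pointer operation, then globally invalidate the
 * context cache, the PASID cache (in scalable mode) and the IOTLB.
 */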
1329static void iommu_set_root_entry(struct intel_iommu *iommu)
1330{
1331 u64 addr;
1332 u32 sts;
1333 unsigned long flag;
1334
1335 addr = virt_to_phys(iommu->root_entry);
1336 if (sm_supported(iommu))
1337 addr |= DMA_RTADDR_SMT;
1338
1339 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1340 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1341
1342 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1343
1344 /* Make sure hardware complete it */
1345 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1346 readl, (sts & DMA_GSTS_RTPS), sts);
1347
1348 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1349
1350 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1351 if (sm_supported(iommu))
1352 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1353 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1354}
1355
1356void iommu_flush_write_buffer(struct intel_iommu *iommu)
1357{
1358 u32 val;
1359 unsigned long flag;
1360
1361 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1362 return;
1363
1364 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1365 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1366
1367 /* Make sure hardware complete it */
1368 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1369 readl, (!(val & DMA_GSTS_WBFS)), val);
1370
1371 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1372}
1373
1374/* return value determines if we need a write buffer flush */
1375static void __iommu_flush_context(struct intel_iommu *iommu,
1376 u16 did, u16 source_id, u8 function_mask,
1377 u64 type)
1378{
1379 u64 val = 0;
1380 unsigned long flag;
1381
1382 switch (type) {
1383 case DMA_CCMD_GLOBAL_INVL:
1384 val = DMA_CCMD_GLOBAL_INVL;
1385 break;
1386 case DMA_CCMD_DOMAIN_INVL:
1387 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1388 break;
1389 case DMA_CCMD_DEVICE_INVL:
1390 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1391 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1392 break;
1393 default:
1394 BUG();
1395 }
1396 val |= DMA_CCMD_ICC;
1397
1398 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1399 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1400
1401 /* Make sure hardware complete it */
1402 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1403 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1404
1405 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1406}
1407
1408/* return value determines if we need a write buffer flush */
1409static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1410 u64 addr, unsigned int size_order, u64 type)
1411{
1412 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1413 u64 val = 0, val_iva = 0;
1414 unsigned long flag;
1415
1416 switch (type) {
1417 case DMA_TLB_GLOBAL_FLUSH:
1418 /* global flush doesn't need to set IVA_REG */
1419 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1420 break;
1421 case DMA_TLB_DSI_FLUSH:
1422 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1423 break;
1424 case DMA_TLB_PSI_FLUSH:
1425 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1426 /* IH bit is passed in as part of address */
1427 val_iva = size_order | addr;
1428 break;
1429 default:
1430 BUG();
1431 }
1432 /* Note: set drain read/write */
1433#if 0
1434 /*
1435 * This is probably meant to be extra safe. Looks like we can
1436 * ignore it without any impact.
1437 */
1438 if (cap_read_drain(iommu->cap))
1439 val |= DMA_TLB_READ_DRAIN;
1440#endif
1441 if (cap_write_drain(iommu->cap))
1442 val |= DMA_TLB_WRITE_DRAIN;
1443
1444 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1445 /* Note: Only uses first TLB reg currently */
1446 if (val_iva)
1447 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1448 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1449
1450 /* Make sure hardware complete it */
1451 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1452 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1453
1454 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1455
1456 /* check IOTLB invalidation granularity */
1457 if (DMA_TLB_IAIG(val) == 0)
1458 pr_err("Flush IOTLB failed\n");
1459 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1460 pr_debug("TLB flush request %Lx, actual %Lx\n",
1461 (unsigned long long)DMA_TLB_IIRG(type),
1462 (unsigned long long)DMA_TLB_IAIG(val));
1463}
1464
1465static struct device_domain_info *
1466iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1467 u8 bus, u8 devfn)
1468{
1469 struct device_domain_info *info;
1470
1471 assert_spin_locked(&device_domain_lock);
1472
1473 if (!iommu->qi)
1474 return NULL;
1475
1476 list_for_each_entry(info, &domain->devices, link)
1477 if (info->iommu == iommu && info->bus == bus &&
1478 info->devfn == devfn) {
1479 if (info->ats_supported && info->dev)
1480 return info;
1481 break;
1482 }
1483
1484 return NULL;
1485}
1486
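/*
 * Recompute domain->has_iotlb_device: true if any device (or subdevice)
 * attached to the domain has ATS enabled.
 */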
1487static void domain_update_iotlb(struct dmar_domain *domain)
1488{
1489 struct device_domain_info *info;
1490 bool has_iotlb_device = false;
1491
1492 assert_spin_locked(&device_domain_lock);
1493
1494 list_for_each_entry(info, &domain->devices, link)
1495 if (info->ats_enabled) {
1496 has_iotlb_device = true;
1497 break;
1498 }
1499
1500 if (!has_iotlb_device) {
1501 struct subdev_domain_info *sinfo;
1502
1503 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1504 info = get_domain_info(sinfo->pdev);
1505 if (info && info->ats_enabled) {
1506 has_iotlb_device = true;
1507 break;
1508 }
1509 }
1510 }
1511
1512 domain->has_iotlb_device = has_iotlb_device;
1513}
1514
1515static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1516{
1517 struct pci_dev *pdev;
1518
1519 assert_spin_locked(&device_domain_lock);
1520
1521 if (!info || !dev_is_pci(info->dev))
1522 return;
1523
1524 pdev = to_pci_dev(info->dev);
1525 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1526 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1527 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1528 * reserved, which should be set to 0.
1529 */
1530 if (!ecap_dit(info->iommu->ecap))
1531 info->pfsid = 0;
1532 else {
1533 struct pci_dev *pf_pdev;
1534
1535 /* pdev will be returned if device is not a vf */
1536 pf_pdev = pci_physfn(pdev);
1537 info->pfsid = pci_dev_id(pf_pdev);
1538 }
1539
1540#ifdef CONFIG_INTEL_IOMMU_SVM
1541 /* The PCIe spec, in its wisdom, declares that the behaviour of
1542 the device is undefined if you enable PASID support after ATS
1543 support. So always enable PASID support on devices which
1544 have it, even if we can't yet know if we're ever going to
1545 use it. */
1546 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1547 info->pasid_enabled = 1;
1548
1549 if (info->pri_supported &&
1550 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1551 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1552 info->pri_enabled = 1;
1553#endif
1554 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1555 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1556 info->ats_enabled = 1;
1557 domain_update_iotlb(info->domain);
1558 info->ats_qdep = pci_ats_queue_depth(pdev);
1559 }
1560}
1561
1562static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1563{
1564 struct pci_dev *pdev;
1565
1566 assert_spin_locked(&device_domain_lock);
1567
1568 if (!dev_is_pci(info->dev))
1569 return;
1570
1571 pdev = to_pci_dev(info->dev);
1572
1573 if (info->ats_enabled) {
1574 pci_disable_ats(pdev);
1575 info->ats_enabled = 0;
1576 domain_update_iotlb(info->domain);
1577 }
1578#ifdef CONFIG_INTEL_IOMMU_SVM
1579 if (info->pri_enabled) {
1580 pci_disable_pri(pdev);
1581 info->pri_enabled = 0;
1582 }
1583 if (info->pasid_enabled) {
1584 pci_disable_pasid(pdev);
1585 info->pasid_enabled = 0;
1586 }
1587#endif
1588}
1589
1590static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1591 u64 addr, unsigned int mask)
1592{
1593 u16 sid, qdep;
1594
1595 if (!info || !info->ats_enabled)
1596 return;
1597
1598 sid = info->bus << 8 | info->devfn;
1599 qdep = info->ats_qdep;
1600 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1601 qdep, addr, mask);
1602}
1603
1604static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1605 u64 addr, unsigned mask)
1606{
1607 unsigned long flags;
1608 struct device_domain_info *info;
1609 struct subdev_domain_info *sinfo;
1610
1611 if (!domain->has_iotlb_device)
1612 return;
1613
1614 spin_lock_irqsave(&device_domain_lock, flags);
1615 list_for_each_entry(info, &domain->devices, link)
1616 __iommu_flush_dev_iotlb(info, addr, mask);
1617
1618 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
1619 info = get_domain_info(sinfo->pdev);
1620 __iommu_flush_dev_iotlb(info, addr, mask);
1621 }
1622 spin_unlock_irqrestore(&device_domain_lock, flags);
1623}
1624
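/*
 * Flush PASID-granular IOTLB entries for a first-level domain: the default
 * PASID (if set) and, when devices are attached, the RID2PASID entry used
 * for requests without PASID.
 */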
1625static void domain_flush_piotlb(struct intel_iommu *iommu,
1626 struct dmar_domain *domain,
1627 u64 addr, unsigned long npages, bool ih)
1628{
1629 u16 did = domain->iommu_did[iommu->seq_id];
1630
1631 if (domain->default_pasid)
1632 qi_flush_piotlb(iommu, did, domain->default_pasid,
1633 addr, npages, ih);
1634
1635 if (!list_empty(&domain->devices))
1636 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1637}
1638
1639static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1640 struct dmar_domain *domain,
1641 unsigned long pfn, unsigned int pages,
1642 int ih, int map)
1643{
1644 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1645 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1646 u16 did = domain->iommu_did[iommu->seq_id];
1647
1648 BUG_ON(pages == 0);
1649
1650 if (ih)
1651 ih = 1 << 6;
1652
1653 if (domain_use_first_level(domain)) {
1654 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1655 } else {
1656 /*
1657 * Fallback to domain selective flush if no PSI support or
1658 * the size is too big. PSI requires page size to be 2 ^ x,
1659 * and the base address is naturally aligned to the size.
1660 */
1661 if (!cap_pgsel_inv(iommu->cap) ||
1662 mask > cap_max_amask_val(iommu->cap))
1663 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1664 DMA_TLB_DSI_FLUSH);
1665 else
1666 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1667 DMA_TLB_PSI_FLUSH);
1668 }
1669
1670 /*
1671 * In caching mode, changes of pages from non-present to present require
1672 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1673 */
1674 if (!cap_caching_mode(iommu->cap) || !map)
1675 iommu_flush_dev_iotlb(domain, addr, mask);
1676}
1677
1678/* Notification for newly created mappings */
1679static inline void __mapping_notify_one(struct intel_iommu *iommu,
1680 struct dmar_domain *domain,
1681 unsigned long pfn, unsigned int pages)
1682{
1683 /*
1684 * It's a non-present to present mapping. Only flush if caching mode
1685 * and second level.
1686 */
1687 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1688 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1689 else
1690 iommu_flush_write_buffer(iommu);
1691}
1692
1693static void intel_flush_iotlb_all(struct iommu_domain *domain)
1694{
1695 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1696 int idx;
1697
1698 for_each_domain_iommu(idx, dmar_domain) {
1699 struct intel_iommu *iommu = g_iommus[idx];
1700 u16 did = dmar_domain->iommu_did[iommu->seq_id];
1701
1702 if (domain_use_first_level(dmar_domain))
1703 domain_flush_piotlb(iommu, dmar_domain, 0, -1, 0);
1704 else
1705 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1706 DMA_TLB_DSI_FLUSH);
1707
1708 if (!cap_caching_mode(iommu->cap))
1709 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1710 0, MAX_AGAW_PFN_WIDTH);
1711 }
1712}
1713
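/*
 * Disable the protected low/high memory regions, if implemented, so the
 * hardware no longer blocks DMA to those ranges.
 */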
1714static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1715{
1716 u32 pmen;
1717 unsigned long flags;
1718
1719 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1720 return;
1721
1722 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1723 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1724 pmen &= ~DMA_PMEN_EPM;
1725 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1726
1727 /* wait for the protected region status bit to clear */
1728 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1729 readl, !(pmen & DMA_PMEN_PRS), pmen);
1730
1731 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1732}
1733
1734static void iommu_enable_translation(struct intel_iommu *iommu)
1735{
1736 u32 sts;
1737 unsigned long flags;
1738
1739 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1740 iommu->gcmd |= DMA_GCMD_TE;
1741 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1742
1743 /* Make sure hardware complete it */
1744 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1745 readl, (sts & DMA_GSTS_TES), sts);
1746
1747 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1748}
1749
1750static void iommu_disable_translation(struct intel_iommu *iommu)
1751{
1752 u32 sts;
1753 unsigned long flag;
1754
1755 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1756 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1757 return;
1758
1759 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1760 iommu->gcmd &= ~DMA_GCMD_TE;
1761 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1762
1763 /* Make sure hardware complete it */
1764 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1765 readl, (!(sts & DMA_GSTS_TES)), sts);
1766
1767 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1768}
1769
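/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level domain lookup
 * array, and reserve the ids that must never be handed out (domain-id 0,
 * plus FLPT_DEFAULT_DID in scalable mode).
 */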
1770static int iommu_init_domains(struct intel_iommu *iommu)
1771{
1772 u32 ndomains, nlongs;
1773 size_t size;
1774
1775 ndomains = cap_ndoms(iommu->cap);
1776 pr_debug("%s: Number of Domains supported <%d>\n",
1777 iommu->name, ndomains);
1778 nlongs = BITS_TO_LONGS(ndomains);
1779
1780 spin_lock_init(&iommu->lock);
1781
1782 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1783 if (!iommu->domain_ids) {
1784 pr_err("%s: Allocating domain id array failed\n",
1785 iommu->name);
1786 return -ENOMEM;
1787 }
1788
1789 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1790 iommu->domains = kzalloc(size, GFP_KERNEL);
1791
1792 if (iommu->domains) {
1793 size = 256 * sizeof(struct dmar_domain *);
1794 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1795 }
1796
1797 if (!iommu->domains || !iommu->domains[0]) {
1798 pr_err("%s: Allocating domain array failed\n",
1799 iommu->name);
1800 kfree(iommu->domain_ids);
1801 kfree(iommu->domains);
1802 iommu->domain_ids = NULL;
1803 iommu->domains = NULL;
1804 return -ENOMEM;
1805 }
1806
1807 /*
1808 * If Caching mode is set, then invalid translations are tagged
1809 * with domain-id 0, hence we need to pre-allocate it. We also
1810 * use domain-id 0 as a marker for non-allocated domain-id, so
1811 * make sure it is not used for a real domain.
1812 */
1813 set_bit(0, iommu->domain_ids);
1814
1815 /*
1816 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1817 * entry for first-level or pass-through translation modes should
1818 * be programmed with a domain id different from those used for
1819 * second-level or nested translation. We reserve a domain id for
1820 * this purpose.
1821 */
1822 if (sm_supported(iommu))
1823 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1824
1825 return 0;
1826}
1827
1828static void disable_dmar_iommu(struct intel_iommu *iommu)
1829{
1830 struct device_domain_info *info, *tmp;
1831 unsigned long flags;
1832
1833 if (!iommu->domains || !iommu->domain_ids)
1834 return;
1835
1836 spin_lock_irqsave(&device_domain_lock, flags);
1837 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1838 if (info->iommu != iommu)
1839 continue;
1840
1841 if (!info->dev || !info->domain)
1842 continue;
1843
1844 __dmar_remove_one_dev_info(info);
1845 }
1846 spin_unlock_irqrestore(&device_domain_lock, flags);
1847
1848 if (iommu->gcmd & DMA_GCMD_TE)
1849 iommu_disable_translation(iommu);
1850}
1851
1852static void free_dmar_iommu(struct intel_iommu *iommu)
1853{
1854 if ((iommu->domains) && (iommu->domain_ids)) {
1855 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1856 int i;
1857
1858 for (i = 0; i < elems; i++)
1859 kfree(iommu->domains[i]);
1860 kfree(iommu->domains);
1861 kfree(iommu->domain_ids);
1862 iommu->domains = NULL;
1863 iommu->domain_ids = NULL;
1864 }
1865
1866 g_iommus[iommu->seq_id] = NULL;
1867
1868 /* free context mapping */
1869 free_context_table(iommu);
1870
1871#ifdef CONFIG_INTEL_IOMMU_SVM
1872 if (pasid_supported(iommu)) {
1873 if (ecap_prs(iommu->ecap))
1874 intel_svm_finish_prq(iommu);
1875 }
1876 if (vccap_pasid(iommu->vccap))
1877 ioasid_unregister_allocator(&iommu->pasid_allocator);
1878
1879#endif
1880}
1881
1882/*
1883 * Check and return whether first level is used by default for
1884 * DMA translation.
1885 */
1886static bool first_level_by_default(void)
1887{
1888 return scalable_mode_support() && intel_cap_flts_sanity();
1889}
1890
1891static struct dmar_domain *alloc_domain(int flags)
1892{
1893 struct dmar_domain *domain;
1894
1895 domain = alloc_domain_mem();
1896 if (!domain)
1897 return NULL;
1898
1899 memset(domain, 0, sizeof(*domain));
1900 domain->nid = NUMA_NO_NODE;
1901 domain->flags = flags;
1902 if (first_level_by_default())
1903 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1904 domain->has_iotlb_device = false;
1905 INIT_LIST_HEAD(&domain->devices);
1906 INIT_LIST_HEAD(&domain->subdevices);
1907
1908 return domain;
1909}
1910
1911/* Must be called with iommu->lock */
1912static int domain_attach_iommu(struct dmar_domain *domain,
1913 struct intel_iommu *iommu)
1914{
1915 unsigned long ndomains;
1916 int num;
1917
1918 assert_spin_locked(&device_domain_lock);
1919 assert_spin_locked(&iommu->lock);
1920
1921 domain->iommu_refcnt[iommu->seq_id] += 1;
1922 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1923 ndomains = cap_ndoms(iommu->cap);
1924 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1925
1926 if (num >= ndomains) {
1927 pr_err("%s: No free domain ids\n", iommu->name);
1928 domain->iommu_refcnt[iommu->seq_id] -= 1;
1929 return -ENOSPC;
1930 }
1931
1932 set_bit(num, iommu->domain_ids);
1933 set_iommu_domain(iommu, num, domain);
1934
1935 domain->iommu_did[iommu->seq_id] = num;
1936 domain->nid = iommu->node;
1937
1938 domain_update_iommu_cap(domain);
1939 }
1940
1941 return 0;
1942}
1943
1944static void domain_detach_iommu(struct dmar_domain *domain,
1945 struct intel_iommu *iommu)
1946{
1947 int num;
1948
1949 assert_spin_locked(&device_domain_lock);
1950 assert_spin_locked(&iommu->lock);
1951
1952 domain->iommu_refcnt[iommu->seq_id] -= 1;
1953 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1954 num = domain->iommu_did[iommu->seq_id];
1955 clear_bit(num, iommu->domain_ids);
1956 set_iommu_domain(iommu, num, NULL);
1957
1958 domain_update_iommu_cap(domain);
1959 domain->iommu_did[iommu->seq_id] = 0;
1960 }
1961}
1962
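/*
 * Round a guest address width up to the next adjusted guest address width
 * supported by the page-table layout (12 plus a multiple of 9 bits), capped
 * at 64. For example, 48 stays 48, while 50 rounds up to 57.
 */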
1963static inline int guestwidth_to_adjustwidth(int gaw)
1964{
1965 int agaw;
1966 int r = (gaw - 12) % 9;
1967
1968 if (r == 0)
1969 agaw = gaw;
1970 else
1971 agaw = gaw + 9 - r;
1972 if (agaw > 64)
1973 agaw = 64;
1974 return agaw;
1975}
1976
1977static void domain_exit(struct dmar_domain *domain)
1978{
1979
1980 /* Remove associated devices and clear attached or cached domains */
1981 domain_remove_dev_info(domain);
1982
1983 /* destroy iovas */
1984 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1985 iommu_put_dma_cookie(&domain->domain);
1986
1987 if (domain->pgd) {
1988 struct page *freelist;
1989
1990 freelist = domain_unmap(domain, 0,
1991 DOMAIN_MAX_PFN(domain->gaw), NULL);
1992 dma_free_pagelist(freelist);
1993 }
1994
1995 free_domain_mem(domain);
1996}
1997
1998/*
1999 * Get the PASID directory size for scalable mode context entry.
2000 * Value of X in the PDTS field of a scalable mode context entry
2001 * indicates PASID directory with 2^(X + 7) entries.
2002 */
2003static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2004{
2005 int pds, max_pde;
2006
2007 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2008 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2009 if (pds < 7)
2010 return 0;
2011
2012 return pds - 7;
2013}
2014
2015/*
2016 * Set the RID_PASID field of a scalable mode context entry. The
2017 * IOMMU hardware will use the PASID value set in this field for
2018 * DMA translations of DMA requests without PASID.
2019 */
2020static inline void
2021context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2022{
2023 context->hi |= pasid & ((1 << 20) - 1);
2024}
2025
2026/*
2027 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2028 * entry.
2029 */
2030static inline void context_set_sm_dte(struct context_entry *context)
2031{
2032 context->lo |= (1 << 2);
2033}
2034
2035/*
2036 * Set the PRE(Page Request Enable) field of a scalable mode context
2037 * entry.
2038 */
2039static inline void context_set_sm_pre(struct context_entry *context)
2040{
2041 context->lo |= (1 << 4);
2042}
2043
2044/* Convert value to context PASID directory size field coding. */
2045#define context_pdts(pds) (((pds) & 0x7) << 9)
2046
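/*
 * Install the context entry for (bus, devfn) on @iommu so the device is
 * translated by @domain. In scalable mode the entry points to the PASID
 * directory in @table; in legacy mode it points to the domain's page table
 * or is configured for pass-through.
 */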
2047static int domain_context_mapping_one(struct dmar_domain *domain,
2048 struct intel_iommu *iommu,
2049 struct pasid_table *table,
2050 u8 bus, u8 devfn)
2051{
2052 u16 did = domain->iommu_did[iommu->seq_id];
2053 int translation = CONTEXT_TT_MULTI_LEVEL;
2054 struct device_domain_info *info = NULL;
2055 struct context_entry *context;
2056 unsigned long flags;
2057 int ret;
2058
2059 WARN_ON(did == 0);
2060
2061 if (hw_pass_through && domain_type_is_si(domain))
2062 translation = CONTEXT_TT_PASS_THROUGH;
2063
2064 pr_debug("Set context mapping for %02x:%02x.%d\n",
2065 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2066
2067 BUG_ON(!domain->pgd);
2068
2069 spin_lock_irqsave(&device_domain_lock, flags);
2070 spin_lock(&iommu->lock);
2071
2072 ret = -ENOMEM;
2073 context = iommu_context_addr(iommu, bus, devfn, 1);
2074 if (!context)
2075 goto out_unlock;
2076
2077 ret = 0;
2078 if (context_present(context))
2079 goto out_unlock;
2080
2081 /*
2082 * For kdump cases, old valid entries may be cached due to the
2083 * in-flight DMA and copied pgtable, but there is no unmapping
2084 * behaviour for them, thus we need an explicit cache flush for
2085 * the newly-mapped device. For kdump, at this point, the device
2086 * is supposed to finish reset at its driver probe stage, so no
2087 * in-flight DMA will exist, and we don't need to worry about it
2088 * hereafter.
2089 */
2090 if (context_copied(context)) {
2091 u16 did_old = context_domain_id(context);
2092
2093 if (did_old < cap_ndoms(iommu->cap)) {
2094 iommu->flush.flush_context(iommu, did_old,
2095 (((u16)bus) << 8) | devfn,
2096 DMA_CCMD_MASK_NOBIT,
2097 DMA_CCMD_DEVICE_INVL);
2098 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2099 DMA_TLB_DSI_FLUSH);
2100 }
2101 }
2102
2103 context_clear_entry(context);
2104
2105 if (sm_supported(iommu)) {
2106 unsigned long pds;
2107
2108 WARN_ON(!table);
2109
2110 /* Setup the PASID DIR pointer: */
2111 pds = context_get_sm_pds(table);
2112 context->lo = (u64)virt_to_phys(table->table) |
2113 context_pdts(pds);
2114
2115 /* Setup the RID_PASID field: */
2116 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2117
2118 /*
2119 * Setup the Device-TLB enable bit and Page request
2120 * Enable bit:
2121 */
2122 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2123 if (info && info->ats_supported)
2124 context_set_sm_dte(context);
2125 if (info && info->pri_supported)
2126 context_set_sm_pre(context);
2127 } else {
2128 struct dma_pte *pgd = domain->pgd;
2129 int agaw;
2130
2131 context_set_domain_id(context, did);
2132
2133 if (translation != CONTEXT_TT_PASS_THROUGH) {
2134 /*
2135 * Skip top levels of page tables for iommu which has
2136 * less agaw than default. Unnecessary for PT mode.
2137 */
2138 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2139 ret = -ENOMEM;
2140 pgd = phys_to_virt(dma_pte_addr(pgd));
2141 if (!dma_pte_present(pgd))
2142 goto out_unlock;
2143 }
2144
2145 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2146 if (info && info->ats_supported)
2147 translation = CONTEXT_TT_DEV_IOTLB;
2148 else
2149 translation = CONTEXT_TT_MULTI_LEVEL;
2150
2151 context_set_address_root(context, virt_to_phys(pgd));
2152 context_set_address_width(context, agaw);
2153 } else {
2154 /*
2155 * In pass through mode, AW must be programmed to
2156 * indicate the largest AGAW value supported by
2157 * hardware. And ASR is ignored by hardware.
2158 */
2159 context_set_address_width(context, iommu->msagaw);
2160 }
2161
2162 context_set_translation_type(context, translation);
2163 }
2164
2165 context_set_fault_enable(context);
2166 context_set_present(context);
2167 if (!ecap_coherent(iommu->ecap))
2168 clflush_cache_range(context, sizeof(*context));
2169
2170 /*
2171 * It's a non-present to present mapping. If hardware doesn't cache
2172 * non-present entries we only need to flush the write-buffer. If it
2173 * _does_ cache non-present entries, then it does so in the special
2174 * domain #0, which we have to flush:
2175 */
2176 if (cap_caching_mode(iommu->cap)) {
2177 iommu->flush.flush_context(iommu, 0,
2178 (((u16)bus) << 8) | devfn,
2179 DMA_CCMD_MASK_NOBIT,
2180 DMA_CCMD_DEVICE_INVL);
2181 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2182 } else {
2183 iommu_flush_write_buffer(iommu);
2184 }
2185 iommu_enable_dev_iotlb(info);
2186
2187 ret = 0;
2188
2189out_unlock:
2190 spin_unlock(&iommu->lock);
2191 spin_unlock_irqrestore(&device_domain_lock, flags);
2192
2193 return ret;
2194}
2195
2196struct domain_context_mapping_data {
2197 struct dmar_domain *domain;
2198 struct intel_iommu *iommu;
2199 struct pasid_table *table;
2200};
2201
2202static int domain_context_mapping_cb(struct pci_dev *pdev,
2203 u16 alias, void *opaque)
2204{
2205 struct domain_context_mapping_data *data = opaque;
2206
2207 return domain_context_mapping_one(data->domain, data->iommu,
2208 data->table, PCI_BUS_NUM(alias),
2209 alias & 0xff);
2210}
2211
2212static int
2213domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2214{
2215 struct domain_context_mapping_data data;
2216 struct pasid_table *table;
2217 struct intel_iommu *iommu;
2218 u8 bus, devfn;
2219
2220 iommu = device_to_iommu(dev, &bus, &devfn);
2221 if (!iommu)
2222 return -ENODEV;
2223
2224 table = intel_pasid_get_table(dev);
2225
2226 if (!dev_is_pci(dev))
2227 return domain_context_mapping_one(domain, iommu, table,
2228 bus, devfn);
2229
2230 data.domain = domain;
2231 data.iommu = iommu;
2232 data.table = table;
2233
2234 return pci_for_each_dma_alias(to_pci_dev(dev),
2235 &domain_context_mapping_cb, &data);
2236}
2237
2238static int domain_context_mapped_cb(struct pci_dev *pdev,
2239 u16 alias, void *opaque)
2240{
2241 struct intel_iommu *iommu = opaque;
2242
2243 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2244}
2245
2246static int domain_context_mapped(struct device *dev)
2247{
2248 struct intel_iommu *iommu;
2249 u8 bus, devfn;
2250
2251 iommu = device_to_iommu(dev, &bus, &devfn);
2252 if (!iommu)
2253 return -ENODEV;
2254
2255 if (!dev_is_pci(dev))
2256 return device_context_mapped(iommu, bus, devfn);
2257
2258 return !pci_for_each_dma_alias(to_pci_dev(dev),
2259 domain_context_mapped_cb, iommu);
2260}
2261
2262/* Returns the number of VT-d pages, but aligned to the MM page size */
2263static inline unsigned long aligned_nrpages(unsigned long host_addr,
2264 size_t size)
2265{
2266 host_addr &= ~PAGE_MASK;
2267 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2268}
2269
2270/* Return largest possible superpage level for a given mapping */
2271static inline int hardware_largepage_caps(struct dmar_domain *domain,
2272 unsigned long iov_pfn,
2273 unsigned long phy_pfn,
2274 unsigned long pages)
2275{
2276 int support, level = 1;
2277 unsigned long pfnmerge;
2278
2279 support = domain->iommu_superpage;
2280
2281 /* To use a large page, the virtual *and* physical addresses
2282 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2283 of them will mean we have to use smaller pages. So just
2284 merge them and check both at once. */
2285 pfnmerge = iov_pfn | phy_pfn;
2286
2287 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2288 pages >>= VTD_STRIDE_SHIFT;
2289 if (!pages)
2290 break;
2291 pfnmerge >>= VTD_STRIDE_SHIFT;
2292 level++;
2293 support--;
2294 }
2295 return level;
2296}
2297
2298/*
2299 * Ensure that old small page tables are removed to make room for superpage(s).
2300 * We're going to add new large pages, so make sure we don't remove their parent
2301 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2302 */
2303static void switch_to_super_page(struct dmar_domain *domain,
2304 unsigned long start_pfn,
2305 unsigned long end_pfn, int level)
2306{
2307 unsigned long lvl_pages = lvl_to_nr_pages(level);
2308 struct dma_pte *pte = NULL;
2309 int i;
2310
2311 while (start_pfn <= end_pfn) {
2312 if (!pte)
2313 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2314
2315 if (dma_pte_present(pte)) {
2316 dma_pte_free_pagetable(domain, start_pfn,
2317 start_pfn + lvl_pages - 1,
2318 level + 1);
2319
2320 for_each_domain_iommu(i, domain)
2321 iommu_flush_iotlb_psi(g_iommus[i], domain,
2322 start_pfn, lvl_pages,
2323 0, 0);
2324 }
2325
2326 pte++;
2327 start_pfn += lvl_pages;
2328 if (first_pte_in_page(pte))
2329 pte = NULL;
2330 }
2331}
2332
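/*
 * Map @nr_pages pages starting at @iov_pfn to the physical range starting
 * at @phys_pfn with the given protection bits, using superpages whenever
 * alignment, remaining size and hardware support allow. Leaf PTE cache
 * flushing is left to the iotlb_sync_map() callback.
 */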
2333static int
2334__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2335 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2336{
2337 unsigned int largepage_lvl = 0;
2338 unsigned long lvl_pages = 0;
2339 struct dma_pte *pte = NULL;
2340 phys_addr_t pteval;
2341 u64 attr;
2342
2343 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2344
2345 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2346 return -EINVAL;
2347
2348 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2349 attr |= DMA_FL_PTE_PRESENT;
2350 if (domain_use_first_level(domain)) {
2351 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
2352
2353 if (domain->domain.type == IOMMU_DOMAIN_DMA) {
2354 attr |= DMA_FL_PTE_ACCESS;
2355 if (prot & DMA_PTE_WRITE)
2356 attr |= DMA_FL_PTE_DIRTY;
2357 }
2358 }
2359
2360 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2361
2362 while (nr_pages > 0) {
2363 uint64_t tmp;
2364
2365 if (!pte) {
2366 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2367 phys_pfn, nr_pages);
2368
2369 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2370 if (!pte)
2371 return -ENOMEM;
2372 /* It is a large page */
2373 if (largepage_lvl > 1) {
2374 unsigned long end_pfn;
2375
2376 pteval |= DMA_PTE_LARGE_PAGE;
2377 end_pfn = ((iov_pfn + nr_pages) & level_mask(largepage_lvl)) - 1;
2378 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2379 } else {
2380 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2381 }
2382
2383 }
2384 /* We don't need lock here, nobody else
2385 * touches the iova range
2386 */
2387 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2388 if (tmp) {
2389 static int dumps = 5;
2390 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2391 iov_pfn, tmp, (unsigned long long)pteval);
2392 if (dumps) {
2393 dumps--;
2394 debug_dma_dump_mappings(NULL);
2395 }
2396 WARN_ON(1);
2397 }
2398
2399 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2400
2401 BUG_ON(nr_pages < lvl_pages);
2402
2403 nr_pages -= lvl_pages;
2404 iov_pfn += lvl_pages;
2405 phys_pfn += lvl_pages;
2406 pteval += lvl_pages * VTD_PAGE_SIZE;
2407
2408 /* If the next PTE would be the first in a new page, then we
2409 * need to flush the cache on the entries we've just written.
2410 * And then we'll need to recalculate 'pte', so clear it and
2411 * let it get set again in the if (!pte) block above.
2412 *
2413 * If we're done (!nr_pages) we need to flush the cache too.
2414 *
2415 * Also if we've been setting superpages, we may need to
2416 * recalculate 'pte' and switch back to smaller pages for the
2417 * end of the mapping, if the trailing size is not enough to
2418 * use another superpage (i.e. nr_pages < lvl_pages).
2419 *
2420 * We leave clflush for the leaf pte changes to iotlb_sync_map()
2421 * callback.
2422 */
2423 pte++;
2424 if (!nr_pages || first_pte_in_page(pte) ||
2425 (largepage_lvl > 1 && nr_pages < lvl_pages))
2426 pte = NULL;
2427 }
2428
2429 return 0;
2430}
2431
2432static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2433{
2434 struct intel_iommu *iommu = info->iommu;
2435 struct context_entry *context;
2436 unsigned long flags;
2437 u16 did_old;
2438
2439 if (!iommu)
2440 return;
2441
2442 spin_lock_irqsave(&iommu->lock, flags);
2443 context = iommu_context_addr(iommu, bus, devfn, 0);
2444 if (!context) {
2445 spin_unlock_irqrestore(&iommu->lock, flags);
2446 return;
2447 }
2448
2449 if (sm_supported(iommu)) {
2450 if (hw_pass_through && domain_type_is_si(info->domain))
2451 did_old = FLPT_DEFAULT_DID;
2452 else
2453 did_old = info->domain->iommu_did[iommu->seq_id];
2454 } else {
2455 did_old = context_domain_id(context);
2456 }
2457
2458 context_clear_entry(context);
2459 __iommu_flush_cache(iommu, context, sizeof(*context));
2460 spin_unlock_irqrestore(&iommu->lock, flags);
2461 iommu->flush.flush_context(iommu,
2462 did_old,
2463 (((u16)bus) << 8) | devfn,
2464 DMA_CCMD_MASK_NOBIT,
2465 DMA_CCMD_DEVICE_INVL);
2466
2467 if (sm_supported(iommu))
2468 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2469
2470 iommu->flush.flush_iotlb(iommu,
2471 did_old,
2472 0,
2473 0,
2474 DMA_TLB_DSI_FLUSH);
2475
2476 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2477}
2478
2479static inline void unlink_domain_info(struct device_domain_info *info)
2480{
2481 assert_spin_locked(&device_domain_lock);
2482 list_del(&info->link);
2483 list_del(&info->global);
2484 if (info->dev)
2485 dev_iommu_priv_set(info->dev, NULL);
2486}
2487
2488static void domain_remove_dev_info(struct dmar_domain *domain)
2489{
2490 struct device_domain_info *info, *tmp;
2491 unsigned long flags;
2492
2493 spin_lock_irqsave(&device_domain_lock, flags);
2494 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2495 __dmar_remove_one_dev_info(info);
2496 spin_unlock_irqrestore(&device_domain_lock, flags);
2497}
2498
2499struct dmar_domain *find_domain(struct device *dev)
2500{
2501 struct device_domain_info *info;
2502
2503 if (unlikely(!dev || !dev->iommu))
2504 return NULL;
2505
2506 if (unlikely(attach_deferred(dev)))
2507 return NULL;
2508
2509 /* No lock here, assumes no domain exit in normal case */
2510 info = get_domain_info(dev);
2511 if (likely(info))
2512 return info->domain;
2513
2514 return NULL;
2515}
2516
2517static inline struct device_domain_info *
2518dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2519{
2520 struct device_domain_info *info;
2521
2522 list_for_each_entry(info, &device_domain_list, global)
2523 if (info->segment == segment && info->bus == bus &&
2524 info->devfn == devfn)
2525 return info;
2526
2527 return NULL;
2528}
2529
2530static int domain_setup_first_level(struct intel_iommu *iommu,
2531 struct dmar_domain *domain,
2532 struct device *dev,
2533 u32 pasid)
2534{
2535 struct dma_pte *pgd = domain->pgd;
2536 int agaw, level;
2537 int flags = 0;
2538
2539 /*
2540 * Skip top levels of page tables for iommu which has
2541 * less agaw than default. Unnecessary for PT mode.
2542 */
2543 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2544 pgd = phys_to_virt(dma_pte_addr(pgd));
2545 if (!dma_pte_present(pgd))
2546 return -ENOMEM;
2547 }
2548
2549 level = agaw_to_level(agaw);
2550 if (level != 4 && level != 5)
2551 return -EINVAL;
2552
2553 if (pasid != PASID_RID2PASID)
2554 flags |= PASID_FLAG_SUPERVISOR_MODE;
2555 if (level == 5)
2556 flags |= PASID_FLAG_FL5LP;
2557
2558 if (domain->domain.type == IOMMU_DOMAIN_UNMANAGED)
2559 flags |= PASID_FLAG_PAGE_SNOOP;
2560
2561 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2562 domain->iommu_did[iommu->seq_id],
2563 flags);
2564}
2565
2566static bool dev_is_real_dma_subdevice(struct device *dev)
2567{
2568 return dev && dev_is_pci(dev) &&
2569 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2570}
2571
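/*
 * Allocate and register a device_domain_info for @dev and attach it to
 * @domain on @iommu, setting up the PASID table/RID2PASID entry (scalable
 * mode) and the context mapping. Returns the domain actually used (an
 * existing one may be found via a DMA alias), or NULL on failure.
 */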
2572static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2573 int bus, int devfn,
2574 struct device *dev,
2575 struct dmar_domain *domain)
2576{
2577 struct dmar_domain *found = NULL;
2578 struct device_domain_info *info;
2579 unsigned long flags;
2580 int ret;
2581
2582 info = alloc_devinfo_mem();
2583 if (!info)
2584 return NULL;
2585
2586 if (!dev_is_real_dma_subdevice(dev)) {
2587 info->bus = bus;
2588 info->devfn = devfn;
2589 info->segment = iommu->segment;
2590 } else {
2591 struct pci_dev *pdev = to_pci_dev(dev);
2592
2593 info->bus = pdev->bus->number;
2594 info->devfn = pdev->devfn;
2595 info->segment = pci_domain_nr(pdev->bus);
2596 }
2597
2598 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2599 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2600 info->ats_qdep = 0;
2601 info->dev = dev;
2602 info->domain = domain;
2603 info->iommu = iommu;
2604 info->pasid_table = NULL;
2605 info->auxd_enabled = 0;
2606 INIT_LIST_HEAD(&info->subdevices);
2607
2608 if (dev && dev_is_pci(dev)) {
2609 struct pci_dev *pdev = to_pci_dev(info->dev);
2610
2611 if (ecap_dev_iotlb_support(iommu->ecap) &&
2612 pci_ats_supported(pdev) &&
2613 dmar_find_matched_atsr_unit(pdev))
2614 info->ats_supported = 1;
2615
2616 if (sm_supported(iommu)) {
2617 if (pasid_supported(iommu)) {
2618 int features = pci_pasid_features(pdev);
2619 if (features >= 0)
2620 info->pasid_supported = features | 1;
2621 }
2622
2623 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2624 pci_pri_supported(pdev))
2625 info->pri_supported = 1;
2626 }
2627 }
2628
2629 spin_lock_irqsave(&device_domain_lock, flags);
2630 if (dev)
2631 found = find_domain(dev);
2632
2633 if (!found) {
2634 struct device_domain_info *info2;
2635 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2636 info->devfn);
2637 if (info2) {
2638 found = info2->domain;
2639 info2->dev = dev;
2640 }
2641 }
2642
2643 if (found) {
2644 spin_unlock_irqrestore(&device_domain_lock, flags);
2645 free_devinfo_mem(info);
2646 /* Caller must free the original domain */
2647 return found;
2648 }
2649
2650 spin_lock(&iommu->lock);
2651 ret = domain_attach_iommu(domain, iommu);
2652 spin_unlock(&iommu->lock);
2653
2654 if (ret) {
2655 spin_unlock_irqrestore(&device_domain_lock, flags);
2656 free_devinfo_mem(info);
2657 return NULL;
2658 }
2659
2660 list_add(&info->link, &domain->devices);
2661 list_add(&info->global, &device_domain_list);
2662 if (dev)
2663 dev_iommu_priv_set(dev, info);
2664 spin_unlock_irqrestore(&device_domain_lock, flags);
2665
2666 /* PASID table is mandatory for a PCI device in scalable mode. */
2667 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2668 ret = intel_pasid_alloc_table(dev);
2669 if (ret) {
2670 dev_err(dev, "PASID table allocation failed\n");
2671 dmar_remove_one_dev_info(dev);
2672 return NULL;
2673 }
2674
2675 /* Setup the PASID entry for requests without PASID: */
2676 spin_lock_irqsave(&iommu->lock, flags);
2677 if (hw_pass_through && domain_type_is_si(domain))
2678 ret = intel_pasid_setup_pass_through(iommu, domain,
2679 dev, PASID_RID2PASID);
2680 else if (domain_use_first_level(domain))
2681 ret = domain_setup_first_level(iommu, domain, dev,
2682 PASID_RID2PASID);
2683 else
2684 ret = intel_pasid_setup_second_level(iommu, domain,
2685 dev, PASID_RID2PASID);
2686 spin_unlock_irqrestore(&iommu->lock, flags);
2687 if (ret) {
2688 dev_err(dev, "Setup RID2PASID failed\n");
2689 dmar_remove_one_dev_info(dev);
2690 return NULL;
2691 }
2692 }
2693
2694 if (dev && domain_context_mapping(domain, dev)) {
2695 dev_err(dev, "Domain context map failed\n");
2696 dmar_remove_one_dev_info(dev);
2697 return NULL;
2698 }
2699
2700 return domain;
2701}
2702
2703static int iommu_domain_identity_map(struct dmar_domain *domain,
2704 unsigned long first_vpfn,
2705 unsigned long last_vpfn)
2706{
2707 /*
2708 * The RMRR range might overlap with the physical memory range,
2709 * so clear it first.
2710 */
2711 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2712
2713 return __domain_mapping(domain, first_vpfn,
2714 first_vpfn, last_vpfn - first_vpfn + 1,
2715 DMA_PTE_READ|DMA_PTE_WRITE);
2716}
2717
2718static int md_domain_init(struct dmar_domain *domain, int guest_width);
2719
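/*
 * Set up the static identity (si_domain) domain. Unless hardware
 * pass-through is used, identity-map all usable physical memory and all
 * RMRR ranges into it.
 */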
2720static int __init si_domain_init(int hw)
2721{
2722 struct dmar_rmrr_unit *rmrr;
2723 struct device *dev;
2724 int i, nid, ret;
2725
2726 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2727 if (!si_domain)
2728 return -EFAULT;
2729
2730 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2731 domain_exit(si_domain);
2732 return -EFAULT;
2733 }
2734
2735 if (hw)
2736 return 0;
2737
2738 for_each_online_node(nid) {
2739 unsigned long start_pfn, end_pfn;
2740 int i;
2741
2742 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2743 ret = iommu_domain_identity_map(si_domain,
2744 mm_to_dma_pfn(start_pfn),
2745 mm_to_dma_pfn(end_pfn));
2746 if (ret)
2747 return ret;
2748 }
2749 }
2750
2751 /*
2752 * Identity map the RMRRs so that devices with RMRRs can also use
2753 * the si_domain.
2754 */
2755 for_each_rmrr_units(rmrr) {
2756 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2757 i, dev) {
2758 unsigned long long start = rmrr->base_address;
2759 unsigned long long end = rmrr->end_address;
2760
2761 if (WARN_ON(end < start ||
2762 end >> agaw_to_width(si_domain->agaw)))
2763 continue;
2764
2765 ret = iommu_domain_identity_map(si_domain,
2766 mm_to_dma_pfn(start >> PAGE_SHIFT),
2767 mm_to_dma_pfn(end >> PAGE_SHIFT));
2768 if (ret)
2769 return ret;
2770 }
2771 }
2772
2773 return 0;
2774}
2775
2776static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2777{
2778 struct dmar_domain *ndomain;
2779 struct intel_iommu *iommu;
2780 u8 bus, devfn;
2781
2782 iommu = device_to_iommu(dev, &bus, &devfn);
2783 if (!iommu)
2784 return -ENODEV;
2785
2786 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2787 if (ndomain != domain)
2788 return -EBUSY;
2789
2790 return 0;
2791}
2792
2793static bool device_has_rmrr(struct device *dev)
2794{
2795 struct dmar_rmrr_unit *rmrr;
2796 struct device *tmp;
2797 int i;
2798
2799 rcu_read_lock();
2800 for_each_rmrr_units(rmrr) {
2801 /*
2802 * Return TRUE if this RMRR contains the device that
2803 * is passed in.
2804 */
2805 for_each_active_dev_scope(rmrr->devices,
2806 rmrr->devices_cnt, i, tmp)
2807 if (tmp == dev ||
2808 is_downstream_to_pci_bridge(dev, tmp)) {
2809 rcu_read_unlock();
2810 return true;
2811 }
2812 }
2813 rcu_read_unlock();
2814 return false;
2815}
2816
2817/**
2818 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2819 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2820 * @dev: device handle
2821 *
2822 * We assume that PCI USB devices with RMRRs have them largely
2823 * for historical reasons and that the RMRR space is not actively used post
2824 * boot. This exclusion may change if vendors begin to abuse it.
2825 *
2826 * The same exception is made for graphics devices, with the requirement that
2827 * any use of the RMRR regions will be torn down before assigning the device
2828 * to a guest.
2829 *
2830 * Return: true if the RMRR is relaxable, false otherwise
2831 */
2832static bool device_rmrr_is_relaxable(struct device *dev)
2833{
2834 struct pci_dev *pdev;
2835
2836 if (!dev_is_pci(dev))
2837 return false;
2838
2839 pdev = to_pci_dev(dev);
2840 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2841 return true;
2842 else
2843 return false;
2844}
2845
2846/*
2847 * There are a couple cases where we need to restrict the functionality of
2848 * devices associated with RMRRs. The first is when evaluating a device for
2849 * identity mapping because problems exist when devices are moved in and out
2850 * of domains and their respective RMRR information is lost. This means that
2851 * a device with associated RMRRs will never be in a "passthrough" domain.
2852 * The second is use of the device through the IOMMU API. This interface
2853 * expects to have full control of the IOVA space for the device. We cannot
2854 * satisfy both the requirement that RMRR access is maintained and have an
2855 * unencumbered IOVA space. We also have no ability to quiesce the device's
2856 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2857 * We therefore prevent devices associated with an RMRR from participating in
2858 * the IOMMU API, which eliminates them from device assignment.
2859 *
2860 * In both cases, devices with relaxable RMRRs are not affected by this
2861 * restriction. See the device_rmrr_is_relaxable comment.
2862 */
2863static bool device_is_rmrr_locked(struct device *dev)
2864{
2865 if (!device_has_rmrr(dev))
2866 return false;
2867
2868 if (device_rmrr_is_relaxable(dev))
2869 return false;
2870
2871 return true;
2872}
2873
2874/*
2875 * Return the required default domain type for a specific device.
2876 *
2877 * @dev: the device to query
2879 *
2880 * Returns:
2881 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2882 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2883 * - 0: both identity and dynamic domains work for this device
2884 */
2885static int device_def_domain_type(struct device *dev)
2886{
2887 if (dev_is_pci(dev)) {
2888 struct pci_dev *pdev = to_pci_dev(dev);
2889
2890 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2891 return IOMMU_DOMAIN_IDENTITY;
2892
2893 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2894 return IOMMU_DOMAIN_IDENTITY;
2895 }
2896
2897 return 0;
2898}
2899
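/*
 * Select the invalidation method for this IOMMU: prefer queued
 * invalidation (QI) and fall back to register-based invalidation if QI
 * cannot be enabled. If we did not set up QI ourselves, pending faults
 * are cleared and any QI enabled before OS handover is disabled first.
 */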
2900static void intel_iommu_init_qi(struct intel_iommu *iommu)
2901{
2902 /*
2903	 * Start from a sane IOMMU hardware state.
2904	 * If queued invalidation was already initialized by us
2905	 * (for example, while enabling interrupt remapping), then
2906	 * things are already rolling from a sane state.
2907 */
2908 if (!iommu->qi) {
2909 /*
2910 * Clear any previous faults.
2911 */
2912 dmar_fault(-1, iommu);
2913 /*
2914 * Disable queued invalidation if supported and already enabled
2915 * before OS handover.
2916 */
2917 dmar_disable_qi(iommu);
2918 }
2919
2920 if (dmar_enable_qi(iommu)) {
2921 /*
2922		 * Queued invalidation could not be enabled; fall back to register-based invalidation.
2923 */
2924 iommu->flush.flush_context = __iommu_flush_context;
2925 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2926 pr_info("%s: Using Register based invalidation\n",
2927 iommu->name);
2928 } else {
2929 iommu->flush.flush_context = qi_flush_context;
2930 iommu->flush.flush_iotlb = qi_flush_iotlb;
2931 pr_info("%s: Using Queued invalidation\n", iommu->name);
2932 }
2933}
2934
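/*
 * Copy one bus's context table(s) from the kernel that was previously
 * running (kdump case). With extended root entries each bus has two
 * context tables (lower and upper device-function halves), which is why
 * tbl_idx is bus * 2 in that case. Domain IDs found in copied entries
 * are reserved in iommu->domain_ids so that they are not reused.
 */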
2935static int copy_context_table(struct intel_iommu *iommu,
2936 struct root_entry *old_re,
2937 struct context_entry **tbl,
2938 int bus, bool ext)
2939{
2940 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2941 struct context_entry *new_ce = NULL, ce;
2942 struct context_entry *old_ce = NULL;
2943 struct root_entry re;
2944 phys_addr_t old_ce_phys;
2945
2946 tbl_idx = ext ? bus * 2 : bus;
2947 memcpy(&re, old_re, sizeof(re));
2948
2949 for (devfn = 0; devfn < 256; devfn++) {
2950 /* First calculate the correct index */
2951 idx = (ext ? devfn * 2 : devfn) % 256;
2952
2953 if (idx == 0) {
2954 /* First save what we may have and clean up */
2955 if (new_ce) {
2956 tbl[tbl_idx] = new_ce;
2957 __iommu_flush_cache(iommu, new_ce,
2958 VTD_PAGE_SIZE);
2959 pos = 1;
2960 }
2961
2962 if (old_ce)
2963 memunmap(old_ce);
2964
2965 ret = 0;
2966 if (devfn < 0x80)
2967 old_ce_phys = root_entry_lctp(&re);
2968 else
2969 old_ce_phys = root_entry_uctp(&re);
2970
2971 if (!old_ce_phys) {
2972 if (ext && devfn == 0) {
2973 /* No LCTP, try UCTP */
2974 devfn = 0x7f;
2975 continue;
2976 } else {
2977 goto out;
2978 }
2979 }
2980
2981 ret = -ENOMEM;
2982 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2983 MEMREMAP_WB);
2984 if (!old_ce)
2985 goto out;
2986
2987 new_ce = alloc_pgtable_page(iommu->node);
2988 if (!new_ce)
2989 goto out_unmap;
2990
2991 ret = 0;
2992 }
2993
2994 /* Now copy the context entry */
2995 memcpy(&ce, old_ce + idx, sizeof(ce));
2996
2997 if (!__context_present(&ce))
2998 continue;
2999
3000 did = context_domain_id(&ce);
3001 if (did >= 0 && did < cap_ndoms(iommu->cap))
3002 set_bit(did, iommu->domain_ids);
3003
3004 /*
3005 * We need a marker for copied context entries. This
3006 * marker needs to work for the old format as well as
3007 * for extended context entries.
3008 *
3009 * Bit 67 of the context entry is used. In the old
3010 * format this bit is available to software, in the
3011 * extended format it is the PGE bit, but PGE is ignored
3012 * by HW if PASIDs are disabled (and thus still
3013 * available).
3014 *
3015 * So disable PASIDs first and then mark the entry
3016 * copied. This means that we don't copy PASID
3017 * translations from the old kernel, but this is fine as
3018 * faults there are not fatal.
3019 */
3020 context_clear_pasid_enable(&ce);
3021 context_set_copied(&ce);
3022
3023 new_ce[idx] = ce;
3024 }
3025
3026 tbl[tbl_idx + pos] = new_ce;
3027
3028 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3029
3030out_unmap:
3031 memunmap(old_ce);
3032
3033out:
3034 return ret;
3035}
3036
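/*
 * Copy the complete translation setup left behind by the previous
 * kernel: map its root table, copy the context tables for all 256
 * buses, then install the copies into this kernel's root_entry table
 * under iommu->lock. Bails out if the old and new root table formats
 * (RTT/extended) differ, since changing RTT requires disabling
 * translation.
 */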
3037static int copy_translation_tables(struct intel_iommu *iommu)
3038{
3039 struct context_entry **ctxt_tbls;
3040 struct root_entry *old_rt;
3041 phys_addr_t old_rt_phys;
3042 int ctxt_table_entries;
3043 unsigned long flags;
3044 u64 rtaddr_reg;
3045 int bus, ret;
3046 bool new_ext, ext;
3047
3048 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3049 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3050 new_ext = !!ecap_ecs(iommu->ecap);
3051
3052 /*
3053 * The RTT bit can only be changed when translation is disabled,
3054	 * but disabling translation would open a window for data
3055 * corruption. So bail out and don't copy anything if we would
3056 * have to change the bit.
3057 */
3058 if (new_ext != ext)
3059 return -EINVAL;
3060
3061 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3062 if (!old_rt_phys)
3063 return -EINVAL;
3064
3065 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3066 if (!old_rt)
3067 return -ENOMEM;
3068
3069 /* This is too big for the stack - allocate it from slab */
3070 ctxt_table_entries = ext ? 512 : 256;
3071 ret = -ENOMEM;
3072 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3073 if (!ctxt_tbls)
3074 goto out_unmap;
3075
3076 for (bus = 0; bus < 256; bus++) {
3077 ret = copy_context_table(iommu, &old_rt[bus],
3078 ctxt_tbls, bus, ext);
3079 if (ret) {
3080 pr_err("%s: Failed to copy context table for bus %d\n",
3081 iommu->name, bus);
3082 continue;
3083 }
3084 }
3085
3086 spin_lock_irqsave(&iommu->lock, flags);
3087
3088 /* Context tables are copied, now write them to the root_entry table */
3089 for (bus = 0; bus < 256; bus++) {
3090 int idx = ext ? bus * 2 : bus;
3091 u64 val;
3092
3093 if (ctxt_tbls[idx]) {
3094 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3095 iommu->root_entry[bus].lo = val;
3096 }
3097
3098 if (!ext || !ctxt_tbls[idx + 1])
3099 continue;
3100
3101 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3102 iommu->root_entry[bus].hi = val;
3103 }
3104
3105 spin_unlock_irqrestore(&iommu->lock, flags);
3106
3107 kfree(ctxt_tbls);
3108
3109 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3110
3111 ret = 0;
3112
3113out_unmap:
3114 memunmap(old_rt);
3115
3116 return ret;
3117}
3118
3119#ifdef CONFIG_INTEL_IOMMU_SVM
3120static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3121{
3122 struct intel_iommu *iommu = data;
3123 ioasid_t ioasid;
3124
3125 if (!iommu)
3126 return INVALID_IOASID;
3127 /*
3128	 * The VT-d virtual command interface always uses the full 20-bit
3129	 * PASID range. The host can partition the guest PASID range based
3130	 * on policy, but that is out of the guest's control.
3131 */
3132 if (min < PASID_MIN || max > intel_pasid_max_id)
3133 return INVALID_IOASID;
3134
3135 if (vcmd_alloc_pasid(iommu, &ioasid))
3136 return INVALID_IOASID;
3137
3138 return ioasid;
3139}
3140
3141static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3142{
3143 struct intel_iommu *iommu = data;
3144
3145 if (!iommu)
3146 return;
3147 /*
3148	 * Sanity checking of the ioasid owner is done at the upper layer, e.g. VFIO.
3149 * We can only free the PASID when all the devices are unbound.
3150 */
3151 if (ioasid_find(NULL, ioasid, NULL)) {
3152 pr_alert("Cannot free active IOASID %d\n", ioasid);
3153 return;
3154 }
3155 vcmd_free_pasid(iommu, ioasid);
3156}
3157
3158static void register_pasid_allocator(struct intel_iommu *iommu)
3159{
3160 /*
3161	 * If we are running in the host, there is no need for a custom
3162	 * allocator because PASIDs are allocated system-wide by the host.
3163 */
3164 if (!cap_caching_mode(iommu->cap))
3165 return;
3166
3167 if (!sm_supported(iommu)) {
3168 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3169 return;
3170 }
3171
3172 /*
3173	 * Register a custom PASID allocator if we are running in a guest;
3174	 * guest PASIDs must be obtained via the virtual command interface.
3175 * There can be multiple vIOMMUs in each guest but only one allocator
3176 * is active. All vIOMMU allocators will eventually be calling the same
3177 * host allocator.
3178 */
3179 if (!vccap_pasid(iommu->vccap))
3180 return;
3181
3182 pr_info("Register custom PASID allocator\n");
3183 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3184 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3185 iommu->pasid_allocator.pdata = (void *)iommu;
3186 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3187 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3188 /*
3189 * Disable scalable mode on this IOMMU if there
3190 * is no custom allocator. Mixing SM capable vIOMMU
3191		 * and non-SM vIOMMU is not supported.
3192 */
3193 intel_iommu_sm = 0;
3194 }
3195}
3196#endif
3197
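/*
 * Boot-time initialization of all DMAR units: allocate the global iommu
 * array, set up queued invalidation, domain IDs and root entries for
 * each IOMMU, copy translation tables from a previous kernel when
 * translation was pre-enabled (kdump), create the si_domain, and
 * finally enable the page request queue and fault interrupt where
 * supported.
 */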
3198static int __init init_dmars(void)
3199{
3200 struct dmar_drhd_unit *drhd;
3201 struct intel_iommu *iommu;
3202 int ret;
3203
3204 /*
3205 * for each drhd
3206 * allocate root
3207 * initialize and program root entry to not present
3208 * endfor
3209 */
3210 for_each_drhd_unit(drhd) {
3211 /*
3212		 * No lock is needed: this is only incremented in the
3213		 * single-threaded kernel __init code path; all other
3214		 * accesses are read-only.
3215 */
3216 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3217 g_num_of_iommus++;
3218 continue;
3219 }
3220 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3221 }
3222
3223 /* Preallocate enough resources for IOMMU hot-addition */
3224 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3225 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3226
3227 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3228 GFP_KERNEL);
3229 if (!g_iommus) {
3230 pr_err("Allocating global iommu array failed\n");
3231 ret = -ENOMEM;
3232 goto error;
3233 }
3234
3235 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
3236 if (ret)
3237 goto free_iommu;
3238
3239 for_each_iommu(iommu, drhd) {
3240 if (drhd->ignored) {
3241 iommu_disable_translation(iommu);
3242 continue;
3243 }
3244
3245 /*
3246		 * Find the max PASID size of all IOMMUs in the system.
3247 * We need to ensure the system pasid table is no bigger
3248 * than the smallest supported.
3249 */
3250 if (pasid_supported(iommu)) {
3251 u32 temp = 2 << ecap_pss(iommu->ecap);
3252
3253 intel_pasid_max_id = min_t(u32, temp,
3254 intel_pasid_max_id);
3255 }
3256
3257 g_iommus[iommu->seq_id] = iommu;
3258
3259 intel_iommu_init_qi(iommu);
3260
3261 ret = iommu_init_domains(iommu);
3262 if (ret)
3263 goto free_iommu;
3264
3265 init_translation_status(iommu);
3266
3267 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3268 iommu_disable_translation(iommu);
3269 clear_translation_pre_enabled(iommu);
3270 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3271 iommu->name);
3272 }
3273
3274 /*
3275 * TBD:
3276		 * we could share the same root & context tables
3277		 * among all IOMMUs; split this out later.
3278 */
3279 ret = iommu_alloc_root_entry(iommu);
3280 if (ret)
3281 goto free_iommu;
3282
3283 if (translation_pre_enabled(iommu)) {
3284 pr_info("Translation already enabled - trying to copy translation structures\n");
3285
3286 ret = copy_translation_tables(iommu);
3287 if (ret) {
3288 /*
3289 * We found the IOMMU with translation
3290 * enabled - but failed to copy over the
3291 * old root-entry table. Try to proceed
3292 * by disabling translation now and
3293 * allocating a clean root-entry table.
3294 * This might cause DMAR faults, but
3295 * probably the dump will still succeed.
3296 */
3297 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3298 iommu->name);
3299 iommu_disable_translation(iommu);
3300 clear_translation_pre_enabled(iommu);
3301 } else {
3302 pr_info("Copied translation tables from previous kernel for %s\n",
3303 iommu->name);
3304 }
3305 }
3306
3307 if (!ecap_pass_through(iommu->ecap))
3308 hw_pass_through = 0;
3309 intel_svm_check(iommu);
3310 }
3311
3312 /*
3313 * Now that qi is enabled on all iommus, set the root entry and flush
3314 * caches. This is required on some Intel X58 chipsets, otherwise the
3315 * flush_context function will loop forever and the boot hangs.
3316 */
3317 for_each_active_iommu(iommu, drhd) {
3318 iommu_flush_write_buffer(iommu);
3319#ifdef CONFIG_INTEL_IOMMU_SVM
3320 register_pasid_allocator(iommu);
3321#endif
3322 iommu_set_root_entry(iommu);
3323 }
3324
3325#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3326 dmar_map_gfx = 0;
3327#endif
3328
3329 if (!dmar_map_gfx)
3330 iommu_identity_mapping |= IDENTMAP_GFX;
3331
3332 check_tylersburg_isoch();
3333
3334 ret = si_domain_init(hw_pass_through);
3335 if (ret)
3336 goto free_iommu;
3337
3338 /*
3339 * for each drhd
3340 * enable fault log
3341 * global invalidate context cache
3342 * global invalidate iotlb
3343 * enable translation
3344 */
3345 for_each_iommu(iommu, drhd) {
3346 if (drhd->ignored) {
3347 /*
3348 * we always have to disable PMRs or DMA may fail on
3349 * this device
3350 */
3351 if (force_on)
3352 iommu_disable_protect_mem_regions(iommu);
3353 continue;
3354 }
3355
3356 iommu_flush_write_buffer(iommu);
3357
3358#ifdef CONFIG_INTEL_IOMMU_SVM
3359 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3360 /*
3361			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3362			 * could cause a lock race, so drop the lock around it.
3363 */
3364 up_write(&dmar_global_lock);
3365 ret = intel_svm_enable_prq(iommu);
3366 down_write(&dmar_global_lock);
3367 if (ret)
3368 goto free_iommu;
3369 }
3370#endif
3371 ret = dmar_set_interrupt(iommu);
3372 if (ret)
3373 goto free_iommu;
3374 }
3375
3376 return 0;
3377
3378free_iommu:
3379 for_each_active_iommu(iommu, drhd) {
3380 disable_dmar_iommu(iommu);
3381 free_dmar_iommu(iommu);
3382 }
3383
3384 kfree(g_iommus);
3385
3386error:
3387 return ret;
3388}
3389
3390static inline int iommu_domain_cache_init(void)
3391{
3392 int ret = 0;
3393
3394 iommu_domain_cache = kmem_cache_create("iommu_domain",
3395 sizeof(struct dmar_domain),
3396 0,
3397 SLAB_HWCACHE_ALIGN,
3399 NULL);
3400 if (!iommu_domain_cache) {
3401 pr_err("Couldn't create iommu_domain cache\n");
3402 ret = -ENOMEM;
3403 }
3404
3405 return ret;
3406}
3407
3408static inline int iommu_devinfo_cache_init(void)
3409{
3410 int ret = 0;
3411
3412 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3413 sizeof(struct device_domain_info),
3414 0,
3415 SLAB_HWCACHE_ALIGN,
3416 NULL);
3417 if (!iommu_devinfo_cache) {
3418 pr_err("Couldn't create devinfo cache\n");
3419 ret = -ENOMEM;
3420 }
3421
3422 return ret;
3423}
3424
3425static int __init iommu_init_mempool(void)
3426{
3427 int ret;
3428 ret = iova_cache_get();
3429 if (ret)
3430 return ret;
3431
3432 ret = iommu_domain_cache_init();
3433 if (ret)
3434 goto domain_error;
3435
3436 ret = iommu_devinfo_cache_init();
3437 if (!ret)
3438 return ret;
3439
3440 kmem_cache_destroy(iommu_domain_cache);
3441domain_error:
3442 iova_cache_put();
3443
3444 return -ENOMEM;
3445}
3446
3447static void __init iommu_exit_mempool(void)
3448{
3449 kmem_cache_destroy(iommu_devinfo_cache);
3450 kmem_cache_destroy(iommu_domain_cache);
3451 iova_cache_put();
3452}
3453
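/*
 * Mark DMAR units that can be skipped: units whose device scope is
 * empty are ignored outright, and units that cover only graphics
 * devices are flagged as gfx_dedicated and ignored when dmar_map_gfx
 * is clear.
 */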
3454static void __init init_no_remapping_devices(void)
3455{
3456 struct dmar_drhd_unit *drhd;
3457 struct device *dev;
3458 int i;
3459
3460 for_each_drhd_unit(drhd) {
3461 if (!drhd->include_all) {
3462 for_each_active_dev_scope(drhd->devices,
3463 drhd->devices_cnt, i, dev)
3464 break;
3465 /* ignore DMAR unit if no devices exist */
3466 if (i == drhd->devices_cnt)
3467 drhd->ignored = 1;
3468 }
3469 }
3470
3471 for_each_active_drhd_unit(drhd) {
3472 if (drhd->include_all)
3473 continue;
3474
3475 for_each_active_dev_scope(drhd->devices,
3476 drhd->devices_cnt, i, dev)
3477 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3478 break;
3479 if (i < drhd->devices_cnt)
3480 continue;
3481
3482 /* This IOMMU has *only* gfx devices. Either bypass it or
3483		   set the gfx_dedicated flag, as appropriate */
3484 drhd->gfx_dedicated = 1;
3485 if (!dmar_map_gfx)
3486 drhd->ignored = 1;
3487 }
3488}
3489
3490#ifdef CONFIG_SUSPEND
3491static int init_iommu_hw(void)
3492{
3493 struct dmar_drhd_unit *drhd;
3494 struct intel_iommu *iommu = NULL;
3495
3496 for_each_active_iommu(iommu, drhd)
3497 if (iommu->qi)
3498 dmar_reenable_qi(iommu);
3499
3500 for_each_iommu(iommu, drhd) {
3501 if (drhd->ignored) {
3502 /*
3503 * we always have to disable PMRs or DMA may fail on
3504 * this device
3505 */
3506 if (force_on)
3507 iommu_disable_protect_mem_regions(iommu);
3508 continue;
3509 }
3510
3511 iommu_flush_write_buffer(iommu);
3512 iommu_set_root_entry(iommu);
3513 iommu_enable_translation(iommu);
3514 iommu_disable_protect_mem_regions(iommu);
3515 }
3516
3517 return 0;
3518}
3519
3520static void iommu_flush_all(void)
3521{
3522 struct dmar_drhd_unit *drhd;
3523 struct intel_iommu *iommu;
3524
3525 for_each_active_iommu(iommu, drhd) {
3526 iommu->flush.flush_context(iommu, 0, 0, 0,
3527 DMA_CCMD_GLOBAL_INVL);
3528 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3529 DMA_TLB_GLOBAL_FLUSH);
3530 }
3531}
3532
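/*
 * On suspend: flush all caches, disable translation and save the
 * fault-event control/data/address registers of each active IOMMU so
 * that iommu_resume() can restore them after re-enabling the hardware.
 */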
3533static int iommu_suspend(void)
3534{
3535 struct dmar_drhd_unit *drhd;
3536 struct intel_iommu *iommu = NULL;
3537 unsigned long flag;
3538
3539 for_each_active_iommu(iommu, drhd) {
3540 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3541 GFP_KERNEL);
3542 if (!iommu->iommu_state)
3543 goto nomem;
3544 }
3545
3546 iommu_flush_all();
3547
3548 for_each_active_iommu(iommu, drhd) {
3549 iommu_disable_translation(iommu);
3550
3551 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3552
3553 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3554 readl(iommu->reg + DMAR_FECTL_REG);
3555 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3556 readl(iommu->reg + DMAR_FEDATA_REG);
3557 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3558 readl(iommu->reg + DMAR_FEADDR_REG);
3559 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3560 readl(iommu->reg + DMAR_FEUADDR_REG);
3561
3562 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3563 }
3564 return 0;
3565
3566nomem:
3567 for_each_active_iommu(iommu, drhd)
3568 kfree(iommu->iommu_state);
3569
3570 return -ENOMEM;
3571}
3572
3573static void iommu_resume(void)
3574{
3575 struct dmar_drhd_unit *drhd;
3576 struct intel_iommu *iommu = NULL;
3577 unsigned long flag;
3578
3579 if (init_iommu_hw()) {
3580 if (force_on)
3581 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3582 else
3583 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3584 return;
3585 }
3586
3587 for_each_active_iommu(iommu, drhd) {
3588
3589 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3590
3591 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3592 iommu->reg + DMAR_FECTL_REG);
3593 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3594 iommu->reg + DMAR_FEDATA_REG);
3595 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3596 iommu->reg + DMAR_FEADDR_REG);
3597 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3598 iommu->reg + DMAR_FEUADDR_REG);
3599
3600 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3601 }
3602
3603 for_each_active_iommu(iommu, drhd)
3604 kfree(iommu->iommu_state);
3605}
3606
3607static struct syscore_ops iommu_syscore_ops = {
3608 .resume = iommu_resume,
3609 .suspend = iommu_suspend,
3610};
3611
3612static void __init init_iommu_pm_ops(void)
3613{
3614 register_syscore_ops(&iommu_syscore_ops);
3615}
3616
3617#else
3618static inline void init_iommu_pm_ops(void) {}
3619#endif /* CONFIG_SUSPEND */
3620
3621static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3622{
3623 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3624 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3625 rmrr->end_address <= rmrr->base_address ||
3626 arch_rmrr_sanity_check(rmrr))
3627 return -EINVAL;
3628
3629 return 0;
3630}
3631
3632int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3633{
3634 struct acpi_dmar_reserved_memory *rmrr;
3635 struct dmar_rmrr_unit *rmrru;
3636
3637 rmrr = (struct acpi_dmar_reserved_memory *)header;
3638 if (rmrr_sanity_check(rmrr)) {
3639 pr_warn(FW_BUG
3640 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3641 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3642 rmrr->base_address, rmrr->end_address,
3643 dmi_get_system_info(DMI_BIOS_VENDOR),
3644 dmi_get_system_info(DMI_BIOS_VERSION),
3645 dmi_get_system_info(DMI_PRODUCT_VERSION));
3646 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3647 }
3648
3649 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3650 if (!rmrru)
3651 goto out;
3652
3653 rmrru->hdr = header;
3654
3655 rmrru->base_address = rmrr->base_address;
3656 rmrru->end_address = rmrr->end_address;
3657
3658 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3659 ((void *)rmrr) + rmrr->header.length,
3660 &rmrru->devices_cnt);
3661 if (rmrru->devices_cnt && rmrru->devices == NULL)
3662 goto free_rmrru;
3663
3664 list_add(&rmrru->list, &dmar_rmrr_units);
3665
3666 return 0;
3667free_rmrru:
3668 kfree(rmrru);
3669out:
3670 return -ENOMEM;
3671}
3672
3673static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3674{
3675 struct dmar_atsr_unit *atsru;
3676 struct acpi_dmar_atsr *tmp;
3677
3678 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3679 dmar_rcu_check()) {
3680 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3681 if (atsr->segment != tmp->segment)
3682 continue;
3683 if (atsr->header.length != tmp->header.length)
3684 continue;
3685 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3686 return atsru;
3687 }
3688
3689 return NULL;
3690}
3691
3692int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3693{
3694 struct acpi_dmar_atsr *atsr;
3695 struct dmar_atsr_unit *atsru;
3696
3697 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3698 return 0;
3699
3700 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3701 atsru = dmar_find_atsr(atsr);
3702 if (atsru)
3703 return 0;
3704
3705 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3706 if (!atsru)
3707 return -ENOMEM;
3708
3709 /*
3710 * If memory is allocated from slab by ACPI _DSM method, we need to
3711 * copy the memory content because the memory buffer will be freed
3712 * on return.
3713 */
3714 atsru->hdr = (void *)(atsru + 1);
3715 memcpy(atsru->hdr, hdr, hdr->length);
3716 atsru->include_all = atsr->flags & 0x1;
3717 if (!atsru->include_all) {
3718 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3719 (void *)atsr + atsr->header.length,
3720 &atsru->devices_cnt);
3721 if (atsru->devices_cnt && atsru->devices == NULL) {
3722 kfree(atsru);
3723 return -ENOMEM;
3724 }
3725 }
3726
3727 list_add_rcu(&atsru->list, &dmar_atsr_units);
3728
3729 return 0;
3730}
3731
3732static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3733{
3734 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3735 kfree(atsru);
3736}
3737
3738int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3739{
3740 struct acpi_dmar_atsr *atsr;
3741 struct dmar_atsr_unit *atsru;
3742
3743 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3744 atsru = dmar_find_atsr(atsr);
3745 if (atsru) {
3746 list_del_rcu(&atsru->list);
3747 synchronize_rcu();
3748 intel_iommu_free_atsr(atsru);
3749 }
3750
3751 return 0;
3752}
3753
3754int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3755{
3756 int i;
3757 struct device *dev;
3758 struct acpi_dmar_atsr *atsr;
3759 struct dmar_atsr_unit *atsru;
3760
3761 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3762 atsru = dmar_find_atsr(atsr);
3763 if (!atsru)
3764 return 0;
3765
3766 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3767 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3768 i, dev)
3769 return -EBUSY;
3770 }
3771
3772 return 0;
3773}
3774
3775static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3776{
3777 struct dmar_satc_unit *satcu;
3778 struct acpi_dmar_satc *tmp;
3779
3780 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3781 dmar_rcu_check()) {
3782 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3783 if (satc->segment != tmp->segment)
3784 continue;
3785 if (satc->header.length != tmp->header.length)
3786 continue;
3787 if (memcmp(satc, tmp, satc->header.length) == 0)
3788 return satcu;
3789 }
3790
3791 return NULL;
3792}
3793
3794int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3795{
3796 struct acpi_dmar_satc *satc;
3797 struct dmar_satc_unit *satcu;
3798
3799 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3800 return 0;
3801
3802 satc = container_of(hdr, struct acpi_dmar_satc, header);
3803 satcu = dmar_find_satc(satc);
3804 if (satcu)
3805 return 0;
3806
3807 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3808 if (!satcu)
3809 return -ENOMEM;
3810
3811 satcu->hdr = (void *)(satcu + 1);
3812 memcpy(satcu->hdr, hdr, hdr->length);
3813 satcu->atc_required = satc->flags & 0x1;
3814 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3815 (void *)satc + satc->header.length,
3816 &satcu->devices_cnt);
3817 if (satcu->devices_cnt && !satcu->devices) {
3818 kfree(satcu);
3819 return -ENOMEM;
3820 }
3821 list_add_rcu(&satcu->list, &dmar_satc_units);
3822
3823 return 0;
3824}
3825
3826static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3827{
3828 int sp, ret;
3829 struct intel_iommu *iommu = dmaru->iommu;
3830
3831 if (g_iommus[iommu->seq_id])
3832 return 0;
3833
3834 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3835 if (ret)
3836 goto out;
3837
3838 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3839 pr_warn("%s: Doesn't support hardware pass through.\n",
3840 iommu->name);
3841 return -ENXIO;
3842 }
3843 if (!ecap_sc_support(iommu->ecap) &&
3844 domain_update_iommu_snooping(iommu)) {
3845 pr_warn("%s: Doesn't support snooping.\n",
3846 iommu->name);
3847 return -ENXIO;
3848 }
3849 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3850 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3851 pr_warn("%s: Doesn't support large page.\n",
3852 iommu->name);
3853 return -ENXIO;
3854 }
3855
3856 /*
3857 * Disable translation if already enabled prior to OS handover.
3858 */
3859 if (iommu->gcmd & DMA_GCMD_TE)
3860 iommu_disable_translation(iommu);
3861
3862 g_iommus[iommu->seq_id] = iommu;
3863 ret = iommu_init_domains(iommu);
3864 if (ret == 0)
3865 ret = iommu_alloc_root_entry(iommu);
3866 if (ret)
3867 goto out;
3868
3869 intel_svm_check(iommu);
3870
3871 if (dmaru->ignored) {
3872 /*
3873 * we always have to disable PMRs or DMA may fail on this device
3874 */
3875 if (force_on)
3876 iommu_disable_protect_mem_regions(iommu);
3877 return 0;
3878 }
3879
3880 intel_iommu_init_qi(iommu);
3881 iommu_flush_write_buffer(iommu);
3882
3883#ifdef CONFIG_INTEL_IOMMU_SVM
3884 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3885 ret = intel_svm_enable_prq(iommu);
3886 if (ret)
3887 goto disable_iommu;
3888 }
3889#endif
3890 ret = dmar_set_interrupt(iommu);
3891 if (ret)
3892 goto disable_iommu;
3893
3894 iommu_set_root_entry(iommu);
3895 iommu_enable_translation(iommu);
3896
3897 iommu_disable_protect_mem_regions(iommu);
3898 return 0;
3899
3900disable_iommu:
3901 disable_dmar_iommu(iommu);
3902out:
3903 free_dmar_iommu(iommu);
3904 return ret;
3905}
3906
3907int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3908{
3909 int ret = 0;
3910 struct intel_iommu *iommu = dmaru->iommu;
3911
3912 if (!intel_iommu_enabled)
3913 return 0;
3914 if (iommu == NULL)
3915 return -EINVAL;
3916
3917 if (insert) {
3918 ret = intel_iommu_add(dmaru);
3919 } else {
3920 disable_dmar_iommu(iommu);
3921 free_dmar_iommu(iommu);
3922 }
3923
3924 return ret;
3925}
3926
3927static void intel_iommu_free_dmars(void)
3928{
3929 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3930 struct dmar_atsr_unit *atsru, *atsr_n;
3931 struct dmar_satc_unit *satcu, *satc_n;
3932
3933 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3934 list_del(&rmrru->list);
3935 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3936 kfree(rmrru);
3937 }
3938
3939 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3940 list_del(&atsru->list);
3941 intel_iommu_free_atsr(atsru);
3942 }
3943 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3944 list_del(&satcu->list);
3945 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3946 kfree(satcu);
3947 }
3948}
3949
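/*
 * Decide whether ATS may be used for the device: walk up to the PCIe
 * root port and allow ATS only if the device is integrated (no bridge)
 * or its root port is listed in an ATSR unit (or the ATSR is
 * include_all).
 */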
3950int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3951{
3952 int i, ret = 1;
3953 struct pci_bus *bus;
3954 struct pci_dev *bridge = NULL;
3955 struct device *tmp;
3956 struct acpi_dmar_atsr *atsr;
3957 struct dmar_atsr_unit *atsru;
3958
3959 dev = pci_physfn(dev);
3960 for (bus = dev->bus; bus; bus = bus->parent) {
3961 bridge = bus->self;
3962 /* If it's an integrated device, allow ATS */
3963 if (!bridge)
3964 return 1;
3965 /* Connected via non-PCIe: no ATS */
3966 if (!pci_is_pcie(bridge) ||
3967 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3968 return 0;
3969 /* If we found the root port, look it up in the ATSR */
3970 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3971 break;
3972 }
3973
3974 rcu_read_lock();
3975 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3976 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3977 if (atsr->segment != pci_domain_nr(dev->bus))
3978 continue;
3979
3980 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3981 if (tmp == &bridge->dev)
3982 goto out;
3983
3984 if (atsru->include_all)
3985 goto out;
3986 }
3987 ret = 0;
3988out:
3989 rcu_read_unlock();
3990
3991 return ret;
3992}
3993
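/*
 * PCI hotplug notification: keep the cached RMRR, ATSR and SATC device
 * scopes in sync by inserting the device on BUS_NOTIFY_ADD_DEVICE and
 * removing it on BUS_NOTIFY_REMOVED_DEVICE.
 */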
3994int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3995{
3996 int ret;
3997 struct dmar_rmrr_unit *rmrru;
3998 struct dmar_atsr_unit *atsru;
3999 struct dmar_satc_unit *satcu;
4000 struct acpi_dmar_atsr *atsr;
4001 struct acpi_dmar_reserved_memory *rmrr;
4002 struct acpi_dmar_satc *satc;
4003
4004 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4005 return 0;
4006
4007 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4008 rmrr = container_of(rmrru->hdr,
4009 struct acpi_dmar_reserved_memory, header);
4010 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4011 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4012 ((void *)rmrr) + rmrr->header.length,
4013 rmrr->segment, rmrru->devices,
4014 rmrru->devices_cnt);
4015 if (ret < 0)
4016 return ret;
4017 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4018 dmar_remove_dev_scope(info, rmrr->segment,
4019 rmrru->devices, rmrru->devices_cnt);
4020 }
4021 }
4022
4023 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4024 if (atsru->include_all)
4025 continue;
4026
4027 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4028 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4029 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4030 (void *)atsr + atsr->header.length,
4031 atsr->segment, atsru->devices,
4032 atsru->devices_cnt);
4033 if (ret > 0)
4034 break;
4035 else if (ret < 0)
4036 return ret;
4037 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4038 if (dmar_remove_dev_scope(info, atsr->segment,
4039 atsru->devices, atsru->devices_cnt))
4040 break;
4041 }
4042 }
4043 list_for_each_entry(satcu, &dmar_satc_units, list) {
4044 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
4045 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4046 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
4047 (void *)satc + satc->header.length,
4048 satc->segment, satcu->devices,
4049 satcu->devices_cnt);
4050 if (ret > 0)
4051 break;
4052 else if (ret < 0)
4053 return ret;
4054 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4055 if (dmar_remove_dev_scope(info, satc->segment,
4056 satcu->devices, satcu->devices_cnt))
4057 break;
4058 }
4059 }
4060
4061 return 0;
4062}
4063
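/*
 * Memory hotplug handler for the static identity domain: identity-map
 * a range when it is about to come online, and unmap it (flushing the
 * IOTLB on every active IOMMU) when it goes offline or onlining is
 * cancelled.
 */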
4064static int intel_iommu_memory_notifier(struct notifier_block *nb,
4065 unsigned long val, void *v)
4066{
4067 struct memory_notify *mhp = v;
4068 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4069 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4070 mhp->nr_pages - 1);
4071
4072 switch (val) {
4073 case MEM_GOING_ONLINE:
4074 if (iommu_domain_identity_map(si_domain,
4075 start_vpfn, last_vpfn)) {
4076 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4077 start_vpfn, last_vpfn);
4078 return NOTIFY_BAD;
4079 }
4080 break;
4081
4082 case MEM_OFFLINE:
4083 case MEM_CANCEL_ONLINE:
4084 {
4085 struct dmar_drhd_unit *drhd;
4086 struct intel_iommu *iommu;
4087 struct page *freelist;
4088
4089 freelist = domain_unmap(si_domain,
4090 start_vpfn, last_vpfn,
4091 NULL);
4092
4093 rcu_read_lock();
4094 for_each_active_iommu(iommu, drhd)
4095 iommu_flush_iotlb_psi(iommu, si_domain,
4096 start_vpfn, mhp->nr_pages,
4097 !freelist, 0);
4098 rcu_read_unlock();
4099 dma_free_pagelist(freelist);
4100 }
4101 break;
4102 }
4103
4104 return NOTIFY_OK;
4105}
4106
4107static struct notifier_block intel_iommu_memory_nb = {
4108 .notifier_call = intel_iommu_memory_notifier,
4109 .priority = 0
4110};
4111
4112static void intel_disable_iommus(void)
4113{
4114 struct intel_iommu *iommu = NULL;
4115 struct dmar_drhd_unit *drhd;
4116
4117 for_each_iommu(iommu, drhd)
4118 iommu_disable_translation(iommu);
4119}
4120
4121void intel_iommu_shutdown(void)
4122{
4123 struct dmar_drhd_unit *drhd;
4124 struct intel_iommu *iommu = NULL;
4125
4126 if (no_iommu || dmar_disabled)
4127 return;
4128
4129 down_write(&dmar_global_lock);
4130
4131 /* Disable PMRs explicitly here. */
4132 for_each_iommu(iommu, drhd)
4133 iommu_disable_protect_mem_regions(iommu);
4134
4135 /* Make sure the IOMMUs are switched off */
4136 intel_disable_iommus();
4137
4138 up_write(&dmar_global_lock);
4139}
4140
4141static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4142{
4143 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4144
4145 return container_of(iommu_dev, struct intel_iommu, iommu);
4146}
4147
4148static ssize_t version_show(struct device *dev,
4149 struct device_attribute *attr, char *buf)
4150{
4151 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4152 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4153 return sprintf(buf, "%d:%d\n",
4154 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4155}
4156static DEVICE_ATTR_RO(version);
4157
4158static ssize_t address_show(struct device *dev,
4159 struct device_attribute *attr, char *buf)
4160{
4161 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4162 return sprintf(buf, "%llx\n", iommu->reg_phys);
4163}
4164static DEVICE_ATTR_RO(address);
4165
4166static ssize_t cap_show(struct device *dev,
4167 struct device_attribute *attr, char *buf)
4168{
4169 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4170 return sprintf(buf, "%llx\n", iommu->cap);
4171}
4172static DEVICE_ATTR_RO(cap);
4173
4174static ssize_t ecap_show(struct device *dev,
4175 struct device_attribute *attr, char *buf)
4176{
4177 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4178 return sprintf(buf, "%llx\n", iommu->ecap);
4179}
4180static DEVICE_ATTR_RO(ecap);
4181
4182static ssize_t domains_supported_show(struct device *dev,
4183 struct device_attribute *attr, char *buf)
4184{
4185 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4186 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4187}
4188static DEVICE_ATTR_RO(domains_supported);
4189
4190static ssize_t domains_used_show(struct device *dev,
4191 struct device_attribute *attr, char *buf)
4192{
4193 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4194 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4195 cap_ndoms(iommu->cap)));
4196}
4197static DEVICE_ATTR_RO(domains_used);
4198
4199static struct attribute *intel_iommu_attrs[] = {
4200 &dev_attr_version.attr,
4201 &dev_attr_address.attr,
4202 &dev_attr_cap.attr,
4203 &dev_attr_ecap.attr,
4204 &dev_attr_domains_supported.attr,
4205 &dev_attr_domains_used.attr,
4206 NULL,
4207};
4208
4209static struct attribute_group intel_iommu_group = {
4210 .name = "intel-iommu",
4211 .attrs = intel_iommu_attrs,
4212};
4213
4214const struct attribute_group *intel_iommu_groups[] = {
4215 &intel_iommu_group,
4216 NULL,
4217};
4218
4219static inline bool has_external_pci(void)
4220{
4221 struct pci_dev *pdev = NULL;
4222
4223 for_each_pci_dev(pdev)
4224 if (pdev->external_facing)
4225 return true;
4226
4227 return false;
4228}
4229
4230static int __init platform_optin_force_iommu(void)
4231{
4232 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4233 return 0;
4234
4235 if (no_iommu || dmar_disabled)
4236 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4237
4238 /*
4239 * If Intel-IOMMU is disabled by default, we will apply identity
4240 * map for all devices except those marked as being untrusted.
4241 */
4242 if (dmar_disabled)
4243 iommu_set_default_passthrough(false);
4244
4245 dmar_disabled = 0;
4246 no_iommu = 0;
4247
4248 return 1;
4249}
4250
4251static int __init probe_acpi_namespace_devices(void)
4252{
4253 struct dmar_drhd_unit *drhd;
4254 /* To avoid a -Wunused-but-set-variable warning. */
4255 struct intel_iommu *iommu __maybe_unused;
4256 struct device *dev;
4257 int i, ret = 0;
4258
4259 for_each_active_iommu(iommu, drhd) {
4260 for_each_active_dev_scope(drhd->devices,
4261 drhd->devices_cnt, i, dev) {
4262 struct acpi_device_physical_node *pn;
4263 struct iommu_group *group;
4264 struct acpi_device *adev;
4265
4266 if (dev->bus != &acpi_bus_type)
4267 continue;
4268
4269 adev = to_acpi_device(dev);
4270 mutex_lock(&adev->physical_node_lock);
4271 list_for_each_entry(pn,
4272 &adev->physical_node_list, node) {
4273 group = iommu_group_get(pn->dev);
4274 if (group) {
4275 iommu_group_put(group);
4276 continue;
4277 }
4278
4279 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4280 ret = iommu_probe_device(pn->dev);
4281 if (ret)
4282 break;
4283 }
4284 mutex_unlock(&adev->physical_node_lock);
4285
4286 if (ret)
4287 return ret;
4288 }
4289 }
4290
4291 return 0;
4292}
4293
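/*
 * Top-level driver entry point: parse the DMAR table and device scopes,
 * bail out early (with translation and PMRs disabled) if the IOMMU is
 * not to be used, otherwise run init_dmars(), register each IOMMU with
 * sysfs and the IOMMU core, hook up the memory notifier, and finally
 * enable translation on every unit that was not already pre-enabled.
 */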
4294int __init intel_iommu_init(void)
4295{
4296 int ret = -ENODEV;
4297 struct dmar_drhd_unit *drhd;
4298 struct intel_iommu *iommu;
4299
4300 /*
4301 * Intel IOMMU is required for a TXT/tboot launch or platform
4302 * opt in, so enforce that.
4303 */
4304 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
4305 platform_optin_force_iommu();
4306
4307 if (iommu_init_mempool()) {
4308 if (force_on)
4309 panic("tboot: Failed to initialize iommu memory\n");
4310 return -ENOMEM;
4311 }
4312
4313 down_write(&dmar_global_lock);
4314 if (dmar_table_init()) {
4315 if (force_on)
4316 panic("tboot: Failed to initialize DMAR table\n");
4317 goto out_free_dmar;
4318 }
4319
4320 if (dmar_dev_scope_init() < 0) {
4321 if (force_on)
4322 panic("tboot: Failed to initialize DMAR device scope\n");
4323 goto out_free_dmar;
4324 }
4325
4326 up_write(&dmar_global_lock);
4327
4328 /*
4329	 * The bus notifier takes the dmar_global_lock, so lockdep would
4330	 * complain if we registered it while holding the lock.
4331 */
4332 dmar_register_bus_notifier();
4333
4334 down_write(&dmar_global_lock);
4335
4336 if (!no_iommu)
4337 intel_iommu_debugfs_init();
4338
4339 if (no_iommu || dmar_disabled) {
4340 /*
4341 * We exit the function here to ensure IOMMU's remapping and
4342		 * mempool aren't set up, which means that the IOMMU's PMRs
4343 * won't be disabled via the call to init_dmars(). So disable
4344 * it explicitly here. The PMRs were setup by tboot prior to
4345 * calling SENTER, but the kernel is expected to reset/tear
4346 * down the PMRs.
4347 */
4348 if (intel_iommu_tboot_noforce) {
4349 for_each_iommu(iommu, drhd)
4350 iommu_disable_protect_mem_regions(iommu);
4351 }
4352
4353 /*
4354 * Make sure the IOMMUs are switched off, even when we
4355 * boot into a kexec kernel and the previous kernel left
4356 * them enabled
4357 */
4358 intel_disable_iommus();
4359 goto out_free_dmar;
4360 }
4361
4362 if (list_empty(&dmar_rmrr_units))
4363 pr_info("No RMRR found\n");
4364
4365 if (list_empty(&dmar_atsr_units))
4366 pr_info("No ATSR found\n");
4367
4368 if (list_empty(&dmar_satc_units))
4369 pr_info("No SATC found\n");
4370
4371 if (dmar_map_gfx)
4372 intel_iommu_gfx_mapped = 1;
4373
4374 init_no_remapping_devices();
4375
4376 ret = init_dmars();
4377 if (ret) {
4378 if (force_on)
4379 panic("tboot: Failed to initialize DMARs\n");
4380 pr_err("Initialization failed\n");
4381 goto out_free_dmar;
4382 }
4383 up_write(&dmar_global_lock);
4384
4385 init_iommu_pm_ops();
4386
4387 down_read(&dmar_global_lock);
4388 for_each_active_iommu(iommu, drhd) {
4389 /*
4390 * The flush queue implementation does not perform
4391 * page-selective invalidations that are required for efficient
4392 * TLB flushes in virtual environments. The benefit of batching
4393 * is likely to be much lower than the overhead of synchronizing
4394 * the virtual and physical IOMMU page-tables.
4395 */
4396 if (!intel_iommu_strict && cap_caching_mode(iommu->cap)) {
4397			pr_warn("IOMMU batching is disabled due to virtualization\n");
4398 intel_iommu_strict = 1;
4399 }
4400 iommu_device_sysfs_add(&iommu->iommu, NULL,
4401 intel_iommu_groups,
4402 "%s", iommu->name);
4403 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4404 }
4405 up_read(&dmar_global_lock);
4406
4407 iommu_set_dma_strict(intel_iommu_strict);
4408 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4409 if (si_domain && !hw_pass_through)
4410 register_memory_notifier(&intel_iommu_memory_nb);
4411
4412 down_read(&dmar_global_lock);
4413 if (probe_acpi_namespace_devices())
4414 pr_warn("ACPI name space devices didn't probe correctly\n");
4415
4416 /* Finally, we enable the DMA remapping hardware. */
4417 for_each_iommu(iommu, drhd) {
4418 if (!drhd->ignored && !translation_pre_enabled(iommu))
4419 iommu_enable_translation(iommu);
4420
4421 iommu_disable_protect_mem_regions(iommu);
4422 }
4423 up_read(&dmar_global_lock);
4424
4425 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4426
4427 intel_iommu_enabled = 1;
4428
4429 return 0;
4430
4431out_free_dmar:
4432 intel_iommu_free_dmars();
4433 up_write(&dmar_global_lock);
4434 iommu_exit_mempool();
4435 return ret;
4436}
4437
4438static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4439{
4440 struct device_domain_info *info = opaque;
4441
4442 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4443 return 0;
4444}
4445
4446/*
4447 * NB - intel-iommu lacks any sort of reference counting for the users of
4448 * dependent devices. If multiple endpoints have intersecting dependent
4449 * devices, unbinding the driver from any one of them will possibly leave
4450 * the others unable to operate.
4451 */
4452static void domain_context_clear(struct device_domain_info *info)
4453{
4454 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4455 return;
4456
4457 pci_for_each_dma_alias(to_pci_dev(info->dev),
4458 &domain_context_clear_one_cb, info);
4459}
4460
4461static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4462{
4463 struct dmar_domain *domain;
4464 struct intel_iommu *iommu;
4465 unsigned long flags;
4466
4467 assert_spin_locked(&device_domain_lock);
4468
4469 if (WARN_ON(!info))
4470 return;
4471
4472 iommu = info->iommu;
4473 domain = info->domain;
4474
4475 if (info->dev && !dev_is_real_dma_subdevice(info->dev)) {
4476 if (dev_is_pci(info->dev) && sm_supported(iommu))
4477 intel_pasid_tear_down_entry(iommu, info->dev,
4478 PASID_RID2PASID, false);
4479
4480 iommu_disable_dev_iotlb(info);
4481 domain_context_clear(info);
4482 intel_pasid_free_table(info->dev);
4483 }
4484
4485 unlink_domain_info(info);
4486
4487 spin_lock_irqsave(&iommu->lock, flags);
4488 domain_detach_iommu(domain, iommu);
4489 spin_unlock_irqrestore(&iommu->lock, flags);
4490
4491 free_devinfo_mem(info);
4492}
4493
4494static void dmar_remove_one_dev_info(struct device *dev)
4495{
4496 struct device_domain_info *info;
4497 unsigned long flags;
4498
4499 spin_lock_irqsave(&device_domain_lock, flags);
4500 info = get_domain_info(dev);
4501 if (info)
4502 __dmar_remove_one_dev_info(info);
4503 spin_unlock_irqrestore(&device_domain_lock, flags);
4504}
4505
4506static int md_domain_init(struct dmar_domain *domain, int guest_width)
4507{
4508 int adjust_width;
4509
4510 /* calculate AGAW */
4511 domain->gaw = guest_width;
4512 adjust_width = guestwidth_to_adjustwidth(guest_width);
4513 domain->agaw = width_to_agaw(adjust_width);
4514
4515 domain->iommu_coherency = false;
4516 domain->iommu_snooping = false;
4517 domain->iommu_superpage = 0;
4518 domain->max_addr = 0;
4519
4520 /* always allocate the top pgd */
4521 domain->pgd = alloc_pgtable_page(domain->nid);
4522 if (!domain->pgd)
4523 return -ENOMEM;
4524 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4525 return 0;
4526}
4527
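/*
 * IOMMU core callback for domain allocation: DMA and unmanaged domains
 * get a freshly initialized dmar_domain (with a DMA cookie for the DMA
 * case), while identity requests share the single global si_domain.
 */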
4528static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4529{
4530 struct dmar_domain *dmar_domain;
4531 struct iommu_domain *domain;
4532
4533 switch (type) {
4534 case IOMMU_DOMAIN_DMA:
4535 case IOMMU_DOMAIN_UNMANAGED:
4536 dmar_domain = alloc_domain(0);
4537 if (!dmar_domain) {
4538 pr_err("Can't allocate dmar_domain\n");
4539 return NULL;
4540 }
4541 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4542 pr_err("Domain initialization failed\n");
4543 domain_exit(dmar_domain);
4544 return NULL;
4545 }
4546
4547 if (type == IOMMU_DOMAIN_DMA &&
4548 iommu_get_dma_cookie(&dmar_domain->domain))
4549 return NULL;
4550
4551 domain = &dmar_domain->domain;
4552 domain->geometry.aperture_start = 0;
4553 domain->geometry.aperture_end =
4554 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4555 domain->geometry.force_aperture = true;
4556
4557 return domain;
4558 case IOMMU_DOMAIN_IDENTITY:
4559 return &si_domain->domain;
4560 default:
4561 return NULL;
4562 }
4563
4564 return NULL;
4565}
4566
4567static void intel_iommu_domain_free(struct iommu_domain *domain)
4568{
4569 if (domain != &si_domain->domain)
4570 domain_exit(to_dmar_domain(domain));
4571}
4572
4573/*
4574 * Check whether a @domain could be attached to the @dev through the
4575 * aux-domain attach/detach APIs.
4576 */
4577static inline bool
4578is_aux_domain(struct device *dev, struct iommu_domain *domain)
4579{
4580 struct device_domain_info *info = get_domain_info(dev);
4581
4582 return info && info->auxd_enabled &&
4583 domain->type == IOMMU_DOMAIN_UNMANAGED;
4584}
4585
4586static inline struct subdev_domain_info *
4587lookup_subdev_info(struct dmar_domain *domain, struct device *dev)
4588{
4589 struct subdev_domain_info *sinfo;
4590
4591 if (!list_empty(&domain->subdevices)) {
4592 list_for_each_entry(sinfo, &domain->subdevices, link_domain) {
4593 if (sinfo->pdev == dev)
4594 return sinfo;
4595 }
4596 }
4597
4598 return NULL;
4599}
4600
4601static int auxiliary_link_device(struct dmar_domain *domain,
4602 struct device *dev)
4603{
4604 struct device_domain_info *info = get_domain_info(dev);
4605 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4606
4607 assert_spin_locked(&device_domain_lock);
4608 if (WARN_ON(!info))
4609 return -EINVAL;
4610
4611 if (!sinfo) {
4612 sinfo = kzalloc(sizeof(*sinfo), GFP_ATOMIC);
4613 if (!sinfo)
4614 return -ENOMEM;
4615 sinfo->domain = domain;
4616 sinfo->pdev = dev;
4617 list_add(&sinfo->link_phys, &info->subdevices);
4618 list_add(&sinfo->link_domain, &domain->subdevices);
4619 }
4620
4621 return ++sinfo->users;
4622}
4623
4624static int auxiliary_unlink_device(struct dmar_domain *domain,
4625 struct device *dev)
4626{
4627 struct device_domain_info *info = get_domain_info(dev);
4628 struct subdev_domain_info *sinfo = lookup_subdev_info(domain, dev);
4629 int ret;
4630
4631 assert_spin_locked(&device_domain_lock);
4632 if (WARN_ON(!info || !sinfo || sinfo->users <= 0))
4633 return -EINVAL;
4634
4635 ret = --sinfo->users;
4636 if (!ret) {
4637 list_del(&sinfo->link_phys);
4638 list_del(&sinfo->link_domain);
4639 kfree(sinfo);
4640 }
4641
4642 return ret;
4643}
4644
4645static int aux_domain_add_dev(struct dmar_domain *domain,
4646 struct device *dev)
4647{
4648 int ret;
4649 unsigned long flags;
4650 struct intel_iommu *iommu;
4651
4652 iommu = device_to_iommu(dev, NULL, NULL);
4653 if (!iommu)
4654 return -ENODEV;
4655
4656 if (domain->default_pasid <= 0) {
4657 u32 pasid;
4658
4659 /* No private data needed for the default pasid */
4660 pasid = ioasid_alloc(NULL, PASID_MIN,
4661 pci_max_pasids(to_pci_dev(dev)) - 1,
4662 NULL);
4663 if (pasid == INVALID_IOASID) {
4664 pr_err("Can't allocate default pasid\n");
4665 return -ENODEV;
4666 }
4667 domain->default_pasid = pasid;
4668 }
4669
4670 spin_lock_irqsave(&device_domain_lock, flags);
4671 ret = auxiliary_link_device(domain, dev);
4672 if (ret <= 0)
4673 goto link_failed;
4674
4675 /*
4676 * Subdevices from the same physical device can be attached to the
4677 * same domain. For such cases, only the first subdevice attachment
4678 * needs to go through the full steps in this function. So if ret >
4679 * 1, just goto out.
4680 */
4681 if (ret > 1)
4682 goto out;
4683
4684 /*
4685 * iommu->lock must be held to attach domain to iommu and setup the
4686 * pasid entry for second level translation.
4687 */
4688 spin_lock(&iommu->lock);
4689 ret = domain_attach_iommu(domain, iommu);
4690 if (ret)
4691 goto attach_failed;
4692
4693 /* Setup the PASID entry for mediated devices: */
4694 if (domain_use_first_level(domain))
4695 ret = domain_setup_first_level(iommu, domain, dev,
4696 domain->default_pasid);
4697 else
4698 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4699 domain->default_pasid);
4700 if (ret)
4701 goto table_failed;
4702
4703 spin_unlock(&iommu->lock);
4704out:
4705 spin_unlock_irqrestore(&device_domain_lock, flags);
4706
4707 return 0;
4708
4709table_failed:
4710 domain_detach_iommu(domain, iommu);
4711attach_failed:
4712 spin_unlock(&iommu->lock);
4713 auxiliary_unlink_device(domain, dev);
4714link_failed:
4715 spin_unlock_irqrestore(&device_domain_lock, flags);
4716 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4717 ioasid_put(domain->default_pasid);
4718
4719 return ret;
4720}
4721
4722static void aux_domain_remove_dev(struct dmar_domain *domain,
4723 struct device *dev)
4724{
4725 struct device_domain_info *info;
4726 struct intel_iommu *iommu;
4727 unsigned long flags;
4728
4729 if (!is_aux_domain(dev, &domain->domain))
4730 return;
4731
4732 spin_lock_irqsave(&device_domain_lock, flags);
4733 info = get_domain_info(dev);
4734 iommu = info->iommu;
4735
4736 if (!auxiliary_unlink_device(domain, dev)) {
4737 spin_lock(&iommu->lock);
4738 intel_pasid_tear_down_entry(iommu, dev,
4739 domain->default_pasid, false);
4740 domain_detach_iommu(domain, iommu);
4741 spin_unlock(&iommu->lock);
4742 }
4743
4744 spin_unlock_irqrestore(&device_domain_lock, flags);
4745
4746 if (list_empty(&domain->subdevices) && domain->default_pasid > 0)
4747 ioasid_put(domain->default_pasid);
4748}
4749
4750static int prepare_domain_attach_device(struct iommu_domain *domain,
4751 struct device *dev)
4752{
4753 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4754 struct intel_iommu *iommu;
4755 int addr_width;
4756
4757 iommu = device_to_iommu(dev, NULL, NULL);
4758 if (!iommu)
4759 return -ENODEV;
4760
4761 if ((dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE) &&
4762 !ecap_nest(iommu->ecap)) {
4763		dev_err(dev, "%s: iommu does not support nested translation\n",
4764 iommu->name);
4765 return -EINVAL;
4766 }
4767
4768 /* check if this iommu agaw is sufficient for max mapped address */
4769 addr_width = agaw_to_width(iommu->agaw);
4770 if (addr_width > cap_mgaw(iommu->cap))
4771 addr_width = cap_mgaw(iommu->cap);
4772
4773 if (dmar_domain->max_addr > (1LL << addr_width)) {
4774 dev_err(dev, "%s: iommu width (%d) is not "
4775 "sufficient for the mapped address (%llx)\n",
4776 __func__, addr_width, dmar_domain->max_addr);
4777 return -EFAULT;
4778 }
4779 dmar_domain->gaw = addr_width;
4780
4781 /*
4782	 * Knock out extra page-table levels if the domain's AGAW exceeds what this IOMMU supports
4783 */
4784 while (iommu->agaw < dmar_domain->agaw) {
4785 struct dma_pte *pte;
4786
4787 pte = dmar_domain->pgd;
4788 if (dma_pte_present(pte)) {
4789 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4790 free_pgtable_page(pte);
4791 }
4792 dmar_domain->agaw--;
4793 }
4794
4795 return 0;
4796}
4797
4798static int intel_iommu_attach_device(struct iommu_domain *domain,
4799 struct device *dev)
4800{
4801 int ret;
4802
4803 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4804 device_is_rmrr_locked(dev)) {
4805 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4806 return -EPERM;
4807 }
4808
4809 if (is_aux_domain(dev, domain))
4810 return -EPERM;
4811
4812 /* normally dev is not mapped */
4813 if (unlikely(domain_context_mapped(dev))) {
4814 struct dmar_domain *old_domain;
4815
4816 old_domain = find_domain(dev);
4817 if (old_domain)
4818 dmar_remove_one_dev_info(dev);
4819 }
4820
4821 ret = prepare_domain_attach_device(domain, dev);
4822 if (ret)
4823 return ret;
4824
4825 return domain_add_dev_info(to_dmar_domain(domain), dev);
4826}
4827
4828static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
4829 struct device *dev)
4830{
4831 int ret;
4832
4833 if (!is_aux_domain(dev, domain))
4834 return -EPERM;
4835
4836 ret = prepare_domain_attach_device(domain, dev);
4837 if (ret)
4838 return ret;
4839
4840 return aux_domain_add_dev(to_dmar_domain(domain), dev);
4841}
4842
4843static void intel_iommu_detach_device(struct iommu_domain *domain,
4844 struct device *dev)
4845{
4846 dmar_remove_one_dev_info(dev);
4847}
4848
4849static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
4850 struct device *dev)
4851{
4852 aux_domain_remove_dev(to_dmar_domain(domain), dev);
4853}
4854
4855#ifdef CONFIG_INTEL_IOMMU_SVM
4856/*
4857 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
4858 * VT-d granularity. Invalidation is typically included in the unmap operation
4859 * as a result of DMA or VFIO unmap. However, for assigned devices guest
4860 * owns the first level page tables. Invalidations of translation caches in the
4861 * guest are trapped and passed down to the host.
4862 *
4863 * vIOMMU in the guest will only expose first level page tables, therefore
4864 * we do not support IOTLB granularity for requests without PASID (second level).
4865 *
4866 * For example, to find the VT-d granularity encoding for IOTLB
4867 * type and page selective granularity within PASID:
4868 * X: indexed by iommu cache type
4869 * Y: indexed by enum iommu_inv_granularity
4870 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
4871 */
4872
4873static const int
4874inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
4875 /*
4876 * PASID based IOTLB invalidation: PASID selective (per PASID),
4877 * page selective (address granularity)
4878 */
4879 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
4880 /* PASID based dev TLBs */
4881 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
4882 /* PASID cache */
4883 {-EINVAL, -EINVAL, -EINVAL}
4884};
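/*
 * For instance, to_vtd_granularity(IOMMU_CACHE_INV_TYPE_IOTLB,
 * IOMMU_INV_GRANU_ADDR) evaluates to QI_GRAN_PSI_PASID, while the
 * -EINVAL slots mark combinations that have no VT-d equivalent and are
 * rejected by the caller.
 */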
4885
4886static inline int to_vtd_granularity(int type, int granu)
4887{
4888 return inv_type_granu_table[type][granu];
4889}
4890
4891static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
4892{
4893 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
4894
4895	/* VT-d size is encoded as 2^size of 4K pages: 0 for 4K, 9 for 2MB, etc.
4896	 * The IOMMU cache invalidate API passes granu_size in bytes and the
4897	 * number of granules in contiguous memory.
4898 */
4899 return order_base_2(nr_pages);
4900}
4901
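/*
 * Passdown cache invalidation from a vIOMMU: translate the generic
 * iommu_cache_invalidate_info into VT-d queued invalidation requests.
 * Only nested (DOMAIN_FLAG_NESTING_MODE) domains are eligible, since the
 * guest owns the first-level page tables.  IOTLB requests additionally
 * flush the device IOTLB when ATS is enabled, so a guest may treat an
 * IOTLB flush as inclusive of the device TLB.
 */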
static int
intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
                           struct iommu_cache_invalidate_info *inv_info)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct device_domain_info *info;
        struct intel_iommu *iommu;
        unsigned long flags;
        int cache_type;
        u8 bus, devfn;
        u16 did, sid;
        int ret = 0;
        u64 size = 0;

        if (!inv_info || !dmar_domain)
                return -EINVAL;

        if (!dev || !dev_is_pci(dev))
                return -ENODEV;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
                return -EINVAL;

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);
        info = get_domain_info(dev);
        if (!info) {
                ret = -EINVAL;
                goto out_unlock;
        }
        did = dmar_domain->iommu_did[iommu->seq_id];
        sid = PCI_DEVID(bus, devfn);

        /* Size is only valid in address selective invalidation */
        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
                size = to_vtd_size(inv_info->granu.addr_info.granule_size,
                                   inv_info->granu.addr_info.nb_granules);

        for_each_set_bit(cache_type,
                         (unsigned long *)&inv_info->cache,
                         IOMMU_CACHE_INV_TYPE_NR) {
                int granu = 0;
                u64 pasid = 0;
                u64 addr = 0;

                granu = to_vtd_granularity(cache_type, inv_info->granularity);
                if (granu == -EINVAL) {
                        pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
                                           cache_type, inv_info->granularity);
                        break;
                }

                /*
                 * PASID is stored in different locations based on the
                 * granularity.
                 */
                if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
                    (inv_info->granu.pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
                        pasid = inv_info->granu.pasid_info.pasid;
                else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
                         (inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
                        pasid = inv_info->granu.addr_info.pasid;

                switch (BIT(cache_type)) {
                case IOMMU_CACHE_INV_TYPE_IOTLB:
                        /* HW will ignore LSB bits based on address mask */
                        if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
                            size &&
                            (inv_info->granu.addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
                                pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
                                                   inv_info->granu.addr_info.addr, size);
                        }

                        /*
                         * If granu is PASID-selective, address is ignored.
                         * We use npages = -1 to indicate that.
                         */
                        qi_flush_piotlb(iommu, did, pasid,
                                        mm_to_dma_pfn(inv_info->granu.addr_info.addr),
                                        (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
                                        inv_info->granu.addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);

                        if (!info->ats_enabled)
                                break;
                        /*
                         * Always flush device IOTLB if ATS is enabled. vIOMMU
                         * in the guest may assume IOTLB flush is inclusive,
                         * which is more efficient.
                         */
                        fallthrough;
                case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
                        /*
                         * PASID based device TLB invalidation does not support
                         * IOMMU_INV_GRANU_PASID granularity but only supports
                         * IOMMU_INV_GRANU_ADDR.
                         * The equivalent here is to set the size to the entire
                         * 64-bit address range.  The user only provides PASID
                         * info without address info, so we set addr to 0.
                         */
                        if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
                                size = 64 - VTD_PAGE_SHIFT;
                                addr = 0;
                        } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
                                addr = inv_info->granu.addr_info.addr;
                        }

                        if (info->ats_enabled)
                                qi_flush_dev_iotlb_pasid(iommu, sid,
                                                         info->pfsid, pasid,
                                                         info->ats_qdep, addr,
                                                         size);
                        else
                                pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
                        break;
                default:
                        dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
                                            cache_type);
                        ret = -EINVAL;
                }
        }
out_unlock:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}
#endif

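/*
 * Map a physically contiguous range at @iova: translate IOMMU_READ/WRITE/
 * CACHE into DMA PTE bits, verify that the end of the mapping fits within
 * the domain's address width (gaw), then hand the page-aligned range to
 * __domain_mapping().
 */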
static int intel_iommu_map(struct iommu_domain *domain,
                           unsigned long iova, phys_addr_t hpa,
                           size_t size, int iommu_prot, gfp_t gfp)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        u64 max_addr;
        int prot = 0;

        if (iommu_prot & IOMMU_READ)
                prot |= DMA_PTE_READ;
        if (iommu_prot & IOMMU_WRITE)
                prot |= DMA_PTE_WRITE;
        if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
                prot |= DMA_PTE_SNP;

        max_addr = iova + size;
        if (dmar_domain->max_addr < max_addr) {
                u64 end;

                /* check if minimum agaw is sufficient for mapped address */
                end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
                if (end < max_addr) {
                        pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
                               __func__, dmar_domain->gaw, max_addr);
                        return -EFAULT;
                }
                dmar_domain->max_addr = max_addr;
        }
        /* Round up size to next multiple of PAGE_SIZE, if it and
           the low bits of hpa would take us onto the next page */
        size = aligned_nrpages(hpa, size);
        return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
                                hpa >> VTD_PAGE_SHIFT, size, prot);
}

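/*
 * Unmap at least @size bytes at @iova.  If the IOVA sits inside a
 * large-page mapping, the whole large page is torn down and the larger
 * size is returned.  Freed page-table pages are collected on the gather
 * freelist; the IOTLB flush itself is deferred to intel_iommu_tlb_sync().
 */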
static size_t intel_iommu_unmap(struct iommu_domain *domain,
                                unsigned long iova, size_t size,
                                struct iommu_iotlb_gather *gather)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long start_pfn, last_pfn;
        int level = 0;

        /* Cope with horrid API which requires us to unmap more than the
           size argument if it happens to be a large-page mapping. */
        BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

        if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
                size = VTD_PAGE_SIZE << level_to_offset_bits(level);

        start_pfn = iova >> VTD_PAGE_SHIFT;
        last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

        gather->freelist = domain_unmap(dmar_domain, start_pfn,
                                        last_pfn, gather->freelist);

        if (dmar_domain->max_addr == iova + size)
                dmar_domain->max_addr = iova;

        iommu_iotlb_gather_add_page(domain, gather, iova, size);

        return size;
}

static void intel_iommu_tlb_sync(struct iommu_domain *domain,
                                 struct iommu_iotlb_gather *gather)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long iova_pfn = IOVA_PFN(gather->start);
        size_t size = gather->end - gather->start;
        unsigned long start_pfn;
        unsigned long nrpages;
        int iommu_id;

        nrpages = aligned_nrpages(gather->start, size);
        start_pfn = mm_to_dma_pfn(iova_pfn);

        for_each_domain_iommu(iommu_id, dmar_domain)
                iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
                                      start_pfn, nrpages, !gather->freelist, 0);

        dma_free_pagelist(gather->freelist);
}

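/*
 * Translate an IOVA back to a physical address by walking the page table.
 * The in-page offset depends on the level of the leaf PTE: a level-1
 * (4KiB) leaf keeps the low 12 bits of the IOVA, a level-2 (2MiB)
 * superpage leaf keeps the low 21 bits, and so on, which is what the
 * BIT_MASK(level_to_offset_bits(level) + VTD_PAGE_SHIFT) arithmetic below
 * computes.
 */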
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
                                            dma_addr_t iova)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        struct dma_pte *pte;
        int level = 0;
        u64 phys = 0;

        pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
        if (pte && dma_pte_present(pte))
                phys = dma_pte_addr(pte) +
                        (iova & (BIT_MASK(level_to_offset_bits(level) +
                                          VTD_PAGE_SHIFT) - 1));

        return phys;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
        if (cap == IOMMU_CAP_CACHE_COHERENCY)
                return domain_update_iommu_snooping(NULL);
        if (cap == IOMMU_CAP_INTR_REMAP)
                return irq_remapping_enabled == 1;

        return false;
}

static struct iommu_device *intel_iommu_probe_device(struct device *dev)
{
        struct intel_iommu *iommu;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu)
                return ERR_PTR(-ENODEV);

        if (translation_pre_enabled(iommu))
                dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);

        return &iommu->iommu;
}

static void intel_iommu_release_device(struct device *dev)
{
        struct intel_iommu *iommu;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu)
                return;

        dmar_remove_one_dev_info(dev);

        set_dma_ops(dev, NULL);
}

static void intel_iommu_probe_finalize(struct device *dev)
{
        struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

        if (domain && domain->type == IOMMU_DOMAIN_DMA)
                iommu_setup_dma_ops(dev, 0, U64_MAX);
        else
                set_dma_ops(dev, NULL);
}

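/*
 * Report the reserved regions for a device: every RMRR that targets the
 * device (or a device behind the same PCI bridge) becomes a direct-mapped
 * region, an optional 0-16MiB relaxable region covers the legacy floppy
 * workaround for ISA bridges, and the IOAPIC MMIO window is reported as
 * an MSI region.
 */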
static void intel_iommu_get_resv_regions(struct device *device,
                                         struct list_head *head)
{
        int prot = DMA_PTE_READ | DMA_PTE_WRITE;
        struct iommu_resv_region *reg;
        struct dmar_rmrr_unit *rmrr;
        struct device *i_dev;
        int i;

        down_read(&dmar_global_lock);
        for_each_rmrr_units(rmrr) {
                for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
                                          i, i_dev) {
                        struct iommu_resv_region *resv;
                        enum iommu_resv_type type;
                        size_t length;

                        if (i_dev != device &&
                            !is_downstream_to_pci_bridge(device, i_dev))
                                continue;

                        length = rmrr->end_address - rmrr->base_address + 1;

                        type = device_rmrr_is_relaxable(device) ?
                                IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

                        resv = iommu_alloc_resv_region(rmrr->base_address,
                                                       length, prot, type);
                        if (!resv)
                                break;

                        list_add_tail(&resv->list, head);
                }
        }
        up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
        if (dev_is_pci(device)) {
                struct pci_dev *pdev = to_pci_dev(device);

                if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
                        reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
                                                      IOMMU_RESV_DIRECT_RELAXABLE);
                        if (reg)
                                list_add_tail(&reg->list, head);
                }
        }
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

        reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
                                      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
                                      0, IOMMU_RESV_MSI);
        if (!reg)
                return;
        list_add_tail(&reg->list, head);
}

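/*
 * Enable PASID support for @dev: mark the device's context entry as
 * PASID enabled (CONTEXT_PASIDE), flush the context cache for that
 * device, and turn on the device-side capabilities via
 * iommu_enable_dev_iotlb() if PASID was not already enabled there.
 */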
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
        struct device_domain_info *info;
        struct context_entry *context;
        struct dmar_domain *domain;
        unsigned long flags;
        u64 ctx_lo;
        int ret;

        domain = find_domain(dev);
        if (!domain)
                return -EINVAL;

        spin_lock_irqsave(&device_domain_lock, flags);
        spin_lock(&iommu->lock);

        ret = -EINVAL;
        info = get_domain_info(dev);
        if (!info || !info->pasid_supported)
                goto out;

        context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
        if (WARN_ON(!context))
                goto out;

        ctx_lo = context[0].lo;

        if (!(ctx_lo & CONTEXT_PASIDE)) {
                ctx_lo |= CONTEXT_PASIDE;
                context[0].lo = ctx_lo;
                wmb();
                iommu->flush.flush_context(iommu,
                                           domain->iommu_did[iommu->seq_id],
                                           PCI_DEVID(info->bus, info->devfn),
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
        }

        /* Enable PASID support in the device, if it wasn't already */
        if (!info->pasid_enabled)
                iommu_enable_dev_iotlb(info);

        ret = 0;

 out:
        spin_unlock(&iommu->lock);
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}

static struct iommu_group *intel_iommu_device_group(struct device *dev)
{
        if (dev_is_pci(dev))
                return pci_device_group(dev);
        return generic_device_group(dev);
}

static int intel_iommu_enable_auxd(struct device *dev)
{
        struct device_domain_info *info;
        struct intel_iommu *iommu;
        unsigned long flags;
        int ret;

        iommu = device_to_iommu(dev, NULL, NULL);
        if (!iommu || dmar_disabled)
                return -EINVAL;

        if (!sm_supported(iommu) || !pasid_supported(iommu))
                return -EINVAL;

        ret = intel_iommu_enable_pasid(iommu, dev);
        if (ret)
                return -ENODEV;

        spin_lock_irqsave(&device_domain_lock, flags);
        info = get_domain_info(dev);
        info->auxd_enabled = 1;
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
        struct device_domain_info *info;
        unsigned long flags;

        spin_lock_irqsave(&device_domain_lock, flags);
        info = get_domain_info(dev);
        if (!WARN_ON(!info))
                info->auxd_enabled = 0;
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return 0;
}

static int intel_iommu_enable_sva(struct device *dev)
{
        struct device_domain_info *info = get_domain_info(dev);
        struct intel_iommu *iommu;
        int ret;

        if (!info || dmar_disabled)
                return -EINVAL;

        iommu = info->iommu;
        if (!iommu)
                return -EINVAL;

        if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
                return -ENODEV;

        if (intel_iommu_enable_pasid(iommu, dev))
                return -ENODEV;

        if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
                return -EINVAL;

        ret = iopf_queue_add_device(iommu->iopf_queue, dev);
        if (!ret)
                ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);

        return ret;
}

static int intel_iommu_disable_sva(struct device *dev)
{
        struct device_domain_info *info = get_domain_info(dev);
        struct intel_iommu *iommu = info->iommu;
        int ret;

        ret = iommu_unregister_device_fault_handler(dev);
        if (!ret)
                ret = iopf_queue_remove_device(iommu->iopf_queue, dev);

        return ret;
}

/*
 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
 * specification so that system software and tools can detect endpoint
 * devices supporting Intel Scalable I/O Virtualization without a host
 * driver dependency.
 *
 * Returns the config space offset of the matching extended capability
 * structure, or 0 if the device does not support it.
 */
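/*
 * Layout note, per the PCIe DVSEC definition: the DVSEC Vendor ID sits in
 * the low 16 bits at offset 0x4 from the capability header and the DVSEC
 * ID in the low 16 bits at offset 0x8, which is what the two config reads
 * below pick up.  DVSEC ID 5 is the SIOV DVSEC defined by the Intel spec
 * referenced above.
 */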
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
        int pos;
        u16 vendor, id;

        pos = pci_find_next_ext_capability(pdev, 0, 0x23);
        while (pos) {
                pci_read_config_word(pdev, pos + 4, &vendor);
                pci_read_config_word(pdev, pos + 8, &id);
                if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
                        return pos;

                pos = pci_find_next_ext_capability(pdev, pos, 0x23);
        }

        return 0;
}

static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
        struct device_domain_info *info = get_domain_info(dev);

        if (feat == IOMMU_DEV_FEAT_AUX) {
                int ret;

                if (!dev_is_pci(dev) || dmar_disabled ||
                    !scalable_mode_support() || !pasid_mode_support())
                        return false;

                ret = pci_pasid_features(to_pci_dev(dev));
                if (ret < 0)
                        return false;

                return !!siov_find_pci_dvsec(to_pci_dev(dev));
        }

        if (feat == IOMMU_DEV_FEAT_IOPF)
                return info && info->pri_supported;

        if (feat == IOMMU_DEV_FEAT_SVA)
                return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
                        info->pasid_supported && info->pri_supported &&
                        info->ats_supported;

        return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
        switch (feat) {
        case IOMMU_DEV_FEAT_AUX:
                return intel_iommu_enable_auxd(dev);

        case IOMMU_DEV_FEAT_IOPF:
                return intel_iommu_dev_has_feat(dev, feat) ? 0 : -ENODEV;

        case IOMMU_DEV_FEAT_SVA:
                return intel_iommu_enable_sva(dev);

        default:
                return -ENODEV;
        }
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
        switch (feat) {
        case IOMMU_DEV_FEAT_AUX:
                return intel_iommu_disable_auxd(dev);

        case IOMMU_DEV_FEAT_IOPF:
                return 0;

        case IOMMU_DEV_FEAT_SVA:
                return intel_iommu_disable_sva(dev);

        default:
                return -ENODEV;
        }
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
        struct device_domain_info *info = get_domain_info(dev);

        if (feat == IOMMU_DEV_FEAT_AUX)
                return scalable_mode_support() && info && info->auxd_enabled;

        return false;
}

static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);

        return dmar_domain->default_pasid > 0 ?
                        dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
                                           struct device *dev)
{
        return attach_deferred(dev);
}

static int
intel_iommu_enable_nesting(struct iommu_domain *domain)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long flags;
        int ret = -ENODEV;

        spin_lock_irqsave(&device_domain_lock, flags);
        if (list_empty(&dmar_domain->devices)) {
                dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
                dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
                ret = 0;
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        return ret;
}

/*
 * Check that the device does not live on an external facing PCI port that is
 * marked as untrusted. Such devices should not be able to apply quirks and
 * thus not be able to bypass the IOMMU restrictions.
 */
static bool risky_device(struct pci_dev *pdev)
{
        if (pdev->untrusted) {
                pci_info(pdev,
                         "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
                         pdev->vendor, pdev->device);
                pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
                return true;
        }
        return false;
}

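/*
 * When the IOMMU's page-table walks are not cache coherent, newly written
 * PTEs must be flushed out of the CPU caches before the hardware can see
 * them.  Walk the mapped range and batch the cache flush per page-table
 * page (or per large-page leaf) via domain_flush_cache().
 */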
static void clflush_sync_map(struct dmar_domain *domain, unsigned long clf_pfn,
                             unsigned long clf_pages)
{
        struct dma_pte *first_pte = NULL, *pte = NULL;
        unsigned long lvl_pages = 0;
        int level = 0;

        while (clf_pages > 0) {
                if (!pte) {
                        level = 0;
                        pte = pfn_to_dma_pte(domain, clf_pfn, &level);
                        if (WARN_ON(!pte))
                                return;
                        first_pte = pte;
                        lvl_pages = lvl_to_nr_pages(level);
                }

                if (WARN_ON(!lvl_pages || clf_pages < lvl_pages))
                        return;

                clf_pages -= lvl_pages;
                clf_pfn += lvl_pages;
                pte++;

                if (!clf_pages || first_pte_in_page(pte) ||
                    (level > 1 && clf_pages < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
        }
}

static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
                                       unsigned long iova, size_t size)
{
        struct dmar_domain *dmar_domain = to_dmar_domain(domain);
        unsigned long pages = aligned_nrpages(iova, size);
        unsigned long pfn = iova >> VTD_PAGE_SHIFT;
        struct intel_iommu *iommu;
        int iommu_id;

        if (!dmar_domain->iommu_coherency)
                clflush_sync_map(dmar_domain, pfn, pages);

        for_each_domain_iommu(iommu_id, dmar_domain) {
                iommu = g_iommus[iommu_id];
                __mapping_notify_one(iommu, dmar_domain, pfn, pages);
        }
}

const struct iommu_ops intel_iommu_ops = {
        .capable                = intel_iommu_capable,
        .domain_alloc           = intel_iommu_domain_alloc,
        .domain_free            = intel_iommu_domain_free,
        .enable_nesting         = intel_iommu_enable_nesting,
        .attach_dev             = intel_iommu_attach_device,
        .detach_dev             = intel_iommu_detach_device,
        .aux_attach_dev         = intel_iommu_aux_attach_device,
        .aux_detach_dev         = intel_iommu_aux_detach_device,
        .aux_get_pasid          = intel_iommu_aux_get_pasid,
        .map                    = intel_iommu_map,
        .iotlb_sync_map         = intel_iommu_iotlb_sync_map,
        .unmap                  = intel_iommu_unmap,
        .flush_iotlb_all        = intel_flush_iotlb_all,
        .iotlb_sync             = intel_iommu_tlb_sync,
        .iova_to_phys           = intel_iommu_iova_to_phys,
        .probe_device           = intel_iommu_probe_device,
        .probe_finalize         = intel_iommu_probe_finalize,
        .release_device         = intel_iommu_release_device,
        .get_resv_regions       = intel_iommu_get_resv_regions,
        .put_resv_regions       = generic_iommu_put_resv_regions,
        .device_group           = intel_iommu_device_group,
        .dev_has_feat           = intel_iommu_dev_has_feat,
        .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
        .dev_enable_feat        = intel_iommu_dev_enable_feat,
        .dev_disable_feat       = intel_iommu_dev_disable_feat,
        .is_attach_deferred     = intel_iommu_is_attach_deferred,
        .def_domain_type        = device_def_domain_type,
        .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
#ifdef CONFIG_INTEL_IOMMU_SVM
        .cache_invalidate       = intel_iommu_sva_invalidate,
        .sva_bind_gpasid        = intel_svm_bind_gpasid,
        .sva_unbind_gpasid      = intel_svm_unbind_gpasid,
        .sva_bind               = intel_svm_bind,
        .sva_unbind             = intel_svm_unbind,
        .sva_get_pasid          = intel_svm_get_pasid,
        .page_response          = intel_svm_page_response,
#endif
};

static void quirk_iommu_igfx(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
        dmar_map_gfx = 0;
}

/* G4x/GM45 integrated gfx dmar support is totally busted. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);

/* Broadwell igfx malfunctions with dmar */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
        if (risky_device(dev))
                return;

        /*
         * Mobile 4 Series Chipset neglects to set RWBF capability,
         * but needs it. Same seems to hold for the desktop versions.
         */
        pci_info(dev, "Forcing write-buffer flush capability\n");
        rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK    (0xf << 8)
#define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
#define GGC_MEMORY_SIZE_1M      (0x1 << 8)
#define GGC_MEMORY_SIZE_2M      (0x3 << 8)
#define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)

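/*
 * The quirk below reads the GGC (graphics control) config register at
 * offset 0x52 on these Ironlake (Calpella) era host bridges.  The
 * GGC_MEMORY_* encodings above describe how much memory the BIOS set
 * aside for the GTT and whether a VT-d shadow GTT was allocated (the
 * *_VT variants).  Without a shadow GTT the IGD cannot be translated
 * safely, so graphics is left untranslated; with one, batched IOTLB
 * flushing is disabled instead.
 */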
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
        unsigned short ggc;

        if (risky_device(dev))
                return;

        if (pci_read_config_word(dev, GGC, &ggc))
                return;

        if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
                pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
                dmar_map_gfx = 0;
        } else if (dmar_map_gfx) {
                /* we have to ensure the gfx device is idle before we flush */
                pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
                intel_iommu_strict = 1;
        }
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
{
        unsigned short ver;

        if (!IS_GFX_DEVICE(dev))
                return;

        ver = (dev->device >> 8) & 0xff;
        if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
            ver != 0x4e && ver != 0x8a && ver != 0x98 &&
            ver != 0x9a)
                return;

        if (risky_device(dev))
                return;

        pci_info(dev, "Skip IOMMU disabling for graphics\n");
        iommu_skip_te_disable = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
        struct pci_dev *pdev;
        uint32_t vtisochctrl;

        /* If there's no Azalia in the system anyway, forget it. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* System Management Registers. Might be hidden, in which case
           we can't do the sanity check. But that's OK, because the
           known-broken BIOSes _don't_ actually hide it, so far. */
        pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
        if (!pdev)
                return;

        if (risky_device(pdev)) {
                pci_dev_put(pdev);
                return;
        }

        if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
                pci_dev_put(pdev);
                return;
        }

        pci_dev_put(pdev);

        /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
        if (vtisochctrl & 1)
                return;

        /* Drop all bits other than the number of TLB entries */
        vtisochctrl &= 0x1c;

        /* If we have the recommended number of TLB entries (16), fine. */
        if (vtisochctrl == 0x10)
                return;

        /* Zero TLB entries? You get to ride the short bus to school. */
        if (!vtisochctrl) {
                WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
                     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
                     dmi_get_system_info(DMI_BIOS_VENDOR),
                     dmi_get_system_info(DMI_BIOS_VERSION),
                     dmi_get_system_info(DMI_PRODUCT_VERSION));
                iommu_identity_mapping |= IDENTMAP_AZALIA;
                return;
        }

        pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
                vtisochctrl);
}