// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>,
 *          Joerg Roedel <jroedel@suse.de>
 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/intel-svm.h>
20#include <linux/memory.h>
21#include <linux/pci.h>
22#include <linux/pci-ats.h>
23#include <linux/spinlock.h>
24#include <linux/syscore_ops.h>
25#include <linux/tboot.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define MAX_AGAW_WIDTH 64
49#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60/* IO virtual address start page frame number */
61#define IOVA_START_PFN (1)
62
63#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65/* page table handling */
66#define LEVEL_STRIDE (9)
67#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
69static inline int agaw_to_level(int agaw)
70{
71 return agaw + 2;
72}
73
74static inline int agaw_to_width(int agaw)
75{
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77}
78
79static inline int width_to_agaw(int width)
80{
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82}
83
84static inline unsigned int level_to_offset_bits(int level)
85{
86 return (level - 1) * LEVEL_STRIDE;
87}
88
89static inline int pfn_level_offset(u64 pfn, int level)
90{
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92}
93
94static inline u64 level_mask(int level)
95{
96 return -1ULL << level_to_offset_bits(level);
97}
98
99static inline u64 level_size(int level)
100{
101 return 1ULL << level_to_offset_bits(level);
102}
103
104static inline u64 align_to_level(u64 pfn, int level)
105{
106 return (pfn + level_size(level) - 1) & level_mask(level);
107}
108
109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110{
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112}
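
/*
 * Illustrative example of the helpers above (not used by the driver):
 * for a 48-bit guest address width, width_to_agaw(48) == 2 and
 * agaw_to_level(2) == 4, i.e. a 4-level page table. Each level decodes
 * LEVEL_STRIDE (9) bits of the DMA PFN, so pfn_level_offset(pfn, 4)
 * returns PFN bits 35:27 (address bits 47:39), and a level-2 entry
 * spans lvl_to_nr_pages(2) == 512 VT-d pages (2MiB).
 */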
113
114/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117{
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119}
120static inline unsigned long page_to_dma_pfn(struct page *pg)
121{
122 return mm_to_dma_pfn(page_to_pfn(pg));
123}
124static inline unsigned long virt_to_dma_pfn(void *p)
125{
126 return page_to_dma_pfn(virt_to_page(p));
127}
128
129static void __init check_tylersburg_isoch(void);
130static int rwbf_quirk;
131
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
136static int force_on = 0;
137static int intel_iommu_tboot_noforce;
138static int no_platform_optin;
139
140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142/*
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144 * if marked present.
145 */
146static phys_addr_t root_entry_lctp(struct root_entry *re)
147{
148 if (!(re->lo & 1))
149 return 0;
150
151 return re->lo & VTD_PAGE_MASK;
152}
153
154/*
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156 * if marked present.
157 */
158static phys_addr_t root_entry_uctp(struct root_entry *re)
159{
160 if (!(re->hi & 1))
161 return 0;
162
163 return re->hi & VTD_PAGE_MASK;
164}
165
166static inline void context_set_present(struct context_entry *context)
167{
168 context->lo |= 1;
169}
170
171static inline void context_set_fault_enable(struct context_entry *context)
172{
173 context->lo &= (((u64)-1) << 2) | 1;
174}
175
176static inline void context_set_translation_type(struct context_entry *context,
177 unsigned long value)
178{
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
181}
182
183static inline void context_set_address_root(struct context_entry *context,
184 unsigned long value)
185{
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
188}
189
190static inline void context_set_address_width(struct context_entry *context,
191 unsigned long value)
192{
193 context->hi |= value & 7;
194}
195
196static inline void context_set_domain_id(struct context_entry *context,
197 unsigned long value)
198{
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
200}
201
202static inline void context_set_pasid(struct context_entry *context)
203{
204 context->lo |= CONTEXT_PASIDE;
205}
206
207static inline int context_domain_id(struct context_entry *c)
208{
209 return((c->hi >> 8) & 0xffff);
210}
211
212static inline void context_clear_entry(struct context_entry *context)
213{
214 context->lo = 0;
215 context->hi = 0;
216}
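
/*
 * A rough sketch (for illustration only) of how the setters above lay
 * out a legacy-mode context entry: lo[0] is the present bit, lo[1] the
 * fault-processing disable bit, lo[3:2] the translation type, and
 * lo[63:12] the page-table root; hi[2:0] holds the address width and
 * hi[23:8] the domain id. For example, context_set_domain_id() with
 * value 5 OR-s (5 << 8) into the high qword.
 */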
217
218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219{
220 if (!iommu->copied_tables)
221 return false;
222
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224}
225
226static inline void
227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228{
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230}
231
232static inline void
233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234{
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236}
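
/*
 * For illustration: the copied_tables bitmap is indexed by the 16-bit
 * source-id, i.e. (bus << 8) | devfn. A device at 3a:00.2 (bus 0x3a,
 * devfn 0x02) therefore maps to bit 0x3a02.
 */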
237
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
244static struct dmar_domain *si_domain;
245static int hw_pass_through = 1;
246
247struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
254};
255
256struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
262};
263
264struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
271};
272
273static LIST_HEAD(dmar_atsr_units);
274static LIST_HEAD(dmar_rmrr_units);
275static LIST_HEAD(dmar_satc_units);
276
277#define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280static void device_block_translation(struct device *dev);
281static void intel_iommu_domain_free(struct iommu_domain *domain);
282
283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285
286int intel_iommu_enabled = 0;
287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288
289static int dmar_map_gfx = 1;
290static int intel_iommu_superpage = 1;
291static int iommu_identity_mapping;
292static int iommu_skip_te_disable;
293
294#define IDENTMAP_GFX 2
295#define IDENTMAP_AZALIA 4
296
297const struct iommu_ops intel_iommu_ops;
298
299static bool translation_pre_enabled(struct intel_iommu *iommu)
300{
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302}
303
304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305{
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307}
308
309static void init_translation_status(struct intel_iommu *iommu)
310{
311 u32 gsts;
312
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316}
317
318static int __init intel_iommu_setup(char *str)
319{
320 if (!str)
321 return -EINVAL;
322
323 while (*str) {
324 if (!strncmp(str, "on", 2)) {
325 dmar_disabled = 0;
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
328 dmar_disabled = 1;
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
332 dmar_map_gfx = 0;
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
345 intel_iommu_sm = 1;
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
348 intel_iommu_sm = 0;
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
352 } else {
353 pr_notice("Unknown option - '%s'\n", str);
354 }
355
356 str += strcspn(str, ",");
357 while (*str == ',')
358 str++;
359 }
360
361 return 1;
362}
363__setup("intel_iommu=", intel_iommu_setup);
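
/*
 * Example usage (illustrative; any combination of the tokens parsed
 * above, separated by commas, is accepted):
 *
 *	intel_iommu=on,sm_on,igfx_off
 *
 * enables the IOMMU, opts in to scalable mode if the hardware supports
 * it, and disables GFX device mapping.
 */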
364
365void *alloc_pgtable_page(int node)
366{
367 struct page *page;
368 void *vaddr = NULL;
369
370 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
371 if (page)
372 vaddr = page_address(page);
373 return vaddr;
374}
375
376void free_pgtable_page(void *vaddr)
377{
378 free_page((unsigned long)vaddr);
379}
380
381static inline int domain_type_is_si(struct dmar_domain *domain)
382{
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384}
385
386static inline int domain_pfn_supported(struct dmar_domain *domain,
387 unsigned long pfn)
388{
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392}
393
394/*
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
398 */
399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400{
401 unsigned long fl_sagaw, sl_sagaw;
402
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
405
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 return sl_sagaw;
409
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
412 return fl_sagaw;
413
414 return fl_sagaw & sl_sagaw;
415}
416
417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418{
419 unsigned long sagaw;
420 int agaw;
421
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
425 break;
426 }
427
428 return agaw;
429}
430
431/*
432 * Calculate max SAGAW for each iommu.
433 */
434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435{
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437}
438
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default one.
 */
444int iommu_calculate_agaw(struct intel_iommu *iommu)
445{
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447}
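
/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH of
 * 57, width_to_agaw(57) == 3, so the loop in __iommu_calculate_agaw()
 * first tests SAGAW bit 3 (5-level, 57-bit). If the hardware only
 * reports bit 2 (4-level), the function falls back to agaw 2, i.e. a
 * 48-bit address width.
 */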
448
449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450{
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453}
454
455static void domain_update_iommu_coherency(struct dmar_domain *domain)
456{
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
460 bool found = false;
461 unsigned long i;
462
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
465 found = true;
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
468 break;
469 }
470 }
471 if (found)
472 return;
473
474 /* No hardware attached; use lowest common denominator */
475 rcu_read_lock();
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
479 break;
480 }
481 }
482 rcu_read_unlock();
483}
484
485static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
487{
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
490 int mask = 0x3;
491
492 if (!intel_iommu_superpage)
493 return 0;
494
495 /* set iommu_superpage to the smallest common denominator */
496 rcu_read_lock();
497 for_each_active_iommu(iommu, drhd) {
498 if (iommu != skip) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
501 mask = 0x1;
502 } else {
503 mask &= cap_super_page_val(iommu->cap);
504 }
505
506 if (!mask)
507 break;
508 }
509 }
510 rcu_read_unlock();
511
512 return fls(mask);
513}
514
515static int domain_update_device_node(struct dmar_domain *domain)
516{
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
519 unsigned long flags;
520
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could be multiple device NUMA nodes, as devices
		 * within the same domain may sit behind different IOMMUs.
		 * There is no perfect answer in such a situation, so we
		 * follow a first-come, first-served policy.
		 */
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
531 break;
532 }
533 spin_unlock_irqrestore(&domain->lock, flags);
534
535 return nid;
536}
537
538static void domain_update_iotlb(struct dmar_domain *domain);
539
540/* Return the super pagesize bitmap if supported. */
541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542{
543 unsigned long bitmap = 0;
544
545 /*
546 * 1-level super page supports page size of 2MiB, 2-level super page
547 * supports page size of both 2MiB and 1GiB.
548 */
549 if (domain->iommu_superpage == 1)
550 bitmap |= SZ_2M;
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
553
554 return bitmap;
555}
556
557/* Some capabilities may be different across iommus */
558static void domain_update_iommu_cap(struct dmar_domain *domain)
559{
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562
563 /*
564 * If RHSA is missing, we should default to the device numa domain
565 * as fall back.
566 */
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
569
570 /*
571 * First-level translation restricts the input-address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48-bits with 4-level
574 * paging and 57-bits with 5-level paging). Hence, skip bit
575 * [N-1].
576 */
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 else
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
584}
585
586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 u8 devfn, int alloc)
588{
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
591 u64 *entry;
592
	/*
	 * Unless the caller requested to allocate a new entry,
	 * returning a copied context entry makes no sense.
	 */
597 if (!alloc && context_copied(iommu, bus, devfn))
598 return NULL;
599
600 entry = &root->lo;
601 if (sm_supported(iommu)) {
602 if (devfn >= 0x80) {
603 devfn -= 0x80;
604 entry = &root->hi;
605 }
606 devfn *= 2;
607 }
608 if (*entry & 1)
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 else {
611 unsigned long phy_addr;
612 if (!alloc)
613 return NULL;
614
615 context = alloc_pgtable_page(iommu->node);
616 if (!context)
617 return NULL;
618
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
623 }
624 return &context[devfn];
625}
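
/*
 * Illustrative note on the scalable-mode layout above: each bus has two
 * half-size context tables (lower/upper root entry pointers), and each
 * scalable-mode context entry is twice the legacy size, hence the
 * "devfn -= 0x80" and "devfn *= 2" adjustments. E.g. devfn 0x85 lands
 * in the table referenced by root->hi at index (0x85 - 0x80) * 2 == 10.
 */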
626
627/**
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
632 *
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634 */
635static bool
636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637{
638 struct pci_dev *pdev, *pbridge;
639
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 return false;
642
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
645
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 return true;
650
651 return false;
652}
653
654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655{
656 struct dmar_drhd_unit *drhd;
657 u32 vtbar;
658 int rc;
659
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
664 */
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 if (rc) {
667 /* "can't" happen */
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 return false;
670 }
671 vtbar &= 0xffff0000;
672
	/* we know that this iommu should be at offset 0xa000 from vtbar */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 return true;
679 }
680
681 return false;
682}
683
684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685{
686 if (!iommu || iommu->drhd->ignored)
687 return true;
688
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
691
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
695 return true;
696 }
697
698 return false;
699}
700
701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702{
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
706 struct device *tmp;
707 u16 segment = 0;
708 int i;
709
710 if (!dev)
711 return NULL;
712
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
715
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
717
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
721 dev = &pf_pdev->dev;
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
725
726 rcu_read_lock();
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
729 continue;
730
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
733 if (tmp == dev) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
739 goto got_pdev;
740
741 if (bus && devfn) {
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
744 }
745 goto out;
746 }
747
748 if (is_downstream_to_pci_bridge(dev, tmp))
749 goto got_pdev;
750 }
751
752 if (pdev && drhd->include_all) {
753got_pdev:
754 if (bus && devfn) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
757 }
758 goto out;
759 }
760 }
761 iommu = NULL;
762out:
763 if (iommu_is_dummy(iommu, dev))
764 iommu = NULL;
765
766 rcu_read_unlock();
767
768 return iommu;
769}
770
771static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
773{
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
776}
777
778static void free_context_table(struct intel_iommu *iommu)
779{
780 struct context_entry *context;
781 int i;
782
783 if (!iommu->root_entry)
784 return;
785
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
788 if (context)
789 free_pgtable_page(context);
790
791 if (!sm_supported(iommu))
792 continue;
793
794 context = iommu_context_addr(iommu, i, 0x80, 0);
795 if (context)
796 free_pgtable_page(context);
797 }
798
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
801}
802
803#ifdef CONFIG_DMAR_DEBUG
804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806{
807 struct dma_pte *pte;
808 int offset;
809
810 while (1) {
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
815 break;
816 }
817
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819
820 if (level == 1)
821 break;
822
823 parent = phys_to_virt(dma_pte_addr(pte));
824 level--;
825 }
826}
827
828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
830{
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
839
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
844 if (!rt_entry) {
845 pr_info("root table entry is not present\n");
846 return;
847 }
848
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
852 else
853 pr_info("root entry: 0x%016llx", rt_entry->lo);
854
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 if (!ctx_entry) {
858 pr_info("context table entry is not present\n");
859 return;
860 }
861
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
864
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 goto pgtable_walk;
870 }
871
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 if (!dir) {
875 pr_info("pasid directory entry is not present\n");
876 return;
877 }
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
881
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
888 if (!entries) {
889 pr_info("pasid table entry is not present\n");
890 return;
891 }
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 } else {
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 }
904
905pgtable_walk:
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907}
908#endif
909
910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912{
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain->use_first_level)
946 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947
948 if (cmpxchg64(&pte->val, 0ULL, pteval))
949 /* Someone else set it while we were thinking; use theirs. */
950 free_pgtable_page(tmp_page);
951 else
952 domain_flush_cache(domain, pte, sizeof(*pte));
953 }
954 if (level == 1)
955 break;
956
957 parent = phys_to_virt(dma_pte_addr(pte));
958 level--;
959 }
960
961 if (!*target_level)
962 *target_level = level;
963
964 return pte;
965}
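
/*
 * Typical usage (sketch): to map at the smallest page size, callers pass
 * a *target_level of 1 (or the superpage level they intend to use) and
 * get back the PTE slot for @pfn at that level, allocating intermediate
 * tables as needed. Passing 0 instead walks only existing entries,
 * stopping at the first superpage or non-present entry, and reports the
 * level reached back through *target_level.
 */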
966
967/* return address's pte at specific level */
968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 unsigned long pfn,
970 int level, int *large_page)
971{
972 struct dma_pte *parent, *pte;
973 int total = agaw_to_level(domain->agaw);
974 int offset;
975
976 parent = domain->pgd;
977 while (level <= total) {
978 offset = pfn_level_offset(pfn, total);
979 pte = &parent[offset];
980 if (level == total)
981 return pte;
982
983 if (!dma_pte_present(pte)) {
984 *large_page = total;
985 break;
986 }
987
988 if (dma_pte_superpage(pte)) {
989 *large_page = total;
990 return pte;
991 }
992
993 parent = phys_to_virt(dma_pte_addr(pte));
994 total--;
995 }
996 return NULL;
997}
998
999/* clear last level pte, a tlb flush should be followed */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001 unsigned long start_pfn,
1002 unsigned long last_pfn)
1003{
1004 unsigned int large_page;
1005 struct dma_pte *first_pte, *pte;
1006
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1010
1011 /* we don't need lock here; nobody else touches the iova range */
1012 do {
1013 large_page = 1;
1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 if (!pte) {
1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 continue;
1018 }
1019 do {
1020 dma_clear_pte(pte);
1021 start_pfn += lvl_to_nr_pages(large_page);
1022 pte++;
1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025 domain_flush_cache(domain, first_pte,
1026 (void *)pte - (void *)first_pte);
1027
1028 } while (start_pfn && start_pfn <= last_pfn);
1029}
1030
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 int retain_level, struct dma_pte *pte,
1033 unsigned long pfn, unsigned long start_pfn,
1034 unsigned long last_pfn)
1035{
1036 pfn = max(start_pfn, pfn);
1037 pte = &pte[pfn_level_offset(pfn, level)];
1038
1039 do {
1040 unsigned long level_pfn;
1041 struct dma_pte *level_pte;
1042
1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 goto next;
1045
1046 level_pfn = pfn & level_mask(level);
1047 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049 if (level > 2) {
1050 dma_pte_free_level(domain, level - 1, retain_level,
1051 level_pte, level_pfn, start_pfn,
1052 last_pfn);
1053 }
1054
1055 /*
1056 * Free the page table if we're below the level we want to
1057 * retain and the range covers the entire table.
1058 */
1059 if (level < retain_level && !(start_pfn > level_pfn ||
1060 last_pfn < level_pfn + level_size(level) - 1)) {
1061 dma_clear_pte(pte);
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 free_pgtable_page(level_pte);
1064 }
1065next:
1066 pfn += level_size(level);
1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn,
1077 int retain_level)
1078{
1079 dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081 /* We don't need lock here; nobody else touches the iova range */
1082 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 domain->pgd, 0, start_pfn, last_pfn);
1084
1085 /* free pgd */
1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 free_pgtable_page(domain->pgd);
1088 domain->pgd = NULL;
1089 }
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093 need to *modify* it at all. All we need to do is make a list of all the
1094 pages which can be freed just as soon as we've flushed the IOTLB and we
1095 know the hardware page-walk will no longer touch them.
1096 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097 be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 int level, struct dma_pte *pte,
1100 struct list_head *freelist)
1101{
1102 struct page *pg;
1103
1104 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 list_add_tail(&pg->lru, freelist);
1106
1107 if (level == 1)
1108 return;
1109
1110 pte = page_address(pg);
1111 do {
1112 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 pte++;
1115 } while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 struct dma_pte *pte, unsigned long pfn,
1120 unsigned long start_pfn, unsigned long last_pfn,
1121 struct list_head *freelist)
1122{
1123 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125 pfn = max(start_pfn, pfn);
1126 pte = &pte[pfn_level_offset(pfn, level)];
1127
1128 do {
1129 unsigned long level_pfn = pfn & level_mask(level);
1130
1131 if (!dma_pte_present(pte))
1132 goto next;
1133
1134 /* If range covers entire pagetable, free it */
1135 if (start_pfn <= level_pfn &&
1136 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away
			   entirely. Don't bother to clear them; we're
			   just going to *free* them. */
1139 if (level > 1 && !dma_pte_superpage(pte))
1140 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142 dma_clear_pte(pte);
1143 if (!first_pte)
1144 first_pte = pte;
1145 last_pte = pte;
1146 } else if (level > 1) {
1147 /* Recurse down into a level that isn't *entirely* obsolete */
1148 dma_pte_clear_level(domain, level - 1,
1149 phys_to_virt(dma_pte_addr(pte)),
1150 level_pfn, start_pfn, last_pfn,
1151 freelist);
1152 }
1153next:
1154 pfn = level_pfn + level_size(level);
1155 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157 if (first_pte)
1158 domain_flush_cache(domain, first_pte,
1159 (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163 the page tables, and may have cached the intermediate levels. The
1164 pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 unsigned long last_pfn, struct list_head *freelist)
1167{
1168 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170 BUG_ON(start_pfn > last_pfn);
1171
1172 /* we don't need lock here; nobody else touches the iova range */
1173 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176 /* free pgd */
1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 struct page *pgd_page = virt_to_page(domain->pgd);
1179 list_add_tail(&pgd_page->lru, freelist);
1180 domain->pgd = NULL;
1181 }
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187 struct root_entry *root;
1188
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 if (!root) {
1191 pr_err("Allocating root entry for %s failed\n",
1192 iommu->name);
1193 return -ENOMEM;
1194 }
1195
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 iommu->root_entry = root;
1198
1199 return 0;
1200}
1201
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204 u64 addr;
1205 u32 sts;
1206 unsigned long flag;
1207
1208 addr = virt_to_phys(iommu->root_entry);
1209 if (sm_supported(iommu))
1210 addr |= DMA_RTADDR_SMT;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
1217 /* Make sure hardware complete it */
1218 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223 /*
1224 * Hardware invalidates all DMA remapping hardware translation
1225 * caches as part of SRTP flow.
1226 */
1227 if (cap_esrtps(iommu->cap))
1228 return;
1229
1230 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 if (sm_supported(iommu))
1232 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238 u32 val;
1239 unsigned long flag;
1240
1241 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
1247 /* Make sure hardware complete it */
1248 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
/* return value determines if we need a write buffer flush */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256 u16 did, u16 source_id, u8 function_mask,
1257 u64 type)
1258{
1259 u64 val = 0;
1260 unsigned long flag;
1261
1262 switch (type) {
1263 case DMA_CCMD_GLOBAL_INVL:
1264 val = DMA_CCMD_GLOBAL_INVL;
1265 break;
1266 case DMA_CCMD_DOMAIN_INVL:
1267 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 break;
1269 case DMA_CCMD_DEVICE_INVL:
1270 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 break;
1273 default:
1274 BUG();
1275 }
1276 val |= DMA_CCMD_ICC;
1277
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
1281 /* Make sure hardware complete it */
1282 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
/* return value determines if we need a write buffer flush */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290 u64 addr, unsigned int size_order, u64 type)
1291{
1292 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293 u64 val = 0, val_iva = 0;
1294 unsigned long flag;
1295
1296 switch (type) {
1297 case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1299 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300 break;
1301 case DMA_TLB_DSI_FLUSH:
1302 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 break;
1304 case DMA_TLB_PSI_FLUSH:
1305 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 /* IH bit is passed in as part of address */
1307 val_iva = size_order | addr;
1308 break;
1309 default:
1310 BUG();
1311 }
1312 /* Note: set drain read/write */
1313#if 0
1314 /*
	 * This is probably meant to be extra safe. It looks like we can
	 * ignore it without any impact.
1317 */
1318 if (cap_read_drain(iommu->cap))
1319 val |= DMA_TLB_READ_DRAIN;
1320#endif
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
1330 /* Make sure hardware complete it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349 struct device_domain_info *info;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1357 return info;
1358 }
1359 }
1360 spin_unlock_irqrestore(&domain->lock, flags);
1361
1362 return NULL;
1363}
1364
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378 domain->has_iotlb_device = has_iotlb_device;
1379 spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392 return false;
1393
1394 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395 return false;
1396
1397 return true;
1398}
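
/*
 * For illustration: (device & 0xfffc) == 0x4940 matches exactly the four
 * device IDs 0x4940, 0x4941, 0x4942 and 0x4943 mentioned above.
 */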
1399
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402 struct pci_dev *pdev;
1403
1404 if (!dev_is_pci(info->dev))
1405 return;
1406
1407 pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
	 * a PFSID to the invalidation descriptor of a VF so that the IOMMU HW
	 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
	 * treated as reserved and should be set to 0.
	 */
1413 if (!ecap_dit(info->iommu->ecap))
1414 info->pfsid = 0;
1415 else {
1416 struct pci_dev *pf_pdev;
1417
1418 /* pdev will be returned if device is not a vf */
1419 pf_pdev = pci_physfn(pdev);
1420 info->pfsid = pci_dev_id(pf_pdev);
1421 }
1422
1423 /* The PCIe spec, in its wisdom, declares that the behaviour of
1424 the device if you enable PASID support after ATS support is
1425 undefined. So always enable PASID support on devices which
1426 have it, even if we can't yet know if we're ever going to
1427 use it. */
1428 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429 info->pasid_enabled = 1;
1430
1431 if (info->pri_supported &&
1432 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1433 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434 info->pri_enabled = 1;
1435
1436 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438 info->ats_enabled = 1;
1439 domain_update_iotlb(info->domain);
1440 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 }
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446 struct pci_dev *pdev;
1447
1448 if (!dev_is_pci(info->dev))
1449 return;
1450
1451 pdev = to_pci_dev(info->dev);
1452
1453 if (info->ats_enabled) {
1454 pci_disable_ats(pdev);
1455 info->ats_enabled = 0;
1456 domain_update_iotlb(info->domain);
1457 }
1458
1459 if (info->pri_enabled) {
1460 pci_disable_pri(pdev);
1461 info->pri_enabled = 0;
1462 }
1463
1464 if (info->pasid_enabled) {
1465 pci_disable_pasid(pdev);
1466 info->pasid_enabled = 0;
1467 }
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471 u64 addr, unsigned int mask)
1472{
1473 u16 sid, qdep;
1474
1475 if (!info || !info->ats_enabled)
1476 return;
1477
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1482 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 u64 addr, unsigned mask)
1487{
1488 struct device_domain_info *info;
1489 unsigned long flags;
1490
1491 if (!domain->has_iotlb_device)
1492 return;
1493
1494 spin_lock_irqsave(&domain->lock, flags);
1495 list_for_each_entry(info, &domain->devices, link)
1496 __iommu_flush_dev_iotlb(info, addr, mask);
1497 spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 struct dmar_domain *domain,
1502 unsigned long pfn, unsigned int pages,
1503 int ih, int map)
1504{
1505 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 unsigned int mask = ilog2(aligned_pages);
1507 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 u16 did = domain_id_iommu(domain, iommu);
1509
1510 BUG_ON(pages == 0);
1511
1512 if (ih)
1513 ih = 1 << 6;
1514
1515 if (domain->use_first_level) {
1516 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 } else {
1518 unsigned long bitmask = aligned_pages - 1;
1519
1520 /*
1521 * PSI masks the low order bits of the base address. If the
1522 * address isn't aligned to the mask, then compute a mask value
1523 * needed to ensure the target range is flushed.
1524 */
1525 if (unlikely(bitmask & pfn)) {
1526 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528 /*
1529 * Since end_pfn <= pfn + bitmask, the only way bits
1530 * higher than bitmask can differ in pfn and end_pfn is
1531 * by carrying. This means after masking out bitmask,
1532 * high bits starting with the first set bit in
1533 * shared_bits are all equal in both pfn and end_pfn.
1534 */
1535 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 }
1538
		/*
		 * Fall back to domain-selective flush if there is no PSI
		 * support or the size is too big.
		 */
1543 if (!cap_pgsel_inv(iommu->cap) ||
1544 mask > cap_max_amask_val(iommu->cap))
1545 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 DMA_TLB_DSI_FLUSH);
1547 else
1548 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 DMA_TLB_PSI_FLUSH);
1550 }
1551
1552 /*
1553 * In caching mode, changes of pages from non-present to present require
1554 * flush. However, device IOTLB doesn't need to be flushed in this case.
1555 */
1556 if (!cap_caching_mode(iommu->cap) || !map)
1557 iommu_flush_dev_iotlb(domain, addr, mask);
1558}
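
/*
 * Worked example of the mask computation above (illustrative values):
 * pfn 0x1005, pages 4 gives aligned_pages 4 and bitmask 3, which
 * overlaps pfn, so end_pfn is 0x1008, shared_bits ends up being
 * ~(0x1005 ^ 0x1008) & ~3, and __ffs() of that yields mask 4. The PSI
 * then covers pfns 0x1000-0x100f, a superset of the requested range.
 */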
1559
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 struct dmar_domain *domain,
1563 unsigned long pfn, unsigned int pages)
1564{
1565 /*
1566 * It's a non-present to present mapping. Only flush if caching mode
1567 * and second level.
1568 */
1569 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 else
1572 iommu_flush_write_buffer(iommu);
1573}
1574
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 struct iommu_domain_info *info;
1579 unsigned long idx;
1580
1581 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 struct intel_iommu *iommu = info->iommu;
1583 u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585 if (dmar_domain->use_first_level)
1586 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 else
1588 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 DMA_TLB_DSI_FLUSH);
1590
1591 if (!cap_caching_mode(iommu->cap))
1592 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 }
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598 u32 pmen;
1599 unsigned long flags;
1600
1601 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 return;
1603
1604 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 pmen &= ~DMA_PMEN_EPM;
1607 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609 /* wait for the protected region status bit to clear */
1610 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618 u32 sts;
1619 unsigned long flags;
1620
1621 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 iommu->gcmd |= DMA_GCMD_TE;
1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
1625 /* Make sure hardware complete it */
1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 readl, (sts & DMA_GSTS_TES), sts);
1628
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634 u32 sts;
1635 unsigned long flag;
1636
1637 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 return;
1640
1641 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 iommu->gcmd &= ~DMA_GCMD_TE;
1643 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
1645 /* Make sure hardware complete it */
1646 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654 u32 ndomains;
1655
1656 ndomains = cap_ndoms(iommu->cap);
1657 pr_debug("%s: Number of Domains supported <%d>\n",
1658 iommu->name, ndomains);
1659
1660 spin_lock_init(&iommu->lock);
1661
1662 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 if (!iommu->domain_ids)
1664 return -ENOMEM;
1665
1666 /*
1667 * If Caching mode is set, then invalid translations are tagged
1668 * with domain-id 0, hence we need to pre-allocate it. We also
1669 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 * make sure it is not used for a real domain.
1671 */
1672 set_bit(0, iommu->domain_ids);
1673
	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes be
	 * programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id
	 * for this purpose.
	 */
1681 if (sm_supported(iommu))
1682 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684 return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689 if (!iommu->domain_ids)
1690 return;
1691
1692 /*
1693 * All iommu domains must have been detached from the devices,
1694 * hence there should be no domain IDs in use.
1695 */
1696 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 > NUM_RESERVED_DID))
1698 return;
1699
1700 if (iommu->gcmd & DMA_GCMD_TE)
1701 iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706 if (iommu->domain_ids) {
1707 bitmap_free(iommu->domain_ids);
1708 iommu->domain_ids = NULL;
1709 }
1710
1711 if (iommu->copied_tables) {
1712 bitmap_free(iommu->copied_tables);
1713 iommu->copied_tables = NULL;
1714 }
1715
1716 /* free context mapping */
1717 free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720 if (pasid_supported(iommu)) {
1721 if (ecap_prs(iommu->ecap))
1722 intel_svm_finish_prq(iommu);
1723 }
1724 if (vccap_pasid(iommu->vccap))
1725 ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736 /* Only SL is available in legacy mode */
1737 if (!scalable_mode_support())
1738 return false;
1739
	/* Only one level (either FL or SL) is available, just use it */
1741 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 return intel_cap_flts_sanity();
1743
1744 /* Both levels are available, decide it based on domain type */
1745 return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750 struct dmar_domain *domain;
1751
1752 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 if (!domain)
1754 return NULL;
1755
1756 domain->nid = NUMA_NO_NODE;
1757 if (first_level_by_default(type))
1758 domain->use_first_level = true;
1759 domain->has_iotlb_device = false;
1760 INIT_LIST_HEAD(&domain->devices);
1761 spin_lock_init(&domain->lock);
1762 xa_init(&domain->iommu_array);
1763
1764 return domain;
1765}
1766
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768 struct intel_iommu *iommu)
1769{
1770 struct iommu_domain_info *info, *curr;
1771 unsigned long ndomains;
1772 int num, ret = -ENOSPC;
1773
1774 info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 if (!info)
1776 return -ENOMEM;
1777
1778 spin_lock(&iommu->lock);
1779 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 if (curr) {
1781 curr->refcnt++;
1782 spin_unlock(&iommu->lock);
1783 kfree(info);
1784 return 0;
1785 }
1786
1787 ndomains = cap_ndoms(iommu->cap);
1788 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 if (num >= ndomains) {
1790 pr_err("%s: No free domain ids\n", iommu->name);
1791 goto err_unlock;
1792 }
1793
1794 set_bit(num, iommu->domain_ids);
1795 info->refcnt = 1;
1796 info->did = num;
1797 info->iommu = iommu;
1798 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 NULL, info, GFP_ATOMIC);
1800 if (curr) {
1801 ret = xa_err(curr) ? : -EBUSY;
1802 goto err_clear;
1803 }
1804 domain_update_iommu_cap(domain);
1805
1806 spin_unlock(&iommu->lock);
1807 return 0;
1808
1809err_clear:
1810 clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812 spin_unlock(&iommu->lock);
1813 kfree(info);
1814 return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818 struct intel_iommu *iommu)
1819{
1820 struct iommu_domain_info *info;
1821
1822 spin_lock(&iommu->lock);
1823 info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 if (--info->refcnt == 0) {
1825 clear_bit(info->did, iommu->domain_ids);
1826 xa_erase(&domain->iommu_array, iommu->seq_id);
1827 domain->nid = NUMA_NO_NODE;
1828 domain_update_iommu_cap(domain);
1829 kfree(info);
1830 }
1831 spin_unlock(&iommu->lock);
1832}
1833
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836 int agaw;
1837 int r = (gaw - 12) % 9;
1838
1839 if (r == 0)
1840 agaw = gaw;
1841 else
1842 agaw = gaw + 9 - r;
1843 if (agaw > 64)
1844 agaw = 64;
1845 return agaw;
1846}
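
/*
 * For illustration: guestwidth_to_adjustwidth() rounds the guest address
 * width up to the next value of the form 12 + 9 * n, so 48 stays 48
 * while 36 becomes 39; anything above 64 is clamped to 64.
 */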
1847
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850 if (domain->pgd) {
1851 LIST_HEAD(freelist);
1852
1853 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 put_pages_list(&freelist);
1855 }
1856
1857 if (WARN_ON(!list_empty(&domain->devices)))
1858 return;
1859
1860 kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for scalable mode context entry.
1865 * Value of X in the PDTS field of a scalable mode context entry
1866 * indicates PASID directory with 2^(X + 7) entries.
1867 */
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870 unsigned long pds, max_pde;
1871
1872 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 if (pds < 7)
1875 return 0;
1876
1877 return pds - 7;
1878}
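
/*
 * Example (illustrative): if table->max_pasid >> PASID_PDE_SHIFT works
 * out to 2^14, find_first_bit() returns 14 and the function encodes
 * pds = 7, i.e. a PASID directory with 2^(7 + 7) = 16384 entries.
 */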
1879
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888 context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897 context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906 context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds) (((pds) & 0x7) << 9)
1911
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913 struct intel_iommu *iommu,
1914 struct pasid_table *table,
1915 u8 bus, u8 devfn)
1916{
1917 struct device_domain_info *info =
1918 domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 u16 did = domain_id_iommu(domain, iommu);
1920 int translation = CONTEXT_TT_MULTI_LEVEL;
1921 struct context_entry *context;
1922 int ret;
1923
1924 WARN_ON(did == 0);
1925
1926 if (hw_pass_through && domain_type_is_si(domain))
1927 translation = CONTEXT_TT_PASS_THROUGH;
1928
1929 pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932 BUG_ON(!domain->pgd);
1933
1934 spin_lock(&iommu->lock);
1935 ret = -ENOMEM;
1936 context = iommu_context_addr(iommu, bus, devfn, 1);
1937 if (!context)
1938 goto out_unlock;
1939
1940 ret = 0;
1941 if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 goto out_unlock;
1943
	/*
	 * For kdump cases, old valid entries may be cached due to in-flight
	 * DMA and the copied pgtable, but there is no unmapping behaviour
	 * for them; thus we need an explicit cache flush for the
	 * newly-mapped device. For kdump, at this point, the device is
	 * supposed to have finished reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry about it
	 * hereafter.
	 */
1953 if (context_copied(iommu, bus, devfn)) {
1954 u16 did_old = context_domain_id(context);
1955
1956 if (did_old < cap_ndoms(iommu->cap)) {
1957 iommu->flush.flush_context(iommu, did_old,
1958 (((u16)bus) << 8) | devfn,
1959 DMA_CCMD_MASK_NOBIT,
1960 DMA_CCMD_DEVICE_INVL);
1961 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 DMA_TLB_DSI_FLUSH);
1963 }
1964
1965 clear_context_copied(iommu, bus, devfn);
1966 }
1967
1968 context_clear_entry(context);
1969
1970 if (sm_supported(iommu)) {
1971 unsigned long pds;
1972
1973 WARN_ON(!table);
1974
1975 /* Setup the PASID DIR pointer: */
1976 pds = context_get_sm_pds(table);
1977 context->lo = (u64)virt_to_phys(table->table) |
1978 context_pdts(pds);
1979
1980 /* Setup the RID_PASID field: */
1981 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983 /*
1984 * Setup the Device-TLB enable bit and Page request
1985 * Enable bit:
1986 */
1987 if (info && info->ats_supported)
1988 context_set_sm_dte(context);
1989 if (info && info->pri_supported)
1990 context_set_sm_pre(context);
1991 if (info && info->pasid_supported)
1992 context_set_pasid(context);
1993 } else {
1994 struct dma_pte *pgd = domain->pgd;
1995 int agaw;
1996
1997 context_set_domain_id(context, did);
1998
1999 if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 /*
2001 * Skip top levels of page tables for iommu which has
2002 * less agaw than default. Unnecessary for PT mode.
2003 */
2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 ret = -ENOMEM;
2006 pgd = phys_to_virt(dma_pte_addr(pgd));
2007 if (!dma_pte_present(pgd))
2008 goto out_unlock;
2009 }
2010
2011 if (info && info->ats_supported)
2012 translation = CONTEXT_TT_DEV_IOTLB;
2013 else
2014 translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016 context_set_address_root(context, virt_to_phys(pgd));
2017 context_set_address_width(context, agaw);
2018 } else {
2019 /*
2020 * In pass through mode, AW must be programmed to
2021 * indicate the largest AGAW value supported by
2022 * hardware. And ASR is ignored by hardware.
2023 */
2024 context_set_address_width(context, iommu->msagaw);
2025 }
2026
2027 context_set_translation_type(context, translation);
2028 }
2029
2030 context_set_fault_enable(context);
2031 context_set_present(context);
2032 if (!ecap_coherent(iommu->ecap))
2033 clflush_cache_range(context, sizeof(*context));
2034
2035 /*
2036 * It's a non-present to present mapping. If hardware doesn't cache
2037	 * non-present entries we only need to flush the write-buffer. If it
2038 * _does_ cache non-present entries, then it does so in the special
2039 * domain #0, which we have to flush:
2040 */
2041 if (cap_caching_mode(iommu->cap)) {
2042 iommu->flush.flush_context(iommu, 0,
2043 (((u16)bus) << 8) | devfn,
2044 DMA_CCMD_MASK_NOBIT,
2045 DMA_CCMD_DEVICE_INVL);
2046 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 } else {
2048 iommu_flush_write_buffer(iommu);
2049 }
2050
2051 ret = 0;
2052
2053out_unlock:
2054 spin_unlock(&iommu->lock);
2055
2056 return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060 struct dmar_domain *domain;
2061 struct intel_iommu *iommu;
2062 struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066 u16 alias, void *opaque)
2067{
2068 struct domain_context_mapping_data *data = opaque;
2069
2070 return domain_context_mapping_one(data->domain, data->iommu,
2071 data->table, PCI_BUS_NUM(alias),
2072 alias & 0xff);
2073}
2074
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078 struct domain_context_mapping_data data;
2079 struct pasid_table *table;
2080 struct intel_iommu *iommu;
2081 u8 bus, devfn;
2082
2083 iommu = device_to_iommu(dev, &bus, &devfn);
2084 if (!iommu)
2085 return -ENODEV;
2086
2087 table = intel_pasid_get_table(dev);
2088
2089 if (!dev_is_pci(dev))
2090 return domain_context_mapping_one(domain, iommu, table,
2091 bus, devfn);
2092
2093 data.domain = domain;
2094 data.iommu = iommu;
2095 data.table = table;
2096
2097 return pci_for_each_dma_alias(to_pci_dev(dev),
2098 &domain_context_mapping_cb, &data);
2099}
2100
2101/* Return the number of VT-d pages, aligned to the MM page size */
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103 size_t size)
2104{
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
2109/* Return largest possible superpage level for a given mapping */
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2114{
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2117
2118 support = domain->iommu_superpage;
2119
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2125
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2128 if (!pages)
2129 break;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
2131 level++;
2132 support--;
2133 }
2134 return level;
2135}
2136
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143 unsigned long start_pfn,
2144 unsigned long end_pfn, int level)
2145{
2146 unsigned long lvl_pages = lvl_to_nr_pages(level);
2147 struct iommu_domain_info *info;
2148 struct dma_pte *pte = NULL;
2149 unsigned long i;
2150
2151 while (start_pfn <= end_pfn) {
2152 if (!pte)
2153 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155 if (dma_pte_present(pte)) {
2156 dma_pte_free_pagetable(domain, start_pfn,
2157 start_pfn + lvl_pages - 1,
2158 level + 1);
2159
2160 xa_for_each(&domain->iommu_array, i, info)
2161 iommu_flush_iotlb_psi(info->iommu, domain,
2162 start_pfn, lvl_pages,
2163 0, 0);
2164 }
2165
2166 pte++;
2167 start_pfn += lvl_pages;
2168 if (first_pte_in_page(pte))
2169 pte = NULL;
2170 }
2171}
2172
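/*
 * Map nr_pages contiguous VT-d pages starting at phys_pfn to the IOVA
 * range starting at iov_pfn, using superpages whenever the alignment of
 * both ranges and the hardware capabilities allow it.
 */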
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177 struct dma_pte *first_pte = NULL, *pte = NULL;
2178 unsigned int largepage_lvl = 0;
2179 unsigned long lvl_pages = 0;
2180 phys_addr_t pteval;
2181 u64 attr;
2182
2183 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186 return -EINVAL;
2187
2188 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189 attr |= DMA_FL_PTE_PRESENT;
2190 if (domain->use_first_level) {
2191 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192 if (prot & DMA_PTE_WRITE)
2193 attr |= DMA_FL_PTE_DIRTY;
2194 }
2195
2196 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198 while (nr_pages > 0) {
2199 uint64_t tmp;
2200
2201 if (!pte) {
2202 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203 phys_pfn, nr_pages);
2204
2205 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206 if (!pte)
2207 return -ENOMEM;
2208 first_pte = pte;
2209
2210 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
2212			/* It is a large page */
2213 if (largepage_lvl > 1) {
2214 unsigned long end_pfn;
2215 unsigned long pages_to_remove;
2216
2217 pteval |= DMA_PTE_LARGE_PAGE;
2218 pages_to_remove = min_t(unsigned long, nr_pages,
2219 nr_pte_to_next_page(pte) * lvl_pages);
2220 end_pfn = iov_pfn + pages_to_remove - 1;
2221 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222 } else {
2223 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224 }
2225
2226 }
2227		/* We don't need a lock here; nobody else
2228		 * touches this IOVA range.
2229 */
2230 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231 if (tmp) {
2232 static int dumps = 5;
2233 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234 iov_pfn, tmp, (unsigned long long)pteval);
2235 if (dumps) {
2236 dumps--;
2237 debug_dma_dump_mappings(NULL);
2238 }
2239 WARN_ON(1);
2240 }
2241
2242 nr_pages -= lvl_pages;
2243 iov_pfn += lvl_pages;
2244 phys_pfn += lvl_pages;
2245 pteval += lvl_pages * VTD_PAGE_SIZE;
2246
2247 /* If the next PTE would be the first in a new page, then we
2248 * need to flush the cache on the entries we've just written.
2249 * And then we'll need to recalculate 'pte', so clear it and
2250 * let it get set again in the if (!pte) block above.
2251 *
2252 * If we're done (!nr_pages) we need to flush the cache too.
2253 *
2254 * Also if we've been setting superpages, we may need to
2255 * recalculate 'pte' and switch back to smaller pages for the
2256 * end of the mapping, if the trailing size is not enough to
2257 * use another superpage (i.e. nr_pages < lvl_pages).
2258 */
2259 pte++;
2260 if (!nr_pages || first_pte_in_page(pte) ||
2261 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262 domain_flush_cache(domain, first_pte,
2263 (void *)pte - (void *)first_pte);
2264 pte = NULL;
2265 }
2266 }
2267
2268 return 0;
2269}
2270
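/*
 * Clear the context entry for (bus, devfn) and flush the context cache,
 * the PASID cache (in scalable mode), the IOTLB and the device TLB for
 * the old domain ID.
 */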
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273 struct intel_iommu *iommu = info->iommu;
2274 struct context_entry *context;
2275 u16 did_old;
2276
2277 if (!iommu)
2278 return;
2279
2280 spin_lock(&iommu->lock);
2281 context = iommu_context_addr(iommu, bus, devfn, 0);
2282 if (!context) {
2283 spin_unlock(&iommu->lock);
2284 return;
2285 }
2286
2287 if (sm_supported(iommu)) {
2288 if (hw_pass_through && domain_type_is_si(info->domain))
2289 did_old = FLPT_DEFAULT_DID;
2290 else
2291 did_old = domain_id_iommu(info->domain, iommu);
2292 } else {
2293 did_old = context_domain_id(context);
2294 }
2295
2296 context_clear_entry(context);
2297 __iommu_flush_cache(iommu, context, sizeof(*context));
2298 spin_unlock(&iommu->lock);
2299 iommu->flush.flush_context(iommu,
2300 did_old,
2301 (((u16)bus) << 8) | devfn,
2302 DMA_CCMD_MASK_NOBIT,
2303 DMA_CCMD_DEVICE_INVL);
2304
2305 if (sm_supported(iommu))
2306 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308 iommu->flush.flush_iotlb(iommu,
2309 did_old,
2310 0,
2311 0,
2312 DMA_TLB_DSI_FLUSH);
2313
2314 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
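/*
 * Set up a scalable-mode PASID entry for @dev that uses the domain's
 * page table as the first-level translation table.
 */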
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318 struct dmar_domain *domain,
2319 struct device *dev,
2320 u32 pasid)
2321{
2322 struct dma_pte *pgd = domain->pgd;
2323 int agaw, level;
2324 int flags = 0;
2325
2326 /*
2327 * Skip top levels of page tables for iommu which has
2328 * less agaw than default. Unnecessary for PT mode.
2329 */
2330 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331 pgd = phys_to_virt(dma_pte_addr(pgd));
2332 if (!dma_pte_present(pgd))
2333 return -ENOMEM;
2334 }
2335
2336 level = agaw_to_level(agaw);
2337 if (level != 4 && level != 5)
2338 return -EINVAL;
2339
2340 if (pasid != PASID_RID2PASID)
2341 flags |= PASID_FLAG_SUPERVISOR_MODE;
2342 if (level == 5)
2343 flags |= PASID_FLAG_FL5LP;
2344
2345 if (domain->force_snooping)
2346 flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349 domain_id_iommu(domain, iommu),
2350 flags);
2351}
2352
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355 return dev && dev_is_pci(dev) &&
2356 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
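/* Identity map the PFN range [first_vpfn, last_vpfn] in @domain. */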
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360 unsigned long first_vpfn,
2361 unsigned long last_vpfn)
2362{
2363 /*
2364	 * The RMRR range might overlap with the physical memory range;
2365	 * clear it first.
2366 */
2367 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369 return __domain_mapping(domain, first_vpfn,
2370 first_vpfn, last_vpfn - first_vpfn + 1,
2371 DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
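/*
 * Initialize the static identity (si) domain: identity map all usable
 * memory ranges and all RMRR regions. With hardware passthrough (hw != 0)
 * no mappings are needed.
 */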
2376static int __init si_domain_init(int hw)
2377{
2378 struct dmar_rmrr_unit *rmrr;
2379 struct device *dev;
2380 int i, nid, ret;
2381
2382 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383 if (!si_domain)
2384 return -EFAULT;
2385
2386 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387 domain_exit(si_domain);
2388 si_domain = NULL;
2389 return -EFAULT;
2390 }
2391
2392 if (hw)
2393 return 0;
2394
2395 for_each_online_node(nid) {
2396 unsigned long start_pfn, end_pfn;
2397 int i;
2398
2399 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400 ret = iommu_domain_identity_map(si_domain,
2401 mm_to_dma_pfn(start_pfn),
2402 mm_to_dma_pfn(end_pfn));
2403 if (ret)
2404 return ret;
2405 }
2406 }
2407
2408 /*
2409 * Identity map the RMRRs so that devices with RMRRs could also use
2410 * the si_domain.
2411 */
2412 for_each_rmrr_units(rmrr) {
2413 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414 i, dev) {
2415 unsigned long long start = rmrr->base_address;
2416 unsigned long long end = rmrr->end_address;
2417
2418 if (WARN_ON(end < start ||
2419 end >> agaw_to_width(si_domain->agaw)))
2420 continue;
2421
2422 ret = iommu_domain_identity_map(si_domain,
2423 mm_to_dma_pfn(start >> PAGE_SHIFT),
2424 mm_to_dma_pfn(end >> PAGE_SHIFT));
2425 if (ret)
2426 return ret;
2427 }
2428 }
2429
2430 return 0;
2431}
2432
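/*
 * Attach @dev to @domain: set up the RID2PASID entry in scalable mode,
 * program the context entry and enable the relevant PCI capabilities.
 */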
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434 struct device *dev)
2435{
2436 struct device_domain_info *info = dev_iommu_priv_get(dev);
2437 struct intel_iommu *iommu;
2438 unsigned long flags;
2439 u8 bus, devfn;
2440 int ret;
2441
2442 iommu = device_to_iommu(dev, &bus, &devfn);
2443 if (!iommu)
2444 return -ENODEV;
2445
2446 ret = domain_attach_iommu(domain, iommu);
2447 if (ret)
2448 return ret;
2449 info->domain = domain;
2450 spin_lock_irqsave(&domain->lock, flags);
2451 list_add(&info->link, &domain->devices);
2452 spin_unlock_irqrestore(&domain->lock, flags);
2453
2454 /* PASID table is mandatory for a PCI device in scalable mode. */
2455 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456 /* Setup the PASID entry for requests without PASID: */
2457 if (hw_pass_through && domain_type_is_si(domain))
2458 ret = intel_pasid_setup_pass_through(iommu, domain,
2459 dev, PASID_RID2PASID);
2460 else if (domain->use_first_level)
2461 ret = domain_setup_first_level(iommu, domain, dev,
2462 PASID_RID2PASID);
2463 else
2464 ret = intel_pasid_setup_second_level(iommu, domain,
2465 dev, PASID_RID2PASID);
2466 if (ret) {
2467 dev_err(dev, "Setup RID2PASID failed\n");
2468 device_block_translation(dev);
2469 return ret;
2470 }
2471 }
2472
2473 ret = domain_context_mapping(domain, dev);
2474 if (ret) {
2475 dev_err(dev, "Domain context map failed\n");
2476 device_block_translation(dev);
2477 return ret;
2478 }
2479
2480 iommu_enable_pci_caps(info);
2481
2482 return 0;
2483}
2484
2485static bool device_has_rmrr(struct device *dev)
2486{
2487 struct dmar_rmrr_unit *rmrr;
2488 struct device *tmp;
2489 int i;
2490
2491 rcu_read_lock();
2492 for_each_rmrr_units(rmrr) {
2493 /*
2494 * Return TRUE if this RMRR contains the device that
2495 * is passed in.
2496 */
2497 for_each_active_dev_scope(rmrr->devices,
2498 rmrr->devices_cnt, i, tmp)
2499 if (tmp == dev ||
2500 is_downstream_to_pci_bridge(dev, tmp)) {
2501 rcu_read_unlock();
2502 return true;
2503 }
2504 }
2505 rcu_read_unlock();
2506 return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2511 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2512 * @dev: device handle
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot. This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526 struct pci_dev *pdev;
2527
2528 if (!dev_is_pci(dev))
2529 return false;
2530
2531 pdev = to_pci_dev(dev);
2532 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533 return true;
2534 else
2535 return false;
2536}
2537
2538/*
2539 * There are a couple cases where we need to restrict the functionality of
2540 * devices associated with RMRRs. The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost. This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API. This interface
2545 * expects to have full control of the IOVA space for the device. We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space. We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557 if (!device_has_rmrr(dev))
2558 return false;
2559
2560 if (device_rmrr_is_relaxable(dev))
2561 return false;
2562
2563 return true;
2564}
2565
2566/*
2567 * Return the required default domain type for a specific device.
2568 *
2569 * @dev: the device in question
2571 *
2572 * Returns:
2573 *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2574 *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2575 * - 0: both identity and dynamic domains work for this device
2576 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579 if (dev_is_pci(dev)) {
2580 struct pci_dev *pdev = to_pci_dev(dev);
2581
2582 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583 return IOMMU_DOMAIN_IDENTITY;
2584
2585 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586 return IOMMU_DOMAIN_IDENTITY;
2587 }
2588
2589 return 0;
2590}
2591
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594 /*
2595	 * Start from a sane iommu hardware state.
2596	 * If queued invalidation has already been initialized by us
2597	 * (for example, while enabling interrupt-remapping) then
2598	 * things are already rolling from a sane state.
2599 */
2600 if (!iommu->qi) {
2601 /*
2602 * Clear any previous faults.
2603 */
2604 dmar_fault(-1, iommu);
2605 /*
2606 * Disable queued invalidation if supported and already enabled
2607 * before OS handover.
2608 */
2609 dmar_disable_qi(iommu);
2610 }
2611
2612 if (dmar_enable_qi(iommu)) {
2613 /*
2614 * Queued Invalidate not enabled, use Register Based Invalidate
2615 */
2616 iommu->flush.flush_context = __iommu_flush_context;
2617 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618 pr_info("%s: Using Register based invalidation\n",
2619 iommu->name);
2620 } else {
2621 iommu->flush.flush_context = qi_flush_context;
2622 iommu->flush.flush_iotlb = qi_flush_iotlb;
2623 pr_info("%s: Using Queued invalidation\n", iommu->name);
2624 }
2625}
2626
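/*
 * Copy the context table(s) of one bus from the previous kernel (kdump
 * case) into newly allocated pages, reserving the domain IDs found in
 * the copied entries and marking those entries as copied.
 */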
2627static int copy_context_table(struct intel_iommu *iommu,
2628 struct root_entry *old_re,
2629 struct context_entry **tbl,
2630 int bus, bool ext)
2631{
2632 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633 struct context_entry *new_ce = NULL, ce;
2634 struct context_entry *old_ce = NULL;
2635 struct root_entry re;
2636 phys_addr_t old_ce_phys;
2637
2638 tbl_idx = ext ? bus * 2 : bus;
2639 memcpy(&re, old_re, sizeof(re));
2640
2641 for (devfn = 0; devfn < 256; devfn++) {
2642 /* First calculate the correct index */
2643 idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645 if (idx == 0) {
2646 /* First save what we may have and clean up */
2647 if (new_ce) {
2648 tbl[tbl_idx] = new_ce;
2649 __iommu_flush_cache(iommu, new_ce,
2650 VTD_PAGE_SIZE);
2651 pos = 1;
2652 }
2653
2654 if (old_ce)
2655 memunmap(old_ce);
2656
2657 ret = 0;
2658 if (devfn < 0x80)
2659 old_ce_phys = root_entry_lctp(&re);
2660 else
2661 old_ce_phys = root_entry_uctp(&re);
2662
2663 if (!old_ce_phys) {
2664 if (ext && devfn == 0) {
2665 /* No LCTP, try UCTP */
2666 devfn = 0x7f;
2667 continue;
2668 } else {
2669 goto out;
2670 }
2671 }
2672
2673 ret = -ENOMEM;
2674 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675 MEMREMAP_WB);
2676 if (!old_ce)
2677 goto out;
2678
2679 new_ce = alloc_pgtable_page(iommu->node);
2680 if (!new_ce)
2681 goto out_unmap;
2682
2683 ret = 0;
2684 }
2685
2686 /* Now copy the context entry */
2687 memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689 if (!context_present(&ce))
2690 continue;
2691
2692 did = context_domain_id(&ce);
2693 if (did >= 0 && did < cap_ndoms(iommu->cap))
2694 set_bit(did, iommu->domain_ids);
2695
2696 set_context_copied(iommu, bus, devfn);
2697 new_ce[idx] = ce;
2698 }
2699
2700 tbl[tbl_idx + pos] = new_ce;
2701
2702 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705 memunmap(old_ce);
2706
2707out:
2708 return ret;
2709}
2710
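/*
 * Copy the root and context tables left over by the previous kernel so
 * that translations set up before the handover remain valid until the
 * devices are re-initialized.
 */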
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713 struct context_entry **ctxt_tbls;
2714 struct root_entry *old_rt;
2715 phys_addr_t old_rt_phys;
2716 int ctxt_table_entries;
2717 u64 rtaddr_reg;
2718 int bus, ret;
2719 bool new_ext, ext;
2720
2721 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723 new_ext = !!sm_supported(iommu);
2724
2725 /*
2726 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation would open a window for data
2728 * corruption. So bail out and don't copy anything if we would
2729 * have to change the bit.
2730 */
2731 if (new_ext != ext)
2732 return -EINVAL;
2733
2734 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735 if (!iommu->copied_tables)
2736 return -ENOMEM;
2737
2738 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739 if (!old_rt_phys)
2740 return -EINVAL;
2741
2742 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743 if (!old_rt)
2744 return -ENOMEM;
2745
2746 /* This is too big for the stack - allocate it from slab */
2747 ctxt_table_entries = ext ? 512 : 256;
2748 ret = -ENOMEM;
2749 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750 if (!ctxt_tbls)
2751 goto out_unmap;
2752
2753 for (bus = 0; bus < 256; bus++) {
2754 ret = copy_context_table(iommu, &old_rt[bus],
2755 ctxt_tbls, bus, ext);
2756 if (ret) {
2757 pr_err("%s: Failed to copy context table for bus %d\n",
2758 iommu->name, bus);
2759 continue;
2760 }
2761 }
2762
2763 spin_lock(&iommu->lock);
2764
2765 /* Context tables are copied, now write them to the root_entry table */
2766 for (bus = 0; bus < 256; bus++) {
2767 int idx = ext ? bus * 2 : bus;
2768 u64 val;
2769
2770 if (ctxt_tbls[idx]) {
2771 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772 iommu->root_entry[bus].lo = val;
2773 }
2774
2775 if (!ext || !ctxt_tbls[idx + 1])
2776 continue;
2777
2778 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779 iommu->root_entry[bus].hi = val;
2780 }
2781
2782 spin_unlock(&iommu->lock);
2783
2784 kfree(ctxt_tbls);
2785
2786 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788 ret = 0;
2789
2790out_unmap:
2791 memunmap(old_rt);
2792
2793 return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799 struct intel_iommu *iommu = data;
2800 ioasid_t ioasid;
2801
2802 if (!iommu)
2803 return INVALID_IOASID;
2804 /*
2805	 * The VT-d virtual command interface always uses the full 20 bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policies, but that is out of the guest's control.
2808 */
2809 if (min < PASID_MIN || max > intel_pasid_max_id)
2810 return INVALID_IOASID;
2811
2812 if (vcmd_alloc_pasid(iommu, &ioasid))
2813 return INVALID_IOASID;
2814
2815 return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820 struct intel_iommu *iommu = data;
2821
2822 if (!iommu)
2823 return;
2824 /*
2825	 * The sanity check of the ioasid owner is done at the upper layer,
2826	 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
2827 */
2828 if (ioasid_find(NULL, ioasid, NULL)) {
2829 pr_alert("Cannot free active IOASID %d\n", ioasid);
2830 return;
2831 }
2832 vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837 /*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator, since PASIDs are allocated from the host system-wide.
2840 */
2841 if (!cap_caching_mode(iommu->cap))
2842 return;
2843
2844 if (!sm_supported(iommu)) {
2845 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846 return;
2847 }
2848
2849 /*
2850	 * Register a custom PASID allocator if we are running in a guest,
2851	 * where guest PASIDs must be obtained via the virtual command interface.
2852	 * There can be multiple vIOMMUs in each guest but only one allocator
2853	 * is active. All vIOMMU allocators will eventually call the same
2854 * host allocator.
2855 */
2856 if (!vccap_pasid(iommu->vccap))
2857 return;
2858
2859 pr_info("Register custom PASID allocator\n");
2860 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862 iommu->pasid_allocator.pdata = (void *)iommu;
2863 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865 /*
2866 * Disable scalable mode on this IOMMU if there
2867		 * is no custom allocator. Mixing SM-capable vIOMMUs
2868		 * and non-SM vIOMMUs is not supported.
2869 */
2870 intel_iommu_sm = 0;
2871 }
2872}
2873#endif
2874
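/*
 * Initialize all DMAR units: set up invalidation, domain ID space and
 * root entries, copy translation tables from a previous kernel where
 * applicable, create the identity domain and enable fault reporting.
 */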
2875static int __init init_dmars(void)
2876{
2877 struct dmar_drhd_unit *drhd;
2878 struct intel_iommu *iommu;
2879 int ret;
2880
2881 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882 if (ret)
2883 goto free_iommu;
2884
2885 for_each_iommu(iommu, drhd) {
2886 if (drhd->ignored) {
2887 iommu_disable_translation(iommu);
2888 continue;
2889 }
2890
2891 /*
2892		 * Find the max pasid size of all IOMMUs in the system.
2893 * We need to ensure the system pasid table is no bigger
2894 * than the smallest supported.
2895 */
2896 if (pasid_supported(iommu)) {
2897 u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899 intel_pasid_max_id = min_t(u32, temp,
2900 intel_pasid_max_id);
2901 }
2902
2903 intel_iommu_init_qi(iommu);
2904
2905 ret = iommu_init_domains(iommu);
2906 if (ret)
2907 goto free_iommu;
2908
2909 init_translation_status(iommu);
2910
2911 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912 iommu_disable_translation(iommu);
2913 clear_translation_pre_enabled(iommu);
2914 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915 iommu->name);
2916 }
2917
2918 /*
2919 * TBD:
2920 * we could share the same root & context tables
2921		 * among all IOMMUs. Need to split it later.
2922 */
2923 ret = iommu_alloc_root_entry(iommu);
2924 if (ret)
2925 goto free_iommu;
2926
2927 if (translation_pre_enabled(iommu)) {
2928 pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930 ret = copy_translation_tables(iommu);
2931 if (ret) {
2932 /*
2933 * We found the IOMMU with translation
2934 * enabled - but failed to copy over the
2935 * old root-entry table. Try to proceed
2936 * by disabling translation now and
2937 * allocating a clean root-entry table.
2938 * This might cause DMAR faults, but
2939 * probably the dump will still succeed.
2940 */
2941 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942 iommu->name);
2943 iommu_disable_translation(iommu);
2944 clear_translation_pre_enabled(iommu);
2945 } else {
2946 pr_info("Copied translation tables from previous kernel for %s\n",
2947 iommu->name);
2948 }
2949 }
2950
2951 if (!ecap_pass_through(iommu->ecap))
2952 hw_pass_through = 0;
2953 intel_svm_check(iommu);
2954 }
2955
2956 /*
2957 * Now that qi is enabled on all iommus, set the root entry and flush
2958 * caches. This is required on some Intel X58 chipsets, otherwise the
2959 * flush_context function will loop forever and the boot hangs.
2960 */
2961 for_each_active_iommu(iommu, drhd) {
2962 iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964 register_pasid_allocator(iommu);
2965#endif
2966 iommu_set_root_entry(iommu);
2967 }
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970 dmar_map_gfx = 0;
2971#endif
2972
2973 if (!dmar_map_gfx)
2974 iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976 check_tylersburg_isoch();
2977
2978 ret = si_domain_init(hw_pass_through);
2979 if (ret)
2980 goto free_iommu;
2981
2982 /*
2983 * for each drhd
2984 * enable fault log
2985 * global invalidate context cache
2986 * global invalidate iotlb
2987 * enable translation
2988 */
2989 for_each_iommu(iommu, drhd) {
2990 if (drhd->ignored) {
2991 /*
2992 * we always have to disable PMRs or DMA may fail on
2993 * this device
2994 */
2995 if (force_on)
2996 iommu_disable_protect_mem_regions(iommu);
2997 continue;
2998 }
2999
3000 iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004 /*
3005			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3006			 * could cause a lock race condition, so drop the lock here.
3007 */
3008 up_write(&dmar_global_lock);
3009 ret = intel_svm_enable_prq(iommu);
3010 down_write(&dmar_global_lock);
3011 if (ret)
3012 goto free_iommu;
3013 }
3014#endif
3015 ret = dmar_set_interrupt(iommu);
3016 if (ret)
3017 goto free_iommu;
3018 }
3019
3020 return 0;
3021
3022free_iommu:
3023 for_each_active_iommu(iommu, drhd) {
3024 disable_dmar_iommu(iommu);
3025 free_dmar_iommu(iommu);
3026 }
3027 if (si_domain) {
3028 domain_exit(si_domain);
3029 si_domain = NULL;
3030 }
3031
3032 return ret;
3033}
3034
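/*
 * Mark DMAR units that have no devices in scope, or only graphics
 * devices while dmar_map_gfx is disabled, as ignored.
 */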
3035static void __init init_no_remapping_devices(void)
3036{
3037 struct dmar_drhd_unit *drhd;
3038 struct device *dev;
3039 int i;
3040
3041 for_each_drhd_unit(drhd) {
3042 if (!drhd->include_all) {
3043 for_each_active_dev_scope(drhd->devices,
3044 drhd->devices_cnt, i, dev)
3045 break;
3046 /* ignore DMAR unit if no devices exist */
3047 if (i == drhd->devices_cnt)
3048 drhd->ignored = 1;
3049 }
3050 }
3051
3052 for_each_active_drhd_unit(drhd) {
3053 if (drhd->include_all)
3054 continue;
3055
3056 for_each_active_dev_scope(drhd->devices,
3057 drhd->devices_cnt, i, dev)
3058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059 break;
3060 if (i < drhd->devices_cnt)
3061 continue;
3062
3063 /* This IOMMU has *only* gfx devices. Either bypass it or
3064		   set the gfx_dedicated flag, as appropriate. */
3065 drhd->gfx_dedicated = 1;
3066 if (!dmar_map_gfx)
3067 drhd->ignored = 1;
3068 }
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074 struct dmar_drhd_unit *drhd;
3075 struct intel_iommu *iommu = NULL;
3076
3077 for_each_active_iommu(iommu, drhd)
3078 if (iommu->qi)
3079 dmar_reenable_qi(iommu);
3080
3081 for_each_iommu(iommu, drhd) {
3082 if (drhd->ignored) {
3083 /*
3084 * we always have to disable PMRs or DMA may fail on
3085 * this device
3086 */
3087 if (force_on)
3088 iommu_disable_protect_mem_regions(iommu);
3089 continue;
3090 }
3091
3092 iommu_flush_write_buffer(iommu);
3093 iommu_set_root_entry(iommu);
3094 iommu_enable_translation(iommu);
3095 iommu_disable_protect_mem_regions(iommu);
3096 }
3097
3098 return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103 struct dmar_drhd_unit *drhd;
3104 struct intel_iommu *iommu;
3105
3106 for_each_active_iommu(iommu, drhd) {
3107 iommu->flush.flush_context(iommu, 0, 0, 0,
3108 DMA_CCMD_GLOBAL_INVL);
3109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110 DMA_TLB_GLOBAL_FLUSH);
3111 }
3112}
3113
3114static int iommu_suspend(void)
3115{
3116 struct dmar_drhd_unit *drhd;
3117 struct intel_iommu *iommu = NULL;
3118 unsigned long flag;
3119
3120 for_each_active_iommu(iommu, drhd) {
3121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122 GFP_KERNEL);
3123 if (!iommu->iommu_state)
3124 goto nomem;
3125 }
3126
3127 iommu_flush_all();
3128
3129 for_each_active_iommu(iommu, drhd) {
3130 iommu_disable_translation(iommu);
3131
3132 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135 readl(iommu->reg + DMAR_FECTL_REG);
3136 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137 readl(iommu->reg + DMAR_FEDATA_REG);
3138 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139 readl(iommu->reg + DMAR_FEADDR_REG);
3140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141 readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144 }
3145 return 0;
3146
3147nomem:
3148 for_each_active_iommu(iommu, drhd)
3149 kfree(iommu->iommu_state);
3150
3151 return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156 struct dmar_drhd_unit *drhd;
3157 struct intel_iommu *iommu = NULL;
3158 unsigned long flag;
3159
3160 if (init_iommu_hw()) {
3161 if (force_on)
3162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163 else
3164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165 return;
3166 }
3167
3168 for_each_active_iommu(iommu, drhd) {
3169
3170 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173 iommu->reg + DMAR_FECTL_REG);
3174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175 iommu->reg + DMAR_FEDATA_REG);
3176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177 iommu->reg + DMAR_FEADDR_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179 iommu->reg + DMAR_FEUADDR_REG);
3180
3181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182 }
3183
3184 for_each_active_iommu(iommu, drhd)
3185 kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189 .resume = iommu_resume,
3190 .suspend = iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195 register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200#endif /* CONFIG_SUSPEND */
3201
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206 rmrr->end_address <= rmrr->base_address ||
3207 arch_rmrr_sanity_check(rmrr))
3208 return -EINVAL;
3209
3210 return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215 struct acpi_dmar_reserved_memory *rmrr;
3216 struct dmar_rmrr_unit *rmrru;
3217
3218 rmrr = (struct acpi_dmar_reserved_memory *)header;
3219 if (rmrr_sanity_check(rmrr)) {
3220 pr_warn(FW_BUG
3221 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223 rmrr->base_address, rmrr->end_address,
3224 dmi_get_system_info(DMI_BIOS_VENDOR),
3225 dmi_get_system_info(DMI_BIOS_VERSION),
3226 dmi_get_system_info(DMI_PRODUCT_VERSION));
3227 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228 }
3229
3230 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231 if (!rmrru)
3232 goto out;
3233
3234 rmrru->hdr = header;
3235
3236 rmrru->base_address = rmrr->base_address;
3237 rmrru->end_address = rmrr->end_address;
3238
3239 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240 ((void *)rmrr) + rmrr->header.length,
3241 &rmrru->devices_cnt);
3242 if (rmrru->devices_cnt && rmrru->devices == NULL)
3243 goto free_rmrru;
3244
3245 list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247 return 0;
3248free_rmrru:
3249 kfree(rmrru);
3250out:
3251 return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256 struct dmar_atsr_unit *atsru;
3257 struct acpi_dmar_atsr *tmp;
3258
3259 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260 dmar_rcu_check()) {
3261 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262 if (atsr->segment != tmp->segment)
3263 continue;
3264 if (atsr->header.length != tmp->header.length)
3265 continue;
3266 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267 return atsru;
3268 }
3269
3270 return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275 struct acpi_dmar_atsr *atsr;
3276 struct dmar_atsr_unit *atsru;
3277
3278 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279 return 0;
3280
3281 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282 atsru = dmar_find_atsr(atsr);
3283 if (atsru)
3284 return 0;
3285
3286 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287 if (!atsru)
3288 return -ENOMEM;
3289
3290 /*
3291 * If memory is allocated from slab by ACPI _DSM method, we need to
3292 * copy the memory content because the memory buffer will be freed
3293 * on return.
3294 */
3295 atsru->hdr = (void *)(atsru + 1);
3296 memcpy(atsru->hdr, hdr, hdr->length);
3297 atsru->include_all = atsr->flags & 0x1;
3298 if (!atsru->include_all) {
3299 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300 (void *)atsr + atsr->header.length,
3301 &atsru->devices_cnt);
3302 if (atsru->devices_cnt && atsru->devices == NULL) {
3303 kfree(atsru);
3304 return -ENOMEM;
3305 }
3306 }
3307
3308 list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310 return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316 kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321 struct acpi_dmar_atsr *atsr;
3322 struct dmar_atsr_unit *atsru;
3323
3324 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325 atsru = dmar_find_atsr(atsr);
3326 if (atsru) {
3327 list_del_rcu(&atsru->list);
3328 synchronize_rcu();
3329 intel_iommu_free_atsr(atsru);
3330 }
3331
3332 return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337 int i;
3338 struct device *dev;
3339 struct acpi_dmar_atsr *atsr;
3340 struct dmar_atsr_unit *atsru;
3341
3342 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343 atsru = dmar_find_atsr(atsr);
3344 if (!atsru)
3345 return 0;
3346
3347 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349 i, dev)
3350 return -EBUSY;
3351 }
3352
3353 return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358 struct dmar_satc_unit *satcu;
3359 struct acpi_dmar_satc *tmp;
3360
3361 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362 dmar_rcu_check()) {
3363 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364 if (satc->segment != tmp->segment)
3365 continue;
3366 if (satc->header.length != tmp->header.length)
3367 continue;
3368 if (memcmp(satc, tmp, satc->header.length) == 0)
3369 return satcu;
3370 }
3371
3372 return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377 struct acpi_dmar_satc *satc;
3378 struct dmar_satc_unit *satcu;
3379
3380 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381 return 0;
3382
3383 satc = container_of(hdr, struct acpi_dmar_satc, header);
3384 satcu = dmar_find_satc(satc);
3385 if (satcu)
3386 return 0;
3387
3388 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389 if (!satcu)
3390 return -ENOMEM;
3391
3392 satcu->hdr = (void *)(satcu + 1);
3393 memcpy(satcu->hdr, hdr, hdr->length);
3394 satcu->atc_required = satc->flags & 0x1;
3395 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396 (void *)satc + satc->header.length,
3397 &satcu->devices_cnt);
3398 if (satcu->devices_cnt && !satcu->devices) {
3399 kfree(satcu);
3400 return -ENOMEM;
3401 }
3402 list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404 return 0;
3405}
3406
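/*
 * Bring up a hot-added DMAR unit: audit its capabilities, allocate its
 * domain IDs and root entry, and enable invalidation, fault reporting
 * and translation.
 */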
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409 int sp, ret;
3410 struct intel_iommu *iommu = dmaru->iommu;
3411
3412 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413 if (ret)
3414 goto out;
3415
3416 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417 pr_warn("%s: Doesn't support hardware pass through.\n",
3418 iommu->name);
3419 return -ENXIO;
3420 }
3421
3422 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424 pr_warn("%s: Doesn't support large page.\n",
3425 iommu->name);
3426 return -ENXIO;
3427 }
3428
3429 /*
3430 * Disable translation if already enabled prior to OS handover.
3431 */
3432 if (iommu->gcmd & DMA_GCMD_TE)
3433 iommu_disable_translation(iommu);
3434
3435 ret = iommu_init_domains(iommu);
3436 if (ret == 0)
3437 ret = iommu_alloc_root_entry(iommu);
3438 if (ret)
3439 goto out;
3440
3441 intel_svm_check(iommu);
3442
3443 if (dmaru->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on this device
3446 */
3447 if (force_on)
3448 iommu_disable_protect_mem_regions(iommu);
3449 return 0;
3450 }
3451
3452 intel_iommu_init_qi(iommu);
3453 iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 ret = intel_svm_enable_prq(iommu);
3458 if (ret)
3459 goto disable_iommu;
3460 }
3461#endif
3462 ret = dmar_set_interrupt(iommu);
3463 if (ret)
3464 goto disable_iommu;
3465
3466 iommu_set_root_entry(iommu);
3467 iommu_enable_translation(iommu);
3468
3469 iommu_disable_protect_mem_regions(iommu);
3470 return 0;
3471
3472disable_iommu:
3473 disable_dmar_iommu(iommu);
3474out:
3475 free_dmar_iommu(iommu);
3476 return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481 int ret = 0;
3482 struct intel_iommu *iommu = dmaru->iommu;
3483
3484 if (!intel_iommu_enabled)
3485 return 0;
3486 if (iommu == NULL)
3487 return -EINVAL;
3488
3489 if (insert) {
3490 ret = intel_iommu_add(dmaru);
3491 } else {
3492 disable_dmar_iommu(iommu);
3493 free_dmar_iommu(iommu);
3494 }
3495
3496 return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502 struct dmar_atsr_unit *atsru, *atsr_n;
3503 struct dmar_satc_unit *satcu, *satc_n;
3504
3505 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506 list_del(&rmrru->list);
3507 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508 kfree(rmrru);
3509 }
3510
3511 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512 list_del(&atsru->list);
3513 intel_iommu_free_atsr(atsru);
3514 }
3515 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516 list_del(&satcu->list);
3517 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518 kfree(satcu);
3519 }
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524 struct dmar_satc_unit *satcu;
3525 struct acpi_dmar_satc *satc;
3526 struct device *tmp;
3527 int i;
3528
3529 dev = pci_physfn(dev);
3530 rcu_read_lock();
3531
3532 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 if (satc->segment != pci_domain_nr(dev->bus))
3535 continue;
3536 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537 if (to_pci_dev(tmp) == dev)
3538 goto out;
3539 }
3540 satcu = NULL;
3541out:
3542 rcu_read_unlock();
3543 return satcu;
3544}
3545
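/*
 * Return 1 if ATS may be enabled for @dev, based on the SATC table if
 * the device is listed there, otherwise on the ATSR entry covering the
 * device's root port.
 */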
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548 int i, ret = 1;
3549 struct pci_bus *bus;
3550 struct pci_dev *bridge = NULL;
3551 struct device *tmp;
3552 struct acpi_dmar_atsr *atsr;
3553 struct dmar_atsr_unit *atsru;
3554 struct dmar_satc_unit *satcu;
3555
3556 dev = pci_physfn(dev);
3557 satcu = dmar_find_matched_satc_unit(dev);
3558 if (satcu)
3559 /*
3560		 * This device supports ATS as it is in the SATC table.
3561		 * When the IOMMU is in legacy mode, enabling ATS is done
3562		 * automatically by HW for devices that require ATS,
3563		 * hence the OS should not enable ATS on this device,
3564		 * to avoid duplicated TLB invalidations.
3565 */
3566 return !(satcu->atc_required && !sm_supported(iommu));
3567
3568 for (bus = dev->bus; bus; bus = bus->parent) {
3569 bridge = bus->self;
3570 /* If it's an integrated device, allow ATS */
3571 if (!bridge)
3572 return 1;
3573 /* Connected via non-PCIe: no ATS */
3574 if (!pci_is_pcie(bridge) ||
3575 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576 return 0;
3577 /* If we found the root port, look it up in the ATSR */
3578 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579 break;
3580 }
3581
3582 rcu_read_lock();
3583 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585 if (atsr->segment != pci_domain_nr(dev->bus))
3586 continue;
3587
3588 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589 if (tmp == &bridge->dev)
3590 goto out;
3591
3592 if (atsru->include_all)
3593 goto out;
3594 }
3595 ret = 0;
3596out:
3597 rcu_read_unlock();
3598
3599 return ret;
3600}
3601
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604 int ret;
3605 struct dmar_rmrr_unit *rmrru;
3606 struct dmar_atsr_unit *atsru;
3607 struct dmar_satc_unit *satcu;
3608 struct acpi_dmar_atsr *atsr;
3609 struct acpi_dmar_reserved_memory *rmrr;
3610 struct acpi_dmar_satc *satc;
3611
3612 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613 return 0;
3614
3615 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616 rmrr = container_of(rmrru->hdr,
3617 struct acpi_dmar_reserved_memory, header);
3618 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620 ((void *)rmrr) + rmrr->header.length,
3621 rmrr->segment, rmrru->devices,
3622 rmrru->devices_cnt);
3623 if (ret < 0)
3624 return ret;
3625 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626 dmar_remove_dev_scope(info, rmrr->segment,
3627 rmrru->devices, rmrru->devices_cnt);
3628 }
3629 }
3630
3631 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632 if (atsru->include_all)
3633 continue;
3634
3635 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638 (void *)atsr + atsr->header.length,
3639 atsr->segment, atsru->devices,
3640 atsru->devices_cnt);
3641 if (ret > 0)
3642 break;
3643 else if (ret < 0)
3644 return ret;
3645 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646 if (dmar_remove_dev_scope(info, atsr->segment,
3647 atsru->devices, atsru->devices_cnt))
3648 break;
3649 }
3650 }
3651 list_for_each_entry(satcu, &dmar_satc_units, list) {
3652 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655 (void *)satc + satc->header.length,
3656 satc->segment, satcu->devices,
3657 satcu->devices_cnt);
3658 if (ret > 0)
3659 break;
3660 else if (ret < 0)
3661 return ret;
3662 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663 if (dmar_remove_dev_scope(info, satc->segment,
3664 satcu->devices, satcu->devices_cnt))
3665 break;
3666 }
3667 }
3668
3669 return 0;
3670}
3671
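/*
 * Keep the si_domain identity map in sync with memory hotplug: map
 * memory that is going online and unmap memory that has gone offline.
 */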
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673 unsigned long val, void *v)
3674{
3675 struct memory_notify *mhp = v;
3676 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678 mhp->nr_pages - 1);
3679
3680 switch (val) {
3681 case MEM_GOING_ONLINE:
3682 if (iommu_domain_identity_map(si_domain,
3683 start_vpfn, last_vpfn)) {
3684 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685 start_vpfn, last_vpfn);
3686 return NOTIFY_BAD;
3687 }
3688 break;
3689
3690 case MEM_OFFLINE:
3691 case MEM_CANCEL_ONLINE:
3692 {
3693 struct dmar_drhd_unit *drhd;
3694 struct intel_iommu *iommu;
3695 LIST_HEAD(freelist);
3696
3697 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698
3699 rcu_read_lock();
3700 for_each_active_iommu(iommu, drhd)
3701 iommu_flush_iotlb_psi(iommu, si_domain,
3702 start_vpfn, mhp->nr_pages,
3703 list_empty(&freelist), 0);
3704 rcu_read_unlock();
3705 put_pages_list(&freelist);
3706 }
3707 break;
3708 }
3709
3710 return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714 .notifier_call = intel_iommu_memory_notifier,
3715 .priority = 0
3716};
3717
3718static void intel_disable_iommus(void)
3719{
3720 struct intel_iommu *iommu = NULL;
3721 struct dmar_drhd_unit *drhd;
3722
3723 for_each_iommu(iommu, drhd)
3724 iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729 struct dmar_drhd_unit *drhd;
3730 struct intel_iommu *iommu = NULL;
3731
3732 if (no_iommu || dmar_disabled)
3733 return;
3734
3735 down_write(&dmar_global_lock);
3736
3737 /* Disable PMRs explicitly here. */
3738 for_each_iommu(iommu, drhd)
3739 iommu_disable_protect_mem_regions(iommu);
3740
3741 /* Make sure the IOMMUs are switched off */
3742 intel_disable_iommus();
3743
3744 up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751 return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755 struct device_attribute *attr, char *buf)
3756{
3757 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759 return sprintf(buf, "%d:%d\n",
3760 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765 struct device_attribute *attr, char *buf)
3766{
3767 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768 return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773 struct device_attribute *attr, char *buf)
3774{
3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781 struct device_attribute *attr, char *buf)
3782{
3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789 struct device_attribute *attr, char *buf)
3790{
3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797 struct device_attribute *attr, char *buf)
3798{
3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801 cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806 &dev_attr_version.attr,
3807 &dev_attr_address.attr,
3808 &dev_attr_cap.attr,
3809 &dev_attr_ecap.attr,
3810 &dev_attr_domains_supported.attr,
3811 &dev_attr_domains_used.attr,
3812 NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816 .name = "intel-iommu",
3817 .attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821 &intel_iommu_group,
3822 NULL,
3823};
3824
3825static inline bool has_external_pci(void)
3826{
3827 struct pci_dev *pdev = NULL;
3828
3829 for_each_pci_dev(pdev)
3830 if (pdev->external_facing) {
3831 pci_dev_put(pdev);
3832 return true;
3833 }
3834
3835 return false;
3836}
3837
3838static int __init platform_optin_force_iommu(void)
3839{
3840 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841 return 0;
3842
3843 if (no_iommu || dmar_disabled)
3844 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846 /*
3847 * If Intel-IOMMU is disabled by default, we will apply identity
3848 * map for all devices except those marked as being untrusted.
3849 */
3850 if (dmar_disabled)
3851 iommu_set_default_passthrough(false);
3852
3853 dmar_disabled = 0;
3854 no_iommu = 0;
3855
3856 return 1;
3857}
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861 struct dmar_drhd_unit *drhd;
3862 /* To avoid a -Wunused-but-set-variable warning. */
3863 struct intel_iommu *iommu __maybe_unused;
3864 struct device *dev;
3865 int i, ret = 0;
3866
3867 for_each_active_iommu(iommu, drhd) {
3868 for_each_active_dev_scope(drhd->devices,
3869 drhd->devices_cnt, i, dev) {
3870 struct acpi_device_physical_node *pn;
3871 struct iommu_group *group;
3872 struct acpi_device *adev;
3873
3874 if (dev->bus != &acpi_bus_type)
3875 continue;
3876
3877 adev = to_acpi_device(dev);
3878 mutex_lock(&adev->physical_node_lock);
3879 list_for_each_entry(pn,
3880 &adev->physical_node_list, node) {
3881 group = iommu_group_get(pn->dev);
3882 if (group) {
3883 iommu_group_put(group);
3884 continue;
3885 }
3886
3887 ret = iommu_probe_device(pn->dev);
3888 if (ret)
3889 break;
3890 }
3891 mutex_unlock(&adev->physical_node_lock);
3892
3893 if (ret)
3894 return ret;
3895 }
3896 }
3897
3898 return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903 if (!tboot_enabled())
3904 return 0;
3905
3906 if (no_iommu || dmar_disabled)
3907 pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909 dmar_disabled = 0;
3910 no_iommu = 0;
3911
3912 return 1;
3913}
3914
3915int __init intel_iommu_init(void)
3916{
3917 int ret = -ENODEV;
3918 struct dmar_drhd_unit *drhd;
3919 struct intel_iommu *iommu;
3920
3921 /*
3922 * Intel IOMMU is required for a TXT/tboot launch or platform
3923 * opt in, so enforce that.
3924 */
3925 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926 platform_optin_force_iommu();
3927
3928 down_write(&dmar_global_lock);
3929 if (dmar_table_init()) {
3930 if (force_on)
3931 panic("tboot: Failed to initialize DMAR table\n");
3932 goto out_free_dmar;
3933 }
3934
3935 if (dmar_dev_scope_init() < 0) {
3936 if (force_on)
3937 panic("tboot: Failed to initialize DMAR device scope\n");
3938 goto out_free_dmar;
3939 }
3940
3941 up_write(&dmar_global_lock);
3942
3943 /*
3944 * The bus notifier takes the dmar_global_lock, so lockdep will
3945 * complain later when we register it under the lock.
3946 */
3947 dmar_register_bus_notifier();
3948
3949 down_write(&dmar_global_lock);
3950
3951 if (!no_iommu)
3952 intel_iommu_debugfs_init();
3953
3954 if (no_iommu || dmar_disabled) {
3955 /*
3956 * We exit the function here to ensure IOMMU's remapping and
3957 * mempool aren't setup, which means that the IOMMU's PMRs
3958 * won't be disabled via the call to init_dmars(). So disable
3959 * it explicitly here. The PMRs were setup by tboot prior to
3960 * calling SENTER, but the kernel is expected to reset/tear
3961 * down the PMRs.
3962 */
3963 if (intel_iommu_tboot_noforce) {
3964 for_each_iommu(iommu, drhd)
3965 iommu_disable_protect_mem_regions(iommu);
3966 }
3967
3968 /*
3969 * Make sure the IOMMUs are switched off, even when we
3970 * boot into a kexec kernel and the previous kernel left
3971 * them enabled
3972 */
3973 intel_disable_iommus();
3974 goto out_free_dmar;
3975 }
3976
3977 if (list_empty(&dmar_rmrr_units))
3978 pr_info("No RMRR found\n");
3979
3980 if (list_empty(&dmar_atsr_units))
3981 pr_info("No ATSR found\n");
3982
3983 if (list_empty(&dmar_satc_units))
3984 pr_info("No SATC found\n");
3985
3986 init_no_remapping_devices();
3987
3988 ret = init_dmars();
3989 if (ret) {
3990 if (force_on)
3991 panic("tboot: Failed to initialize DMARs\n");
3992 pr_err("Initialization failed\n");
3993 goto out_free_dmar;
3994 }
3995 up_write(&dmar_global_lock);
3996
3997 init_iommu_pm_ops();
3998
3999 down_read(&dmar_global_lock);
4000 for_each_active_iommu(iommu, drhd) {
4001 /*
4002 * The flush queue implementation does not perform
4003 * page-selective invalidations that are required for efficient
4004 * TLB flushes in virtual environments. The benefit of batching
4005 * is likely to be much lower than the overhead of synchronizing
4006 * the virtual and physical IOMMU page-tables.
4007 */
4008 if (cap_caching_mode(iommu->cap)) {
4009 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010 iommu_set_dma_strict();
4011 }
4012 iommu_device_sysfs_add(&iommu->iommu, NULL,
4013 intel_iommu_groups,
4014 "%s", iommu->name);
4015 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016 }
4017 up_read(&dmar_global_lock);
4018
4019 if (si_domain && !hw_pass_through)
4020 register_memory_notifier(&intel_iommu_memory_nb);
4021
4022 down_read(&dmar_global_lock);
4023 if (probe_acpi_namespace_devices())
4024 pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026 /* Finally, we enable the DMA remapping hardware. */
4027 for_each_iommu(iommu, drhd) {
4028 if (!drhd->ignored && !translation_pre_enabled(iommu))
4029 iommu_enable_translation(iommu);
4030
4031 iommu_disable_protect_mem_regions(iommu);
4032 }
4033 up_read(&dmar_global_lock);
4034
4035 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037 intel_iommu_enabled = 1;
4038
4039 return 0;
4040
4041out_free_dmar:
4042 intel_iommu_free_dmars();
4043 up_write(&dmar_global_lock);
4044 return ret;
4045}
4046
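/*
 * pci_for_each_dma_alias() callback: tear down the context entry for
 * every bus/devfn alias the device may use when issuing DMA.
 */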
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049 struct device_domain_info *info = opaque;
4050
4051 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052 return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices. If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them will possibly leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064 return;
4065
4066 pci_for_each_dma_alias(to_pci_dev(info->dev),
4067 &domain_context_clear_one_cb, info);
4068}
4069
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072 struct device_domain_info *info = dev_iommu_priv_get(dev);
4073 struct dmar_domain *domain = info->domain;
4074 struct intel_iommu *iommu = info->iommu;
4075 unsigned long flags;
4076
4077 if (!dev_is_real_dma_subdevice(info->dev)) {
4078 if (dev_is_pci(info->dev) && sm_supported(iommu))
4079 intel_pasid_tear_down_entry(iommu, info->dev,
4080 PASID_RID2PASID, false);
4081
4082 iommu_disable_pci_caps(info);
4083 domain_context_clear(info);
4084 }
4085
4086 spin_lock_irqsave(&domain->lock, flags);
4087 list_del(&info->link);
4088 spin_unlock_irqrestore(&domain->lock, flags);
4089
4090 domain_detach_iommu(domain, iommu);
4091 info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101 struct device_domain_info *info = dev_iommu_priv_get(dev);
4102 struct intel_iommu *iommu = info->iommu;
4103 unsigned long flags;
4104
4105 iommu_disable_pci_caps(info);
4106 if (!dev_is_real_dma_subdevice(dev)) {
4107 if (sm_supported(iommu))
4108 intel_pasid_tear_down_entry(iommu, dev,
4109 PASID_RID2PASID, false);
4110 else
4111 domain_context_clear(info);
4112 }
4113
4114 if (!info->domain)
4115 return;
4116
4117 spin_lock_irqsave(&info->domain->lock, flags);
4118 list_del(&info->link);
4119 spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121 domain_detach_iommu(info->domain, iommu);
4122 info->domain = NULL;
4123}
4124
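/*
 * Initialize a domain created through the IOMMU API: derive the adjusted
 * guest address width (AGAW) from the requested guest width and allocate
 * the top-level page directory.
 */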
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127 int adjust_width;
4128
4129 /* calculate AGAW */
4130 domain->gaw = guest_width;
4131 adjust_width = guestwidth_to_adjustwidth(guest_width);
4132 domain->agaw = width_to_agaw(adjust_width);
4133
4134 domain->iommu_coherency = false;
4135 domain->iommu_superpage = 0;
4136 domain->max_addr = 0;
4137
4138 /* always allocate the top pgd */
4139 domain->pgd = alloc_pgtable_page(domain->nid);
4140 if (!domain->pgd)
4141 return -ENOMEM;
4142 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143 return 0;
4144}
4145
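/*
 * Attaching a device to the statically allocated blocking domain simply
 * clears its context/PASID entries via device_block_translation(), so all
 * DMA requests without PASID from the device are blocked.
 */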
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147 struct device *dev)
4148{
4149 device_block_translation(dev);
4150 return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154 .ops = &(const struct iommu_domain_ops) {
4155 .attach_dev = blocking_domain_attach_dev,
4156 .free = intel_iommu_domain_free
4157 }
4158};
4159
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162 struct dmar_domain *dmar_domain;
4163 struct iommu_domain *domain;
4164
4165 switch (type) {
4166 case IOMMU_DOMAIN_BLOCKED:
4167 return &blocking_domain;
4168 case IOMMU_DOMAIN_DMA:
4169 case IOMMU_DOMAIN_DMA_FQ:
4170 case IOMMU_DOMAIN_UNMANAGED:
4171 dmar_domain = alloc_domain(type);
4172 if (!dmar_domain) {
4173 pr_err("Can't allocate dmar_domain\n");
4174 return NULL;
4175 }
4176 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 pr_err("Domain initialization failed\n");
4178 domain_exit(dmar_domain);
4179 return NULL;
4180 }
4181
4182 domain = &dmar_domain->domain;
4183 domain->geometry.aperture_start = 0;
4184 domain->geometry.aperture_end =
4185 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 domain->geometry.force_aperture = true;
4187
4188 return domain;
4189 case IOMMU_DOMAIN_IDENTITY:
4190 return &si_domain->domain;
4191 case IOMMU_DOMAIN_SVA:
4192 return intel_svm_domain_alloc();
4193 default:
4194 return NULL;
4195 }
4196
4197 return NULL;
4198}
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202 if (domain != &si_domain->domain && domain != &blocking_domain)
4203 domain_exit(to_dmar_domain(domain));
4204}
4205
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207 struct device *dev)
4208{
4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 struct intel_iommu *iommu;
4211 int addr_width;
4212
4213 iommu = device_to_iommu(dev, NULL, NULL);
4214 if (!iommu)
4215 return -ENODEV;
4216
4217 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218 return -EINVAL;
4219
4220 /* check if this iommu agaw is sufficient for max mapped address */
4221 addr_width = agaw_to_width(iommu->agaw);
4222 if (addr_width > cap_mgaw(iommu->cap))
4223 addr_width = cap_mgaw(iommu->cap);
4224
4225 if (dmar_domain->max_addr > (1LL << addr_width))
4226 return -EINVAL;
4227 dmar_domain->gaw = addr_width;
4228
4229 /*
4230 * Knock out extra levels of page tables if necessary
4231 */
4232 while (iommu->agaw < dmar_domain->agaw) {
4233 struct dma_pte *pte;
4234
4235 pte = dmar_domain->pgd;
4236 if (dma_pte_present(pte)) {
4237 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238 free_pgtable_page(pte);
4239 }
4240 dmar_domain->agaw--;
4241 }
4242
4243 return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247 struct device *dev)
4248{
4249 struct device_domain_info *info = dev_iommu_priv_get(dev);
4250 int ret;
4251
4252 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253 device_is_rmrr_locked(dev)) {
4254 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4255 return -EPERM;
4256 }
4257
4258 if (info->domain)
4259 device_block_translation(dev);
4260
4261 ret = prepare_domain_attach_device(domain, dev);
4262 if (ret)
4263 return ret;
4264
4265 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
4267
4268static int intel_iommu_map(struct iommu_domain *domain,
4269 unsigned long iova, phys_addr_t hpa,
4270 size_t size, int iommu_prot, gfp_t gfp)
4271{
4272 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273 u64 max_addr;
4274 int prot = 0;
4275
4276 if (iommu_prot & IOMMU_READ)
4277 prot |= DMA_PTE_READ;
4278 if (iommu_prot & IOMMU_WRITE)
4279 prot |= DMA_PTE_WRITE;
4280 if (dmar_domain->set_pte_snp)
4281 prot |= DMA_PTE_SNP;
4282
4283 max_addr = iova + size;
4284 if (dmar_domain->max_addr < max_addr) {
4285 u64 end;
4286
4287 /* check if minimum agaw is sufficient for mapped address */
4288 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289 if (end < max_addr) {
4290 pr_err("%s: iommu width (%d) is not "
4291 "sufficient for the mapped address (%llx)\n",
4292 __func__, dmar_domain->gaw, max_addr);
4293 return -EFAULT;
4294 }
4295 dmar_domain->max_addr = max_addr;
4296 }
4297 /* Round up size to next multiple of PAGE_SIZE, if it and
4298 the low bits of hpa would take us onto the next page */
4299 size = aligned_nrpages(hpa, size);
4300 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301 hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
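/*
 * Map a (pgsize, pgcount) run for the IOMMU core. Only page sizes the
 * hardware can use directly (4KiB, 2MiB, 1GiB) are accepted, and both the
 * IOVA and the physical address must be aligned to that size. For example,
 * pgsize = SZ_2M with pgcount = 4 maps one contiguous 8MiB range.
 */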
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305 unsigned long iova, phys_addr_t paddr,
4306 size_t pgsize, size_t pgcount,
4307 int prot, gfp_t gfp, size_t *mapped)
4308{
4309 unsigned long pgshift = __ffs(pgsize);
4310 size_t size = pgcount << pgshift;
4311 int ret;
4312
4313 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314 return -EINVAL;
4315
4316 if (!IS_ALIGNED(iova | paddr, pgsize))
4317 return -EINVAL;
4318
4319 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320 if (!ret && mapped)
4321 *mapped = size;
4322
4323 return ret;
4324}
4325
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327 unsigned long iova, size_t size,
4328 struct iommu_iotlb_gather *gather)
4329{
4330 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331 unsigned long start_pfn, last_pfn;
4332 int level = 0;
4333
4334 /* Cope with horrid API which requires us to unmap more than the
4335 size argument if it happens to be a large-page mapping. */
4336 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341 start_pfn = iova >> VTD_PAGE_SHIFT;
4342 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346 if (dmar_domain->max_addr == iova + size)
4347 dmar_domain->max_addr = iova;
4348
4349 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351 return size;
4352}
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355 unsigned long iova,
4356 size_t pgsize, size_t pgcount,
4357 struct iommu_iotlb_gather *gather)
4358{
4359 unsigned long pgshift = __ffs(pgsize);
4360 size_t size = pgcount << pgshift;
4361
4362 return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
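/*
 * Flush the IOTLB for the range collected in @gather on every IOMMU this
 * domain is attached to, then free the page-table pages that the preceding
 * unmap calls queued on gather->freelist.
 */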
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366 struct iommu_iotlb_gather *gather)
4367{
4368 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369 unsigned long iova_pfn = IOVA_PFN(gather->start);
4370 size_t size = gather->end - gather->start;
4371 struct iommu_domain_info *info;
4372 unsigned long start_pfn;
4373 unsigned long nrpages;
4374 unsigned long i;
4375
4376 nrpages = aligned_nrpages(gather->start, size);
4377 start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379 xa_for_each(&dmar_domain->iommu_array, i, info)
4380 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381 start_pfn, nrpages,
4382 list_empty(&gather->freelist), 0);
4383
4384 put_pages_list(&gather->freelist);
4385}
4386
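/*
 * Translate an IOVA to a physical address by walking the page table. The
 * offset within the final (possibly super-) page is preserved so callers
 * get a byte-accurate address back.
 */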
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388 dma_addr_t iova)
4389{
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct dma_pte *pte;
4392 int level = 0;
4393 u64 phys = 0;
4394
4395 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396 if (pte && dma_pte_present(pte))
4397 phys = dma_pte_addr(pte) +
4398 (iova & (BIT_MASK(level_to_offset_bits(level) +
4399 VTD_PAGE_SHIFT) - 1));
4400
4401 return phys;
4402}
4403
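/*
 * Force-snooping can only be enforced if every IOMMU that currently has a
 * device attached to this domain supports snoop control (ecap.SC).
 */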
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406 struct device_domain_info *info;
4407 bool support = true;
4408
4409 assert_spin_locked(&domain->lock);
4410 list_for_each_entry(info, &domain->devices, link) {
4411 if (!ecap_sc_support(info->iommu->ecap)) {
4412 support = false;
4413 break;
4414 }
4415 }
4416
4417 return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422 struct device_domain_info *info;
4423
4424 assert_spin_locked(&domain->lock);
4425 /*
4426 * Second level page table supports per-PTE snoop control. The
4427 * iommu_map() interface will handle this by setting SNP bit.
4428 */
4429 if (!domain->use_first_level) {
4430 domain->set_pte_snp = true;
4431 return;
4432 }
4433
4434 list_for_each_entry(info, &domain->devices, link)
4435 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436 PASID_RID2PASID);
4437}
4438
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442 unsigned long flags;
4443
4444 if (dmar_domain->force_snooping)
4445 return true;
4446
4447 spin_lock_irqsave(&dmar_domain->lock, flags);
4448 if (!domain_support_force_snooping(dmar_domain)) {
4449 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450 return false;
4451 }
4452
4453 domain_set_force_snooping(dmar_domain);
4454 dmar_domain->force_snooping = true;
4455 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457 return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462 struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464 switch (cap) {
4465 case IOMMU_CAP_CACHE_COHERENCY:
4466 return true;
4467 case IOMMU_CAP_INTR_REMAP:
4468 return irq_remapping_enabled == 1;
4469 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470 return dmar_platform_optin();
4471 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472 return ecap_sc_support(info->iommu->ecap);
4473 default:
4474 return false;
4475 }
4476}
4477
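/*
 * Called when the IOMMU core first sees a device: allocate its
 * device_domain_info, record bus/devfn/segment, probe the ATS, PASID and
 * PRI capabilities, and allocate a PASID table when scalable mode is used.
 */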
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481 struct device_domain_info *info;
4482 struct intel_iommu *iommu;
4483 u8 bus, devfn;
4484 int ret;
4485
4486 iommu = device_to_iommu(dev, &bus, &devfn);
4487 if (!iommu || !iommu->iommu.ops)
4488 return ERR_PTR(-ENODEV);
4489
4490 info = kzalloc(sizeof(*info), GFP_KERNEL);
4491 if (!info)
4492 return ERR_PTR(-ENOMEM);
4493
4494 if (dev_is_real_dma_subdevice(dev)) {
4495 info->bus = pdev->bus->number;
4496 info->devfn = pdev->devfn;
4497 info->segment = pci_domain_nr(pdev->bus);
4498 } else {
4499 info->bus = bus;
4500 info->devfn = devfn;
4501 info->segment = iommu->segment;
4502 }
4503
4504 info->dev = dev;
4505 info->iommu = iommu;
4506 if (dev_is_pci(dev)) {
4507 if (ecap_dev_iotlb_support(iommu->ecap) &&
4508 pci_ats_supported(pdev) &&
4509 dmar_ats_supported(pdev, iommu)) {
4510 info->ats_supported = 1;
4511 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512 }
4513 if (sm_supported(iommu)) {
4514 if (pasid_supported(iommu)) {
4515 int features = pci_pasid_features(pdev);
4516
4517 if (features >= 0)
4518 info->pasid_supported = features | 1;
4519 }
4520
4521 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522 pci_pri_supported(pdev))
4523 info->pri_supported = 1;
4524 }
4525 }
4526
4527 dev_iommu_priv_set(dev, info);
4528
4529 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530 ret = intel_pasid_alloc_table(dev);
4531 if (ret) {
4532 dev_err(dev, "PASID table allocation failed\n");
4533 dev_iommu_priv_set(dev, NULL);
4534 kfree(info);
4535 return ERR_PTR(ret);
4536 }
4537 }
4538
4539 return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544 struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546 dmar_remove_one_dev_info(dev);
4547 intel_pasid_free_table(dev);
4548 dev_iommu_priv_set(dev, NULL);
4549 kfree(info);
4550 set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555 set_dma_ops(dev, NULL);
4556 iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
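/*
 * Report reserved regions for a device: any RMRRs that target it directly
 * or through a PCI bridge, the legacy 0-16MiB direct-mapped window for ISA
 * bridges when the floppy workaround is configured, and the IOAPIC/MSI
 * range.
 */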
4559static void intel_iommu_get_resv_regions(struct device *device,
4560 struct list_head *head)
4561{
4562 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563 struct iommu_resv_region *reg;
4564 struct dmar_rmrr_unit *rmrr;
4565 struct device *i_dev;
4566 int i;
4567
4568 rcu_read_lock();
4569 for_each_rmrr_units(rmrr) {
4570 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571 i, i_dev) {
4572 struct iommu_resv_region *resv;
4573 enum iommu_resv_type type;
4574 size_t length;
4575
4576 if (i_dev != device &&
4577 !is_downstream_to_pci_bridge(device, i_dev))
4578 continue;
4579
4580 length = rmrr->end_address - rmrr->base_address + 1;
4581
4582 type = device_rmrr_is_relaxable(device) ?
4583 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585 resv = iommu_alloc_resv_region(rmrr->base_address,
4586 length, prot, type,
4587 GFP_ATOMIC);
4588 if (!resv)
4589 break;
4590
4591 list_add_tail(&resv->list, head);
4592 }
4593 }
4594 rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597 if (dev_is_pci(device)) {
4598 struct pci_dev *pdev = to_pci_dev(device);
4599
4600 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602 IOMMU_RESV_DIRECT_RELAXABLE,
4603 GFP_KERNEL);
4604 if (reg)
4605				list_add_tail(&reg->list, head);
4606 }
4607 }
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612 0, IOMMU_RESV_MSI, GFP_KERNEL);
4613 if (!reg)
4614 return;
4615	list_add_tail(&reg->list, head);
4616}
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620 if (dev_is_pci(dev))
4621 return pci_device_group(dev);
4622 return generic_device_group(dev);
4623}
4624
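/*
 * SVA requires an SVM-capable IOMMU and a device that already has PASID,
 * PRI and ATS enabled; on success the device is added to the IOMMU's I/O
 * page fault queue and iommu_queue_iopf() is registered as its fault
 * handler.
 */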
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627 struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 struct intel_iommu *iommu;
4629 int ret;
4630
4631 if (!info || dmar_disabled)
4632 return -EINVAL;
4633
4634 iommu = info->iommu;
4635 if (!iommu)
4636 return -EINVAL;
4637
4638 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639 return -ENODEV;
4640
4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 return -EINVAL;
4643
4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 if (!ret)
4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648 return ret;
4649}
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653 struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 struct intel_iommu *iommu = info->iommu;
4655 int ret;
4656
4657 ret = iommu_unregister_device_fault_handler(dev);
4658 if (!ret)
4659 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661 return ret;
4662}
4663
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666 struct device_domain_info *info = dev_iommu_priv_get(dev);
4667
4668 if (info && info->pri_supported)
4669 return 0;
4670
4671 return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677 switch (feat) {
4678 case IOMMU_DEV_FEAT_IOPF:
4679 return intel_iommu_enable_iopf(dev);
4680
4681 case IOMMU_DEV_FEAT_SVA:
4682 return intel_iommu_enable_sva(dev);
4683
4684 default:
4685 return -ENODEV;
4686 }
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692 switch (feat) {
4693 case IOMMU_DEV_FEAT_IOPF:
4694 return 0;
4695
4696 case IOMMU_DEV_FEAT_SVA:
4697 return intel_iommu_disable_sva(dev);
4698
4699 default:
4700 return -ENODEV;
4701 }
4702}
4703
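/*
 * Defer the real attach for devices behind an IOMMU whose translation was
 * already enabled when this driver took over (e.g. after kexec/kdump) and
 * that have not yet been attached to a domain, so any inherited mappings
 * keep working until a driver claims the device.
 */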
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706 struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708 return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718 if (pdev->untrusted) {
4719 pci_info(pdev,
4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 pdev->vendor, pdev->device);
4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 return true;
4724 }
4725 return false;
4726}
4727
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 unsigned long iova, size_t size)
4730{
4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 unsigned long pages = aligned_nrpages(iova, size);
4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 struct iommu_domain_info *info;
4735 unsigned long i;
4736
4737 xa_for_each(&dmar_domain->iommu_array, i, info)
4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744 struct iommu_domain *domain;
4745
4746 /* Domain type specific cleanup: */
4747 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748 if (domain) {
4749 switch (domain->type) {
4750 case IOMMU_DOMAIN_SVA:
4751 intel_svm_remove_dev_pasid(dev, pasid);
4752 break;
4753 default:
4754 /* should never reach here */
4755 WARN_ON(1);
4756 break;
4757 }
4758 }
4759
4760 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
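/*
 * The iommu_ops instance registered with the IOMMU core for every DMAR
 * unit; default_domain_ops provides the per-domain map/unmap/flush hooks.
 */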
4763const struct iommu_ops intel_iommu_ops = {
4764 .capable = intel_iommu_capable,
4765 .domain_alloc = intel_iommu_domain_alloc,
4766 .probe_device = intel_iommu_probe_device,
4767 .probe_finalize = intel_iommu_probe_finalize,
4768 .release_device = intel_iommu_release_device,
4769 .get_resv_regions = intel_iommu_get_resv_regions,
4770 .device_group = intel_iommu_device_group,
4771 .dev_enable_feat = intel_iommu_dev_enable_feat,
4772 .dev_disable_feat = intel_iommu_dev_disable_feat,
4773 .is_attach_deferred = intel_iommu_is_attach_deferred,
4774 .def_domain_type = device_def_domain_type,
4775 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4776 .pgsize_bitmap = SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778 .page_response = intel_svm_page_response,
4779#endif
4780 .default_domain_ops = &(const struct iommu_domain_ops) {
4781 .attach_dev = intel_iommu_attach_device,
4782 .map_pages = intel_iommu_map_pages,
4783 .unmap_pages = intel_iommu_unmap_pages,
4784 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4785 .flush_iotlb_all = intel_flush_iotlb_all,
4786 .iotlb_sync = intel_iommu_tlb_sync,
4787 .iova_to_phys = intel_iommu_iova_to_phys,
4788 .free = intel_iommu_domain_free,
4789 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790 }
4791};
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795 if (risky_device(dev))
4796 return;
4797
4798 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799 dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839 if (risky_device(dev))
4840 return;
4841
4842 /*
4843 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844 * but needs it. Same seems to hold for the desktop versions.
4845 */
4846 pci_info(dev, "Forcing write-buffer flush capability\n");
4847 rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
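/*
 * GGC is the graphics control register in the host bridge (PCI config
 * offset 0x52); bits 11:8 encode how much memory the BIOS set aside for
 * the GTT and whether a VT-d (shadow GTT) variant was enabled.
 */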
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4867
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870 unsigned short ggc;
4871
4872 if (risky_device(dev))
4873 return;
4874
4875 if (pci_read_config_word(dev, GGC, &ggc))
4876 return;
4877
4878 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880 dmar_map_gfx = 0;
4881 } else if (dmar_map_gfx) {
4882 /* we have to ensure the gfx device is idle before we flush */
4883 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884 iommu_set_dma_strict();
4885 }
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
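/*
 * For certain integrated graphics generations (matched by the upper byte
 * of the PCI device ID), skip clearing the translation-enable bit when the
 * IOMMU would otherwise be disabled; iommu_skip_te_disable is checked by
 * the code that disables translation.
 */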
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894 unsigned short ver;
4895
4896 if (!IS_GFX_DEVICE(dev))
4897 return;
4898
4899 ver = (dev->device >> 8) & 0xff;
4900 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902 ver != 0x9a && ver != 0xa7)
4903 return;
4904
4905 if (risky_device(dev))
4906 return;
4907
4908 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909 iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914 ISOCH DMAR unit for the Azalia sound device, but not give it any
4915 TLB entries, which causes it to deadlock. Check for that. We do
4916 this in a function called from init_dmars(), instead of in a PCI
4917 quirk, because we don't want to print the obnoxious "BIOS broken"
4918 message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922 struct pci_dev *pdev;
4923 uint32_t vtisochctrl;
4924
4925 /* If there's no Azalia in the system anyway, forget it. */
4926 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927 if (!pdev)
4928 return;
4929
4930 if (risky_device(pdev)) {
4931 pci_dev_put(pdev);
4932 return;
4933 }
4934
4935 pci_dev_put(pdev);
4936
4937 /* System Management Registers. Might be hidden, in which case
4938 we can't do the sanity check. But that's OK, because the
4939 known-broken BIOSes _don't_ actually hide it, so far. */
4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941 if (!pdev)
4942 return;
4943
4944 if (risky_device(pdev)) {
4945 pci_dev_put(pdev);
4946 return;
4947 }
4948
4949 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950 pci_dev_put(pdev);
4951 return;
4952 }
4953
4954 pci_dev_put(pdev);
4955
4956 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957 if (vtisochctrl & 1)
4958 return;
4959
4960 /* Drop all bits other than the number of TLB entries */
4961 vtisochctrl &= 0x1c;
4962
4963 /* If we have the recommended number of TLB entries (16), fine. */
4964 if (vtisochctrl == 0x10)
4965 return;
4966
4967 /* Zero TLB entries? You get to ride the short bus to school. */
4968 if (!vtisochctrl) {
4969 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971 dmi_get_system_info(DMI_BIOS_VENDOR),
4972 dmi_get_system_info(DMI_BIOS_VERSION),
4973 dmi_get_system_info(DMI_PRODUCT_VERSION));
4974 iommu_identity_mapping |= IDENTMAP_AZALIA;
4975 return;
4976 }
4977
4978 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979 vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where the device may inadvertently issue
4984 * an ATS invalidation completion before posted writes that used translations
4985 * matching the invalidation address range have completed, violating the
4986 * invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4988 * vulnerable to this defect. In other words, any dTLB invalidation that is not
4989 * initiated under the control of the trusted/privileged host device driver must
4990 * use this quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does DMA API unmap IOVA
4993 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4995 * exit_mmap() due to crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4997 * VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5003 * invalidate TLB the same way as normal user unmap which will use this quirk.
5004 * The dTLB invalidation after PASID cache flush does not need this quirk.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009 unsigned long address, unsigned long mask,
5010 u32 pasid, u16 qdep)
5011{
5012 u16 sid;
5013
5014 if (likely(!info->dtlb_extra_inval))
5015 return;
5016
5017 sid = PCI_DEVID(info->bus, info->devfn);
5018 if (pasid == PASID_RID2PASID) {
5019 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020 qdep, address, mask);
5021 } else {
5022 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023 pasid, qdep, address, mask);
5024 }
5025}
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/init.h>
17#include <linux/bitmap.h>
18#include <linux/debugfs.h>
19#include <linux/export.h>
20#include <linux/slab.h>
21#include <linux/irq.h>
22#include <linux/interrupt.h>
23#include <linux/spinlock.h>
24#include <linux/pci.h>
25#include <linux/dmar.h>
26#include <linux/dma-mapping.h>
27#include <linux/mempool.h>
28#include <linux/memory.h>
29#include <linux/cpu.h>
30#include <linux/timer.h>
31#include <linux/io.h>
32#include <linux/iova.h>
33#include <linux/iommu.h>
34#include <linux/intel-iommu.h>
35#include <linux/syscore_ops.h>
36#include <linux/tboot.h>
37#include <linux/dmi.h>
38#include <linux/pci-ats.h>
39#include <linux/memblock.h>
40#include <linux/dma-contiguous.h>
41#include <linux/dma-direct.h>
42#include <linux/crash_dump.h>
43#include <linux/numa.h>
44#include <linux/swiotlb.h>
45#include <asm/irq_remapping.h>
46#include <asm/cacheflush.h>
47#include <asm/iommu.h>
48#include <trace/events/intel_iommu.h>
49
50#include "../irq_remapping.h"
51#include "pasid.h"
52
53#define ROOT_SIZE VTD_PAGE_SIZE
54#define CONTEXT_SIZE VTD_PAGE_SIZE
55
56#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61#define IOAPIC_RANGE_START (0xfee00000)
62#define IOAPIC_RANGE_END (0xfeefffff)
63#define IOVA_START_ADDR (0x1000)
64
65#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67#define MAX_AGAW_WIDTH 64
68#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79/* IO virtual address start page frame number */
80#define IOVA_START_PFN (1)
81
82#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
83
84/* page table handling */
85#define LEVEL_STRIDE (9)
86#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87
88/*
89 * This bitmap is used to advertise the page sizes our hardware support
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
92 * that we support.
93 *
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
97 *
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
100 *
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
103 */
104#define INTEL_IOMMU_PGSIZES (~0xFFFUL)
105
106static inline int agaw_to_level(int agaw)
107{
108 return agaw + 2;
109}
110
111static inline int agaw_to_width(int agaw)
112{
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114}
115
116static inline int width_to_agaw(int width)
117{
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119}
120
121static inline unsigned int level_to_offset_bits(int level)
122{
123 return (level - 1) * LEVEL_STRIDE;
124}
125
126static inline int pfn_level_offset(u64 pfn, int level)
127{
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129}
130
131static inline u64 level_mask(int level)
132{
133 return -1ULL << level_to_offset_bits(level);
134}
135
136static inline u64 level_size(int level)
137{
138 return 1ULL << level_to_offset_bits(level);
139}
140
141static inline u64 align_to_level(u64 pfn, int level)
142{
143 return (pfn + level_size(level) - 1) & level_mask(level);
144}
145
146static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147{
148 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149}
150
151/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154{
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156}
157
158static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159{
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161}
162static inline unsigned long page_to_dma_pfn(struct page *pg)
163{
164 return mm_to_dma_pfn(page_to_pfn(pg));
165}
166static inline unsigned long virt_to_dma_pfn(void *p)
167{
168 return page_to_dma_pfn(virt_to_page(p));
169}
170
171/* global iommu list, set NULL for ignored DMAR units */
172static struct intel_iommu **g_iommus;
173
174static void __init check_tylersburg_isoch(void);
175static int rwbf_quirk;
176
177/*
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
180 */
181static int force_on = 0;
182int intel_iommu_tboot_noforce;
183static int no_platform_optin;
184
185#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187/*
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 * if marked present.
190 */
191static phys_addr_t root_entry_lctp(struct root_entry *re)
192{
193 if (!(re->lo & 1))
194 return 0;
195
196 return re->lo & VTD_PAGE_MASK;
197}
198
199/*
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 * if marked present.
202 */
203static phys_addr_t root_entry_uctp(struct root_entry *re)
204{
205 if (!(re->hi & 1))
206 return 0;
207
208 return re->hi & VTD_PAGE_MASK;
209}
210
211static inline void context_clear_pasid_enable(struct context_entry *context)
212{
213 context->lo &= ~(1ULL << 11);
214}
215
216static inline bool context_pasid_enabled(struct context_entry *context)
217{
218 return !!(context->lo & (1ULL << 11));
219}
220
221static inline void context_set_copied(struct context_entry *context)
222{
223 context->hi |= (1ull << 3);
224}
225
226static inline bool context_copied(struct context_entry *context)
227{
228 return !!(context->hi & (1ULL << 3));
229}
230
231static inline bool __context_present(struct context_entry *context)
232{
233 return (context->lo & 1);
234}
235
236bool context_present(struct context_entry *context)
237{
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
241}
242
243static inline void context_set_present(struct context_entry *context)
244{
245 context->lo |= 1;
246}
247
248static inline void context_set_fault_enable(struct context_entry *context)
249{
250 context->lo &= (((u64)-1) << 2) | 1;
251}
252
253static inline void context_set_translation_type(struct context_entry *context,
254 unsigned long value)
255{
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
258}
259
260static inline void context_set_address_root(struct context_entry *context,
261 unsigned long value)
262{
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
265}
266
267static inline void context_set_address_width(struct context_entry *context,
268 unsigned long value)
269{
270 context->hi |= value & 7;
271}
272
273static inline void context_set_domain_id(struct context_entry *context,
274 unsigned long value)
275{
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
277}
278
279static inline int context_domain_id(struct context_entry *c)
280{
281 return((c->hi >> 8) & 0xffff);
282}
283
284static inline void context_clear_entry(struct context_entry *context)
285{
286 context->lo = 0;
287 context->hi = 0;
288}
289
290/*
291 * This domain is a statically identity mapping domain.
292 * 1. This domain creats a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu mapps to this domain if successful.
295 */
296static struct dmar_domain *si_domain;
297static int hw_pass_through = 1;
298
299#define for_each_domain_iommu(idx, domain) \
300 for (idx = 0; idx < g_num_of_iommus; idx++) \
301 if (domain->iommu_refcnt[idx])
302
303struct dmar_rmrr_unit {
304 struct list_head list; /* list of rmrr units */
305 struct acpi_dmar_header *hdr; /* ACPI header */
306 u64 base_address; /* reserved base address*/
307 u64 end_address; /* reserved end address */
308 struct dmar_dev_scope *devices; /* target devices */
309 int devices_cnt; /* target device count */
310};
311
312struct dmar_atsr_unit {
313 struct list_head list; /* list of ATSR units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 struct dmar_dev_scope *devices; /* target devices */
316 int devices_cnt; /* target device count */
317 u8 include_all:1; /* include all ports */
318};
319
320static LIST_HEAD(dmar_atsr_units);
321static LIST_HEAD(dmar_rmrr_units);
322
323#define for_each_rmrr_units(rmrr) \
324 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
325
326/* bitmap for indexing intel_iommus */
327static int g_num_of_iommus;
328
329static void domain_exit(struct dmar_domain *domain);
330static void domain_remove_dev_info(struct dmar_domain *domain);
331static void dmar_remove_one_dev_info(struct device *dev);
332static void __dmar_remove_one_dev_info(struct device_domain_info *info);
333static int intel_iommu_attach_device(struct iommu_domain *domain,
334 struct device *dev);
335static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
336 dma_addr_t iova);
337
338#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
339int dmar_disabled = 0;
340#else
341int dmar_disabled = 1;
342#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
343
344#ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
345int intel_iommu_sm = 1;
346#else
347int intel_iommu_sm;
348#endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
349
350int intel_iommu_enabled = 0;
351EXPORT_SYMBOL_GPL(intel_iommu_enabled);
352
353static int dmar_map_gfx = 1;
354static int dmar_forcedac;
355static int intel_iommu_strict;
356static int intel_iommu_superpage = 1;
357static int iommu_identity_mapping;
358static int intel_no_bounce;
359static int iommu_skip_te_disable;
360
361#define IDENTMAP_GFX 2
362#define IDENTMAP_AZALIA 4
363
364int intel_iommu_gfx_mapped;
365EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
366
367#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
368struct device_domain_info *get_domain_info(struct device *dev)
369{
370 struct device_domain_info *info;
371
372 if (!dev)
373 return NULL;
374
375 info = dev_iommu_priv_get(dev);
376 if (unlikely(info == DEFER_DEVICE_DOMAIN_INFO))
377 return NULL;
378
379 return info;
380}
381
382DEFINE_SPINLOCK(device_domain_lock);
383static LIST_HEAD(device_domain_list);
384
385#define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
386 to_pci_dev(d)->untrusted)
387
388/*
389 * Iterate over elements in device_domain_list and call the specified
390 * callback @fn against each element.
391 */
392int for_each_device_domain(int (*fn)(struct device_domain_info *info,
393 void *data), void *data)
394{
395 int ret = 0;
396 unsigned long flags;
397 struct device_domain_info *info;
398
399 spin_lock_irqsave(&device_domain_lock, flags);
400 list_for_each_entry(info, &device_domain_list, global) {
401 ret = fn(info, data);
402 if (ret) {
403 spin_unlock_irqrestore(&device_domain_lock, flags);
404 return ret;
405 }
406 }
407 spin_unlock_irqrestore(&device_domain_lock, flags);
408
409 return 0;
410}
411
412const struct iommu_ops intel_iommu_ops;
413
414static bool translation_pre_enabled(struct intel_iommu *iommu)
415{
416 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
417}
418
419static void clear_translation_pre_enabled(struct intel_iommu *iommu)
420{
421 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
422}
423
424static void init_translation_status(struct intel_iommu *iommu)
425{
426 u32 gsts;
427
428 gsts = readl(iommu->reg + DMAR_GSTS_REG);
429 if (gsts & DMA_GSTS_TES)
430 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
431}
432
433static int __init intel_iommu_setup(char *str)
434{
435 if (!str)
436 return -EINVAL;
437 while (*str) {
438 if (!strncmp(str, "on", 2)) {
439 dmar_disabled = 0;
440 pr_info("IOMMU enabled\n");
441 } else if (!strncmp(str, "off", 3)) {
442 dmar_disabled = 1;
443 no_platform_optin = 1;
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
446 dmar_map_gfx = 0;
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
450 dmar_forcedac = 1;
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_on", 5)) {
458 pr_info("Intel-IOMMU: scalable mode supported\n");
459 intel_iommu_sm = 1;
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
461 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
462 intel_iommu_tboot_noforce = 1;
463 } else if (!strncmp(str, "nobounce", 8)) {
464 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
465 intel_no_bounce = 1;
466 }
467
468 str += strcspn(str, ",");
469 while (*str == ',')
470 str++;
471 }
472 return 0;
473}
474__setup("intel_iommu=", intel_iommu_setup);
475
476static struct kmem_cache *iommu_domain_cache;
477static struct kmem_cache *iommu_devinfo_cache;
478
479static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
480{
481 struct dmar_domain **domains;
482 int idx = did >> 8;
483
484 domains = iommu->domains[idx];
485 if (!domains)
486 return NULL;
487
488 return domains[did & 0xff];
489}
490
491static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
492 struct dmar_domain *domain)
493{
494 struct dmar_domain **domains;
495 int idx = did >> 8;
496
497 if (!iommu->domains[idx]) {
498 size_t size = 256 * sizeof(struct dmar_domain *);
499 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 }
501
502 domains = iommu->domains[idx];
503 if (WARN_ON(!domains))
504 return;
505 else
506 domains[did & 0xff] = domain;
507}
508
509void *alloc_pgtable_page(int node)
510{
511 struct page *page;
512 void *vaddr = NULL;
513
514 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
515 if (page)
516 vaddr = page_address(page);
517 return vaddr;
518}
519
520void free_pgtable_page(void *vaddr)
521{
522 free_page((unsigned long)vaddr);
523}
524
525static inline void *alloc_domain_mem(void)
526{
527 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528}
529
530static void free_domain_mem(void *vaddr)
531{
532 kmem_cache_free(iommu_domain_cache, vaddr);
533}
534
535static inline void * alloc_devinfo_mem(void)
536{
537 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538}
539
540static inline void free_devinfo_mem(void *vaddr)
541{
542 kmem_cache_free(iommu_devinfo_cache, vaddr);
543}
544
545static inline int domain_type_is_si(struct dmar_domain *domain)
546{
547 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
548}
549
550static inline bool domain_use_first_level(struct dmar_domain *domain)
551{
552 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
553}
554
555static inline int domain_pfn_supported(struct dmar_domain *domain,
556 unsigned long pfn)
557{
558 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561}
562
563static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564{
565 unsigned long sagaw;
566 int agaw = -1;
567
568 sagaw = cap_sagaw(iommu->cap);
569 for (agaw = width_to_agaw(max_gaw);
570 agaw >= 0; agaw--) {
571 if (test_bit(agaw, &sagaw))
572 break;
573 }
574
575 return agaw;
576}
577
578/*
579 * Calculate max SAGAW for each iommu.
580 */
581int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582{
583 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584}
585
586/*
587 * calculate agaw for each iommu.
588 * "SAGAW" may be different across iommus, use a default agaw, and
589 * get a supported less agaw for iommus that don't support the default agaw.
590 */
591int iommu_calculate_agaw(struct intel_iommu *iommu)
592{
593 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594}
595
596/* This functionin only returns single iommu in a domain */
597struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598{
599 int iommu_id;
600
601 /* si_domain and vm domain should not get here. */
602 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603 return NULL;
604
605 for_each_domain_iommu(iommu_id, domain)
606 break;
607
608 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609 return NULL;
610
611 return g_iommus[iommu_id];
612}
613
614static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
615{
616 return sm_supported(iommu) ?
617 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
618}
619
620static void domain_update_iommu_coherency(struct dmar_domain *domain)
621{
622 struct dmar_drhd_unit *drhd;
623 struct intel_iommu *iommu;
624 bool found = false;
625 int i;
626
627 domain->iommu_coherency = 1;
628
629 for_each_domain_iommu(i, domain) {
630 found = true;
631 if (!iommu_paging_structure_coherency(g_iommus[i])) {
632 domain->iommu_coherency = 0;
633 break;
634 }
635 }
636 if (found)
637 return;
638
639 /* No hardware attached; use lowest common denominator */
640 rcu_read_lock();
641 for_each_active_iommu(iommu, drhd) {
642 if (!iommu_paging_structure_coherency(iommu)) {
643 domain->iommu_coherency = 0;
644 break;
645 }
646 }
647 rcu_read_unlock();
648}
649
650static int domain_update_iommu_snooping(struct intel_iommu *skip)
651{
652 struct dmar_drhd_unit *drhd;
653 struct intel_iommu *iommu;
654 int ret = 1;
655
656 rcu_read_lock();
657 for_each_active_iommu(iommu, drhd) {
658 if (iommu != skip) {
659 if (!ecap_sc_support(iommu->ecap)) {
660 ret = 0;
661 break;
662 }
663 }
664 }
665 rcu_read_unlock();
666
667 return ret;
668}
669
670static int domain_update_iommu_superpage(struct dmar_domain *domain,
671 struct intel_iommu *skip)
672{
673 struct dmar_drhd_unit *drhd;
674 struct intel_iommu *iommu;
675 int mask = 0x3;
676
677 if (!intel_iommu_superpage) {
678 return 0;
679 }
680
681 /* set iommu_superpage to the smallest common denominator */
682 rcu_read_lock();
683 for_each_active_iommu(iommu, drhd) {
684 if (iommu != skip) {
685 if (domain && domain_use_first_level(domain)) {
686 if (!cap_fl1gp_support(iommu->cap))
687 mask = 0x1;
688 } else {
689 mask &= cap_super_page_val(iommu->cap);
690 }
691
692 if (!mask)
693 break;
694 }
695 }
696 rcu_read_unlock();
697
698 return fls(mask);
699}
700
701/* Some capabilities may be different across iommus */
702static void domain_update_iommu_cap(struct dmar_domain *domain)
703{
704 domain_update_iommu_coherency(domain);
705 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
706 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
707}
708
709struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
710 u8 devfn, int alloc)
711{
712 struct root_entry *root = &iommu->root_entry[bus];
713 struct context_entry *context;
714 u64 *entry;
715
716 entry = &root->lo;
717 if (sm_supported(iommu)) {
718 if (devfn >= 0x80) {
719 devfn -= 0x80;
720 entry = &root->hi;
721 }
722 devfn *= 2;
723 }
724 if (*entry & 1)
725 context = phys_to_virt(*entry & VTD_PAGE_MASK);
726 else {
727 unsigned long phy_addr;
728 if (!alloc)
729 return NULL;
730
731 context = alloc_pgtable_page(iommu->node);
732 if (!context)
733 return NULL;
734
735 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
736 phy_addr = virt_to_phys((void *)context);
737 *entry = phy_addr | 1;
738 __iommu_flush_cache(iommu, entry, sizeof(*entry));
739 }
740 return &context[devfn];
741}
742
743static bool attach_deferred(struct device *dev)
744{
745 return dev_iommu_priv_get(dev) == DEFER_DEVICE_DOMAIN_INFO;
746}
747
748/**
749 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
750 * sub-hierarchy of a candidate PCI-PCI bridge
751 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
752 * @bridge: the candidate PCI-PCI bridge
753 *
754 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
755 */
756static bool
757is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
758{
759 struct pci_dev *pdev, *pbridge;
760
761 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
762 return false;
763
764 pdev = to_pci_dev(dev);
765 pbridge = to_pci_dev(bridge);
766
767 if (pbridge->subordinate &&
768 pbridge->subordinate->number <= pdev->bus->number &&
769 pbridge->subordinate->busn_res.end >= pdev->bus->number)
770 return true;
771
772 return false;
773}
774
775static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
776{
777 struct dmar_drhd_unit *drhd;
778 u32 vtbar;
779 int rc;
780
781 /* We know that this device on this chipset has its own IOMMU.
782 * If we find it under a different IOMMU, then the BIOS is lying
783 * to us. Hope that the IOMMU for this device is actually
784 * disabled, and it needs no translation...
785 */
786 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
787 if (rc) {
788 /* "can't" happen */
789 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
790 return false;
791 }
792 vtbar &= 0xffff0000;
793
794 /* we know that the this iommu should be at offset 0xa000 from vtbar */
795 drhd = dmar_find_matched_drhd_unit(pdev);
796 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
797 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
798 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
799 return true;
800 }
801
802 return false;
803}
804
805static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
806{
807 if (!iommu || iommu->drhd->ignored)
808 return true;
809
810 if (dev_is_pci(dev)) {
811 struct pci_dev *pdev = to_pci_dev(dev);
812
813 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
814 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
815 quirk_ioat_snb_local_iommu(pdev))
816 return true;
817 }
818
819 return false;
820}
821
822struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
823{
824 struct dmar_drhd_unit *drhd = NULL;
825 struct pci_dev *pdev = NULL;
826 struct intel_iommu *iommu;
827 struct device *tmp;
828 u16 segment = 0;
829 int i;
830
831 if (!dev)
832 return NULL;
833
834 if (dev_is_pci(dev)) {
835 struct pci_dev *pf_pdev;
836
837 pdev = pci_real_dma_dev(to_pci_dev(dev));
838
839 /* VFs aren't listed in scope tables; we need to look up
840 * the PF instead to find the IOMMU. */
841 pf_pdev = pci_physfn(pdev);
842 dev = &pf_pdev->dev;
843 segment = pci_domain_nr(pdev->bus);
844 } else if (has_acpi_companion(dev))
845 dev = &ACPI_COMPANION(dev)->dev;
846
847 rcu_read_lock();
848 for_each_iommu(iommu, drhd) {
849 if (pdev && segment != drhd->segment)
850 continue;
851
852 for_each_active_dev_scope(drhd->devices,
853 drhd->devices_cnt, i, tmp) {
854 if (tmp == dev) {
855 /* For a VF use its original BDF# not that of the PF
856 * which we used for the IOMMU lookup. Strictly speaking
857 * we could do this for all PCI devices; we only need to
858 * get the BDF# from the scope table for ACPI matches. */
859 if (pdev && pdev->is_virtfn)
860 goto got_pdev;
861
862 if (bus && devfn) {
863 *bus = drhd->devices[i].bus;
864 *devfn = drhd->devices[i].devfn;
865 }
866 goto out;
867 }
868
869 if (is_downstream_to_pci_bridge(dev, tmp))
870 goto got_pdev;
871 }
872
873 if (pdev && drhd->include_all) {
874 got_pdev:
875 if (bus && devfn) {
876 *bus = pdev->bus->number;
877 *devfn = pdev->devfn;
878 }
879 goto out;
880 }
881 }
882 iommu = NULL;
883 out:
884 if (iommu_is_dummy(iommu, dev))
885 iommu = NULL;
886
887 rcu_read_unlock();
888
889 return iommu;
890}
891
892static void domain_flush_cache(struct dmar_domain *domain,
893 void *addr, int size)
894{
895 if (!domain->iommu_coherency)
896 clflush_cache_range(addr, size);
897}
898
899static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
900{
901 struct context_entry *context;
902 int ret = 0;
903 unsigned long flags;
904
905 spin_lock_irqsave(&iommu->lock, flags);
906 context = iommu_context_addr(iommu, bus, devfn, 0);
907 if (context)
908 ret = context_present(context);
909 spin_unlock_irqrestore(&iommu->lock, flags);
910 return ret;
911}
912
913static void free_context_table(struct intel_iommu *iommu)
914{
915 int i;
916 unsigned long flags;
917 struct context_entry *context;
918
919 spin_lock_irqsave(&iommu->lock, flags);
920	if (!iommu->root_entry)
921		goto out;
923 for (i = 0; i < ROOT_ENTRY_NR; i++) {
924 context = iommu_context_addr(iommu, i, 0, 0);
925 if (context)
926 free_pgtable_page(context);
927
928 if (!sm_supported(iommu))
929 continue;
930
931 context = iommu_context_addr(iommu, i, 0x80, 0);
932 if (context)
933 free_pgtable_page(context);
934
935 }
936 free_pgtable_page(iommu->root_entry);
937 iommu->root_entry = NULL;
938out:
939 spin_unlock_irqrestore(&iommu->lock, flags);
940}
941
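/*
 * Return the PTE for @pfn in @domain's page table.  If *@target_level is
 * non-zero, missing intermediate tables are allocated and the walk stops
 * at that level (for example, a caller that wants a 2MiB superpage slot
 * passes *target_level == 2).  If *@target_level is zero, nothing is
 * allocated; the walk stops at the first superpage or non-present entry
 * (or at the leaf level) and the level reached is reported back through
 * @target_level.
 */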
942static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
943 unsigned long pfn, int *target_level)
944{
945 struct dma_pte *parent, *pte;
946 int level = agaw_to_level(domain->agaw);
947 int offset;
948
949 BUG_ON(!domain->pgd);
950
951 if (!domain_pfn_supported(domain, pfn))
952 /* Address beyond IOMMU's addressing capabilities. */
953 return NULL;
954
955 parent = domain->pgd;
956
957 while (1) {
958 void *tmp_page;
959
960 offset = pfn_level_offset(pfn, level);
961 pte = &parent[offset];
962 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
963 break;
964 if (level == *target_level)
965 break;
966
967 if (!dma_pte_present(pte)) {
968 uint64_t pteval;
969
970 tmp_page = alloc_pgtable_page(domain->nid);
971
972 if (!tmp_page)
973 return NULL;
974
975 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
976 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
977 if (domain_use_first_level(domain))
978 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US;
979 if (cmpxchg64(&pte->val, 0ULL, pteval))
980 /* Someone else set it while we were thinking; use theirs. */
981 free_pgtable_page(tmp_page);
982 else
983 domain_flush_cache(domain, pte, sizeof(*pte));
984 }
985 if (level == 1)
986 break;
987
988 parent = phys_to_virt(dma_pte_addr(pte));
989 level--;
990 }
991
992 if (!*target_level)
993 *target_level = level;
994
995 return pte;
996}
997
998/* Return the PTE for the given address at the specified level */
999static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1000 unsigned long pfn,
1001 int level, int *large_page)
1002{
1003 struct dma_pte *parent, *pte;
1004 int total = agaw_to_level(domain->agaw);
1005 int offset;
1006
1007 parent = domain->pgd;
1008 while (level <= total) {
1009 offset = pfn_level_offset(pfn, total);
1010 pte = &parent[offset];
1011 if (level == total)
1012 return pte;
1013
1014 if (!dma_pte_present(pte)) {
1015 *large_page = total;
1016 break;
1017 }
1018
1019 if (dma_pte_superpage(pte)) {
1020 *large_page = total;
1021 return pte;
1022 }
1023
1024 parent = phys_to_virt(dma_pte_addr(pte));
1025 total--;
1026 }
1027 return NULL;
1028}
1029
1030/* Clear last-level PTEs; an IOTLB flush must follow */
1031static void dma_pte_clear_range(struct dmar_domain *domain,
1032 unsigned long start_pfn,
1033 unsigned long last_pfn)
1034{
1035 unsigned int large_page;
1036 struct dma_pte *first_pte, *pte;
1037
1038 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1039 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1040 BUG_ON(start_pfn > last_pfn);
1041
1042 /* we don't need lock here; nobody else touches the iova range */
1043 do {
1044 large_page = 1;
1045 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1046 if (!pte) {
1047 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1048 continue;
1049 }
1050 do {
1051 dma_clear_pte(pte);
1052 start_pfn += lvl_to_nr_pages(large_page);
1053 pte++;
1054 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1055
1056 domain_flush_cache(domain, first_pte,
1057 (void *)pte - (void *)first_pte);
1058
1059 } while (start_pfn && start_pfn <= last_pfn);
1060}
1061
1062static void dma_pte_free_level(struct dmar_domain *domain, int level,
1063 int retain_level, struct dma_pte *pte,
1064 unsigned long pfn, unsigned long start_pfn,
1065 unsigned long last_pfn)
1066{
1067 pfn = max(start_pfn, pfn);
1068 pte = &pte[pfn_level_offset(pfn, level)];
1069
1070 do {
1071 unsigned long level_pfn;
1072 struct dma_pte *level_pte;
1073
1074 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1075 goto next;
1076
1077 level_pfn = pfn & level_mask(level);
1078 level_pte = phys_to_virt(dma_pte_addr(pte));
1079
1080 if (level > 2) {
1081 dma_pte_free_level(domain, level - 1, retain_level,
1082 level_pte, level_pfn, start_pfn,
1083 last_pfn);
1084 }
1085
1086 /*
1087 * Free the page table if we're below the level we want to
1088 * retain and the range covers the entire table.
1089 */
1090 if (level < retain_level && !(start_pfn > level_pfn ||
1091 last_pfn < level_pfn + level_size(level) - 1)) {
1092 dma_clear_pte(pte);
1093 domain_flush_cache(domain, pte, sizeof(*pte));
1094 free_pgtable_page(level_pte);
1095 }
1096next:
1097 pfn += level_size(level);
1098 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1099}
1100
1101/*
1102 * clear last level (leaf) ptes and free page table pages below the
1103 * level we wish to keep intact.
1104 */
1105static void dma_pte_free_pagetable(struct dmar_domain *domain,
1106 unsigned long start_pfn,
1107 unsigned long last_pfn,
1108 int retain_level)
1109{
1110 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1111 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1112 BUG_ON(start_pfn > last_pfn);
1113
1114 dma_pte_clear_range(domain, start_pfn, last_pfn);
1115
1116 /* We don't need lock here; nobody else touches the iova range */
1117 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1118 domain->pgd, 0, start_pfn, last_pfn);
1119
1120 /* free pgd */
1121 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1122 free_pgtable_page(domain->pgd);
1123 domain->pgd = NULL;
1124 }
1125}
1126
1127/* When a page at a given level is being unlinked from its parent, we don't
1128 need to *modify* it at all. All we need to do is make a list of all the
1129 pages which can be freed just as soon as we've flushed the IOTLB and we
1130 know the hardware page-walk will no longer touch them.
1131 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1132 be freed. */
1133static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1134 int level, struct dma_pte *pte,
1135 struct page *freelist)
1136{
1137 struct page *pg;
1138
1139 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1140 pg->freelist = freelist;
1141 freelist = pg;
1142
1143 if (level == 1)
1144 return freelist;
1145
1146 pte = page_address(pg);
1147 do {
1148 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1149 freelist = dma_pte_list_pagetables(domain, level - 1,
1150 pte, freelist);
1151 pte++;
1152 } while (!first_pte_in_page(pte));
1153
1154 return freelist;
1155}
1156
1157static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1158 struct dma_pte *pte, unsigned long pfn,
1159 unsigned long start_pfn,
1160 unsigned long last_pfn,
1161 struct page *freelist)
1162{
1163 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1164
1165 pfn = max(start_pfn, pfn);
1166 pte = &pte[pfn_level_offset(pfn, level)];
1167
1168 do {
1169 unsigned long level_pfn;
1170
1171 if (!dma_pte_present(pte))
1172 goto next;
1173
1174 level_pfn = pfn & level_mask(level);
1175
1176 /* If range covers entire pagetable, free it */
1177 if (start_pfn <= level_pfn &&
1178 last_pfn >= level_pfn + level_size(level) - 1) {
1179 /* These subordinate page tables are going away entirely. Don't
1180 bother to clear them; we're just going to *free* them. */
1181 if (level > 1 && !dma_pte_superpage(pte))
1182 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1183
1184 dma_clear_pte(pte);
1185 if (!first_pte)
1186 first_pte = pte;
1187 last_pte = pte;
1188 } else if (level > 1) {
1189 /* Recurse down into a level that isn't *entirely* obsolete */
1190 freelist = dma_pte_clear_level(domain, level - 1,
1191 phys_to_virt(dma_pte_addr(pte)),
1192 level_pfn, start_pfn, last_pfn,
1193 freelist);
1194 }
1195next:
1196 pfn += level_size(level);
1197 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1198
1199 if (first_pte)
1200 domain_flush_cache(domain, first_pte,
1201 (void *)++last_pte - (void *)first_pte);
1202
1203 return freelist;
1204}
1205
1206/* We can't just free the pages because the IOMMU may still be walking
1207 the page tables, and may have cached the intermediate levels. The
1208 pages can only be freed after the IOTLB flush has been done. */
1209static struct page *domain_unmap(struct dmar_domain *domain,
1210 unsigned long start_pfn,
1211 unsigned long last_pfn)
1212{
1213 struct page *freelist;
1214
1215 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1216 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1217 BUG_ON(start_pfn > last_pfn);
1218
1219 /* we don't need lock here; nobody else touches the iova range */
1220 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1221 domain->pgd, 0, start_pfn, last_pfn, NULL);
1222
1223 /* free pgd */
1224 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1225 struct page *pgd_page = virt_to_page(domain->pgd);
1226 pgd_page->freelist = freelist;
1227 freelist = pgd_page;
1228
1229 domain->pgd = NULL;
1230 }
1231
1232 return freelist;
1233}
1234
1235static void dma_free_pagelist(struct page *freelist)
1236{
1237 struct page *pg;
1238
1239 while ((pg = freelist)) {
1240 freelist = pg->freelist;
1241 free_pgtable_page(page_address(pg));
1242 }
1243}
1244
1245static void iova_entry_free(unsigned long data)
1246{
1247 struct page *freelist = (struct page *)data;
1248
1249 dma_free_pagelist(freelist);
1250}
1251
1252/* iommu handling */
1253static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1254{
1255 struct root_entry *root;
1256 unsigned long flags;
1257
1258 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1259 if (!root) {
1260 pr_err("Allocating root entry for %s failed\n",
1261 iommu->name);
1262 return -ENOMEM;
1263 }
1264
1265 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1266
1267 spin_lock_irqsave(&iommu->lock, flags);
1268 iommu->root_entry = root;
1269 spin_unlock_irqrestore(&iommu->lock, flags);
1270
1271 return 0;
1272}
1273
1274static void iommu_set_root_entry(struct intel_iommu *iommu)
1275{
1276 u64 addr;
1277 u32 sts;
1278 unsigned long flag;
1279
1280 addr = virt_to_phys(iommu->root_entry);
1281 if (sm_supported(iommu))
1282 addr |= DMA_RTADDR_SMT;
1283
1284 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1285 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1286
1287 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1288
1289 /* Make sure hardware complete it */
1290 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1291 readl, (sts & DMA_GSTS_RTPS), sts);
1292
1293 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1294}
1295
1296void iommu_flush_write_buffer(struct intel_iommu *iommu)
1297{
1298 u32 val;
1299 unsigned long flag;
1300
1301 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1302 return;
1303
1304 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1305 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1306
1307 /* Make sure hardware complete it */
1308 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309 readl, (!(val & DMA_GSTS_WBFS)), val);
1310
1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312}
1313
1314/* Invalidate context-cache entries using register-based invalidation */
1315static void __iommu_flush_context(struct intel_iommu *iommu,
1316 u16 did, u16 source_id, u8 function_mask,
1317 u64 type)
1318{
1319 u64 val = 0;
1320 unsigned long flag;
1321
1322 switch (type) {
1323 case DMA_CCMD_GLOBAL_INVL:
1324 val = DMA_CCMD_GLOBAL_INVL;
1325 break;
1326 case DMA_CCMD_DOMAIN_INVL:
1327 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1328 break;
1329 case DMA_CCMD_DEVICE_INVL:
1330 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1331 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1332 break;
1333 default:
1334 BUG();
1335 }
1336 val |= DMA_CCMD_ICC;
1337
1338 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1339 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1340
1341 /* Make sure hardware complete it */
1342 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1343 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1344
1345 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1346}
1347
1348/* Invalidate IOTLB entries using register-based invalidation */
1349static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1350 u64 addr, unsigned int size_order, u64 type)
1351{
1352 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1353 u64 val = 0, val_iva = 0;
1354 unsigned long flag;
1355
1356 switch (type) {
1357 case DMA_TLB_GLOBAL_FLUSH:
1358 /* global flush doesn't need to set IVA_REG */
1359 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1360 break;
1361 case DMA_TLB_DSI_FLUSH:
1362 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1363 break;
1364 case DMA_TLB_PSI_FLUSH:
1365 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1366 /* IH bit is passed in as part of address */
1367 val_iva = size_order | addr;
1368 break;
1369 default:
1370 BUG();
1371 }
1372 /* Note: set drain read/write */
1373#if 0
1374 /*
1375 * This is probably only needed to be extra safe. It looks like we
1376 * can ignore it without any impact.
1377 */
1378 if (cap_read_drain(iommu->cap))
1379 val |= DMA_TLB_READ_DRAIN;
1380#endif
1381 if (cap_write_drain(iommu->cap))
1382 val |= DMA_TLB_WRITE_DRAIN;
1383
1384 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1385 /* Note: Only uses first TLB reg currently */
1386 if (val_iva)
1387 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1388 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1389
1390 /* Make sure hardware complete it */
1391 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1392 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1393
1394 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1395
1396 /* check IOTLB invalidation granularity */
1397 if (DMA_TLB_IAIG(val) == 0)
1398 pr_err("Flush IOTLB failed\n");
1399 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1400 pr_debug("TLB flush request %Lx, actual %Lx\n",
1401 (unsigned long long)DMA_TLB_IIRG(type),
1402 (unsigned long long)DMA_TLB_IAIG(val));
1403}
1404
1405static struct device_domain_info *
1406iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1407 u8 bus, u8 devfn)
1408{
1409 struct device_domain_info *info;
1410
1411 assert_spin_locked(&device_domain_lock);
1412
1413 if (!iommu->qi)
1414 return NULL;
1415
1416 list_for_each_entry(info, &domain->devices, link)
1417 if (info->iommu == iommu && info->bus == bus &&
1418 info->devfn == devfn) {
1419 if (info->ats_supported && info->dev)
1420 return info;
1421 break;
1422 }
1423
1424 return NULL;
1425}
1426
1427static void domain_update_iotlb(struct dmar_domain *domain)
1428{
1429 struct device_domain_info *info;
1430 bool has_iotlb_device = false;
1431
1432 assert_spin_locked(&device_domain_lock);
1433
1434 list_for_each_entry(info, &domain->devices, link) {
1435 struct pci_dev *pdev;
1436
1437 if (!info->dev || !dev_is_pci(info->dev))
1438 continue;
1439
1440 pdev = to_pci_dev(info->dev);
1441 if (pdev->ats_enabled) {
1442 has_iotlb_device = true;
1443 break;
1444 }
1445 }
1446
1447 domain->has_iotlb_device = has_iotlb_device;
1448}
1449
1450static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1451{
1452 struct pci_dev *pdev;
1453
1454 assert_spin_locked(&device_domain_lock);
1455
1456 if (!info || !dev_is_pci(info->dev))
1457 return;
1458
1459 pdev = to_pci_dev(info->dev);
1460 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1461 * the PFSID to the invalidation descriptors of a VF so that the IOMMU
1462 * HW can gauge queue depth at the PF level. If DIT is not supported,
1463 * PFSID is treated as reserved and should be set to 0.
1464 */
1465 if (!ecap_dit(info->iommu->ecap))
1466 info->pfsid = 0;
1467 else {
1468 struct pci_dev *pf_pdev;
1469
1470 /* pci_physfn() returns pdev itself if the device is not a VF */
1471 pf_pdev = pci_physfn(pdev);
1472 info->pfsid = pci_dev_id(pf_pdev);
1473 }
1474
1475#ifdef CONFIG_INTEL_IOMMU_SVM
1476 /* The PCIe spec, in its wisdom, declares that the behaviour of
1477 the device if you enable PASID support after ATS support is
1478 undefined. So always enable PASID support on devices which
1479 have it, even if we can't yet know if we're ever going to
1480 use it. */
1481 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1482 info->pasid_enabled = 1;
1483
1484 if (info->pri_supported &&
1485 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1486 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1487 info->pri_enabled = 1;
1488#endif
1489 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1490 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491 info->ats_enabled = 1;
1492 domain_update_iotlb(info->domain);
1493 info->ats_qdep = pci_ats_queue_depth(pdev);
1494 }
1495}
1496
1497static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498{
1499 struct pci_dev *pdev;
1500
1501 assert_spin_locked(&device_domain_lock);
1502
1503 if (!dev_is_pci(info->dev))
1504 return;
1505
1506 pdev = to_pci_dev(info->dev);
1507
1508 if (info->ats_enabled) {
1509 pci_disable_ats(pdev);
1510 info->ats_enabled = 0;
1511 domain_update_iotlb(info->domain);
1512 }
1513#ifdef CONFIG_INTEL_IOMMU_SVM
1514 if (info->pri_enabled) {
1515 pci_disable_pri(pdev);
1516 info->pri_enabled = 0;
1517 }
1518 if (info->pasid_enabled) {
1519 pci_disable_pasid(pdev);
1520 info->pasid_enabled = 0;
1521 }
1522#endif
1523}
1524
1525static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526 u64 addr, unsigned mask)
1527{
1528 u16 sid, qdep;
1529 unsigned long flags;
1530 struct device_domain_info *info;
1531
1532 if (!domain->has_iotlb_device)
1533 return;
1534
1535 spin_lock_irqsave(&device_domain_lock, flags);
1536 list_for_each_entry(info, &domain->devices, link) {
1537 if (!info->ats_enabled)
1538 continue;
1539
1540 sid = info->bus << 8 | info->devfn;
1541 qdep = info->ats_qdep;
1542 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1543 qdep, addr, mask);
1544 }
1545 spin_unlock_irqrestore(&device_domain_lock, flags);
1546}
1547
1548static void domain_flush_piotlb(struct intel_iommu *iommu,
1549 struct dmar_domain *domain,
1550 u64 addr, unsigned long npages, bool ih)
1551{
1552 u16 did = domain->iommu_did[iommu->seq_id];
1553
1554 if (domain->default_pasid)
1555 qi_flush_piotlb(iommu, did, domain->default_pasid,
1556 addr, npages, ih);
1557
1558 if (!list_empty(&domain->devices))
1559 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1560}
1561
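/*
 * Page-selective IOTLB flush for a pfn range.  The range is rounded up
 * to a power-of-two number of pages: for example a 5-page request is
 * flushed as an 8-page aligned region (mask = 3).  Over-invalidation is
 * harmless; under-invalidation is not.
 */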
1562static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1563 struct dmar_domain *domain,
1564 unsigned long pfn, unsigned int pages,
1565 int ih, int map)
1566{
1567 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1568 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1569 u16 did = domain->iommu_did[iommu->seq_id];
1570
1571 BUG_ON(pages == 0);
1572
1573 if (ih)
1574 ih = 1 << 6;
1575
1576 if (domain_use_first_level(domain)) {
1577 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1578 } else {
1579 /*
1580 * Fallback to domain selective flush if no PSI support or
1581 * the size is too big. PSI requires page size to be 2 ^ x,
1582 * and the base address is naturally aligned to the size.
1583 */
1584 if (!cap_pgsel_inv(iommu->cap) ||
1585 mask > cap_max_amask_val(iommu->cap))
1586 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1587 DMA_TLB_DSI_FLUSH);
1588 else
1589 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1590 DMA_TLB_PSI_FLUSH);
1591 }
1592
1593 /*
1594 * In caching mode, changes of pages from non-present to present require
1595 * a flush. However, the device IOTLB doesn't need to be flushed here.
1596 */
1597 if (!cap_caching_mode(iommu->cap) || !map)
1598 iommu_flush_dev_iotlb(domain, addr, mask);
1599}
1600
1601/* Notification for newly created mappings */
1602static inline void __mapping_notify_one(struct intel_iommu *iommu,
1603 struct dmar_domain *domain,
1604 unsigned long pfn, unsigned int pages)
1605{
1606 /*
1607 * It's a non-present to present mapping. Only flush if caching mode
1608 * is enabled and the domain uses second-level translation.
1609 */
1610 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1611 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1612 else
1613 iommu_flush_write_buffer(iommu);
1614}
1615
1616static void iommu_flush_iova(struct iova_domain *iovad)
1617{
1618 struct dmar_domain *domain;
1619 int idx;
1620
1621 domain = container_of(iovad, struct dmar_domain, iovad);
1622
1623 for_each_domain_iommu(idx, domain) {
1624 struct intel_iommu *iommu = g_iommus[idx];
1625 u16 did = domain->iommu_did[iommu->seq_id];
1626
1627 if (domain_use_first_level(domain))
1628 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1629 else
1630 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1631 DMA_TLB_DSI_FLUSH);
1632
1633 if (!cap_caching_mode(iommu->cap))
1634 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1635 0, MAX_AGAW_PFN_WIDTH);
1636 }
1637}
1638
1639static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1640{
1641 u32 pmen;
1642 unsigned long flags;
1643
1644 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1645 return;
1646
1647 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1648 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1649 pmen &= ~DMA_PMEN_EPM;
1650 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1651
1652 /* wait for the protected region status bit to clear */
1653 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1654 readl, !(pmen & DMA_PMEN_PRS), pmen);
1655
1656 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1657}
1658
1659static void iommu_enable_translation(struct intel_iommu *iommu)
1660{
1661 u32 sts;
1662 unsigned long flags;
1663
1664 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1665 iommu->gcmd |= DMA_GCMD_TE;
1666 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1667
1668 /* Make sure hardware complete it */
1669 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1670 readl, (sts & DMA_GSTS_TES), sts);
1671
1672 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1673}
1674
1675static void iommu_disable_translation(struct intel_iommu *iommu)
1676{
1677 u32 sts;
1678 unsigned long flag;
1679
1680 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1681 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1682 return;
1683
1684 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1685 iommu->gcmd &= ~DMA_GCMD_TE;
1686 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1687
1688 /* Make sure hardware complete it */
1689 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1690 readl, (!(sts & DMA_GSTS_TES)), sts);
1691
1692 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1693}
1694
1695static int iommu_init_domains(struct intel_iommu *iommu)
1696{
1697 u32 ndomains, nlongs;
1698 size_t size;
1699
1700 ndomains = cap_ndoms(iommu->cap);
1701 pr_debug("%s: Number of Domains supported <%d>\n",
1702 iommu->name, ndomains);
1703 nlongs = BITS_TO_LONGS(ndomains);
1704
1705 spin_lock_init(&iommu->lock);
1706
1707 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1708 if (!iommu->domain_ids) {
1709 pr_err("%s: Allocating domain id array failed\n",
1710 iommu->name);
1711 return -ENOMEM;
1712 }
1713
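	/*
	 * iommu->domains is a two-level array: one slot per group of 256
	 * domain IDs, each pointing (when populated) at an array of 256
	 * struct dmar_domain pointers.  Only group 0 is allocated here;
	 * the remaining groups are expected to be populated on demand.
	 */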
1714 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1715 iommu->domains = kzalloc(size, GFP_KERNEL);
1716
1717 if (iommu->domains) {
1718 size = 256 * sizeof(struct dmar_domain *);
1719 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1720 }
1721
1722 if (!iommu->domains || !iommu->domains[0]) {
1723 pr_err("%s: Allocating domain array failed\n",
1724 iommu->name);
1725 kfree(iommu->domain_ids);
1726 kfree(iommu->domains);
1727 iommu->domain_ids = NULL;
1728 iommu->domains = NULL;
1729 return -ENOMEM;
1730 }
1731
1732 /*
1733 * If Caching mode is set, then invalid translations are tagged
1734 * with domain-id 0, hence we need to pre-allocate it. We also
1735 * use domain-id 0 as a marker for non-allocated domain-id, so
1736 * make sure it is not used for a real domain.
1737 */
1738 set_bit(0, iommu->domain_ids);
1739
1740 /*
1741 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1742 * entry for first-level or pass-through translation modes should
1743 * be programmed with a domain id different from those used for
1744 * second-level or nested translation. We reserve a domain id for
1745 * this purpose.
1746 */
1747 if (sm_supported(iommu))
1748 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1749
1750 return 0;
1751}
1752
1753static void disable_dmar_iommu(struct intel_iommu *iommu)
1754{
1755 struct device_domain_info *info, *tmp;
1756 unsigned long flags;
1757
1758 if (!iommu->domains || !iommu->domain_ids)
1759 return;
1760
1761 spin_lock_irqsave(&device_domain_lock, flags);
1762 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1763 if (info->iommu != iommu)
1764 continue;
1765
1766 if (!info->dev || !info->domain)
1767 continue;
1768
1769 __dmar_remove_one_dev_info(info);
1770 }
1771 spin_unlock_irqrestore(&device_domain_lock, flags);
1772
1773 if (iommu->gcmd & DMA_GCMD_TE)
1774 iommu_disable_translation(iommu);
1775}
1776
1777static void free_dmar_iommu(struct intel_iommu *iommu)
1778{
1779 if ((iommu->domains) && (iommu->domain_ids)) {
1780 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1781 int i;
1782
1783 for (i = 0; i < elems; i++)
1784 kfree(iommu->domains[i]);
1785 kfree(iommu->domains);
1786 kfree(iommu->domain_ids);
1787 iommu->domains = NULL;
1788 iommu->domain_ids = NULL;
1789 }
1790
1791 g_iommus[iommu->seq_id] = NULL;
1792
1793 /* free context mapping */
1794 free_context_table(iommu);
1795
1796#ifdef CONFIG_INTEL_IOMMU_SVM
1797 if (pasid_supported(iommu)) {
1798 if (ecap_prs(iommu->ecap))
1799 intel_svm_finish_prq(iommu);
1800 }
1801 if (ecap_vcs(iommu->ecap) && vccap_pasid(iommu->vccap))
1802 ioasid_unregister_allocator(&iommu->pasid_allocator);
1803
1804#endif
1805}
1806
1807/*
1808 * Check and return whether first level is used by default for
1809 * DMA translation.
1810 */
1811static bool first_level_by_default(void)
1812{
1813 struct dmar_drhd_unit *drhd;
1814 struct intel_iommu *iommu;
1815 static int first_level_support = -1;
1816
1817 if (likely(first_level_support != -1))
1818 return first_level_support;
1819
1820 first_level_support = 1;
1821
1822 rcu_read_lock();
1823 for_each_active_iommu(iommu, drhd) {
1824 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1825 first_level_support = 0;
1826 break;
1827 }
1828 }
1829 rcu_read_unlock();
1830
1831 return first_level_support;
1832}
1833
1834static struct dmar_domain *alloc_domain(int flags)
1835{
1836 struct dmar_domain *domain;
1837
1838 domain = alloc_domain_mem();
1839 if (!domain)
1840 return NULL;
1841
1842 memset(domain, 0, sizeof(*domain));
1843 domain->nid = NUMA_NO_NODE;
1844 domain->flags = flags;
1845 if (first_level_by_default())
1846 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1847 domain->has_iotlb_device = false;
1848 INIT_LIST_HEAD(&domain->devices);
1849
1850 return domain;
1851}
1852
1853/* Must be called with iommu->lock */
1854static int domain_attach_iommu(struct dmar_domain *domain,
1855 struct intel_iommu *iommu)
1856{
1857 unsigned long ndomains;
1858 int num;
1859
1860 assert_spin_locked(&device_domain_lock);
1861 assert_spin_locked(&iommu->lock);
1862
1863 domain->iommu_refcnt[iommu->seq_id] += 1;
1864 domain->iommu_count += 1;
1865 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1866 ndomains = cap_ndoms(iommu->cap);
1867 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1868
1869 if (num >= ndomains) {
1870 pr_err("%s: No free domain ids\n", iommu->name);
1871 domain->iommu_refcnt[iommu->seq_id] -= 1;
1872 domain->iommu_count -= 1;
1873 return -ENOSPC;
1874 }
1875
1876 set_bit(num, iommu->domain_ids);
1877 set_iommu_domain(iommu, num, domain);
1878
1879 domain->iommu_did[iommu->seq_id] = num;
1880 domain->nid = iommu->node;
1881
1882 domain_update_iommu_cap(domain);
1883 }
1884
1885 return 0;
1886}
1887
1888static int domain_detach_iommu(struct dmar_domain *domain,
1889 struct intel_iommu *iommu)
1890{
1891 int num, count;
1892
1893 assert_spin_locked(&device_domain_lock);
1894 assert_spin_locked(&iommu->lock);
1895
1896 domain->iommu_refcnt[iommu->seq_id] -= 1;
1897 count = --domain->iommu_count;
1898 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1899 num = domain->iommu_did[iommu->seq_id];
1900 clear_bit(num, iommu->domain_ids);
1901 set_iommu_domain(iommu, num, NULL);
1902
1903 domain_update_iommu_cap(domain);
1904 domain->iommu_did[iommu->seq_id] = 0;
1905 }
1906
1907 return count;
1908}
1909
1910static struct iova_domain reserved_iova_list;
1911static struct lock_class_key reserved_rbtree_key;
1912
1913static int dmar_init_reserved_ranges(void)
1914{
1915 struct pci_dev *pdev = NULL;
1916 struct iova *iova;
1917 int i;
1918
1919 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1920
1921 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1922 &reserved_rbtree_key);
1923
1924 /* IOAPIC ranges shouldn't be accessed by DMA */
1925 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1926 IOVA_PFN(IOAPIC_RANGE_END));
1927 if (!iova) {
1928 pr_err("Reserve IOAPIC range failed\n");
1929 return -ENODEV;
1930 }
1931
1932 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1933 for_each_pci_dev(pdev) {
1934 struct resource *r;
1935
1936 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1937 r = &pdev->resource[i];
1938 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1939 continue;
1940 iova = reserve_iova(&reserved_iova_list,
1941 IOVA_PFN(r->start),
1942 IOVA_PFN(r->end));
1943 if (!iova) {
1944 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1945 return -ENODEV;
1946 }
1947 }
1948 }
1949 return 0;
1950}
1951
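/*
 * Round a guest address width up to the next AGAW-compatible width,
 * i.e. 12 plus a multiple of 9 bits, capped at 64.  For example a
 * 40-bit guest width becomes 48, while 39 and 48 are returned as-is.
 */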
1952static inline int guestwidth_to_adjustwidth(int gaw)
1953{
1954 int agaw;
1955 int r = (gaw - 12) % 9;
1956
1957 if (r == 0)
1958 agaw = gaw;
1959 else
1960 agaw = gaw + 9 - r;
1961 if (agaw > 64)
1962 agaw = 64;
1963 return agaw;
1964}
1965
1966static void domain_exit(struct dmar_domain *domain)
1967{
1968
1969 /* Remove associated devices and clear attached or cached domains */
1970 domain_remove_dev_info(domain);
1971
1972 /* destroy iovas */
1973 if (domain->domain.type == IOMMU_DOMAIN_DMA)
1974 put_iova_domain(&domain->iovad);
1975
1976 if (domain->pgd) {
1977 struct page *freelist;
1978
1979 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1980 dma_free_pagelist(freelist);
1981 }
1982
1983 free_domain_mem(domain);
1984}
1985
1986/*
1987 * Get the PASID directory size for a scalable mode context entry.
1988 * A value of X in the PDTS field of a scalable mode context entry
1989 * indicates a PASID directory with 2^(X + 7) entries.
1990 */
1991static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1992{
1993 int pds, max_pde;
1994
1995 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1996 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1997 if (pds < 7)
1998 return 0;
1999
2000 return pds - 7;
2001}
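/*
 * For example, assuming PASID_PDE_SHIFT is 6 (64 PASID-table entries per
 * directory entry), a 20-bit PASID space gives max_pde = 1 << 14, so the
 * function returns 14 - 7 = 7 and the PDTS field encodes a directory of
 * 2^(7 + 7) = 16384 entries.
 */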
2002
2003/*
2004 * Set the RID_PASID field of a scalable mode context entry. The
2005 * IOMMU hardware will use the PASID value set in this field for
2006 * DMA translations of DMA requests without PASID.
2007 */
2008static inline void
2009context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2010{
2011 context->hi |= pasid & ((1 << 20) - 1);
2012}
2013
2014/*
2015 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2016 * entry.
2017 */
2018static inline void context_set_sm_dte(struct context_entry *context)
2019{
2020 context->lo |= (1 << 2);
2021}
2022
2023/*
2024 * Set the PRE(Page Request Enable) field of a scalable mode context
2025 * entry.
2026 */
2027static inline void context_set_sm_pre(struct context_entry *context)
2028{
2029 context->lo |= (1 << 4);
2030}
2031
2032/* Convert value to context PASID directory size field coding. */
2033#define context_pdts(pds) (((pds) & 0x7) << 9)
2034
2035static int domain_context_mapping_one(struct dmar_domain *domain,
2036 struct intel_iommu *iommu,
2037 struct pasid_table *table,
2038 u8 bus, u8 devfn)
2039{
2040 u16 did = domain->iommu_did[iommu->seq_id];
2041 int translation = CONTEXT_TT_MULTI_LEVEL;
2042 struct device_domain_info *info = NULL;
2043 struct context_entry *context;
2044 unsigned long flags;
2045 int ret;
2046
2047 WARN_ON(did == 0);
2048
2049 if (hw_pass_through && domain_type_is_si(domain))
2050 translation = CONTEXT_TT_PASS_THROUGH;
2051
2052 pr_debug("Set context mapping for %02x:%02x.%d\n",
2053 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2054
2055 BUG_ON(!domain->pgd);
2056
2057 spin_lock_irqsave(&device_domain_lock, flags);
2058 spin_lock(&iommu->lock);
2059
2060 ret = -ENOMEM;
2061 context = iommu_context_addr(iommu, bus, devfn, 1);
2062 if (!context)
2063 goto out_unlock;
2064
2065 ret = 0;
2066 if (context_present(context))
2067 goto out_unlock;
2068
2069 /*
2070 * For kdump cases, old valid entries may be cached due to the
2071 * in-flight DMA and copied pgtable, but there is no unmapping
2072 * behaviour for them, thus we need an explicit cache flush for
2073 * the newly-mapped device. For kdump, at this point, the device
2074 * is supposed to finish reset at its driver probe stage, so no
2075 * in-flight DMA will exist, and we don't need to worry anymore
2076 * in-flight DMA will exist, and we don't need to worry about it
2077 * hereafter.
2078 if (context_copied(context)) {
2079 u16 did_old = context_domain_id(context);
2080
2081 if (did_old < cap_ndoms(iommu->cap)) {
2082 iommu->flush.flush_context(iommu, did_old,
2083 (((u16)bus) << 8) | devfn,
2084 DMA_CCMD_MASK_NOBIT,
2085 DMA_CCMD_DEVICE_INVL);
2086 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2087 DMA_TLB_DSI_FLUSH);
2088 }
2089 }
2090
2091 context_clear_entry(context);
2092
2093 if (sm_supported(iommu)) {
2094 unsigned long pds;
2095
2096 WARN_ON(!table);
2097
2098 /* Setup the PASID DIR pointer: */
2099 pds = context_get_sm_pds(table);
2100 context->lo = (u64)virt_to_phys(table->table) |
2101 context_pdts(pds);
2102
2103 /* Setup the RID_PASID field: */
2104 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2105
2106 /*
2107 * Setup the Device-TLB enable bit and Page request
2108 * Enable bit:
2109 */
2110 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2111 if (info && info->ats_supported)
2112 context_set_sm_dte(context);
2113 if (info && info->pri_supported)
2114 context_set_sm_pre(context);
2115 } else {
2116 struct dma_pte *pgd = domain->pgd;
2117 int agaw;
2118
2119 context_set_domain_id(context, did);
2120
2121 if (translation != CONTEXT_TT_PASS_THROUGH) {
2122 /*
2123 * Skip top levels of page tables for iommu which has
2124 * less agaw than default. Unnecessary for PT mode.
2125 */
2126 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2127 ret = -ENOMEM;
2128 pgd = phys_to_virt(dma_pte_addr(pgd));
2129 if (!dma_pte_present(pgd))
2130 goto out_unlock;
2131 }
2132
2133 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2134 if (info && info->ats_supported)
2135 translation = CONTEXT_TT_DEV_IOTLB;
2136 else
2137 translation = CONTEXT_TT_MULTI_LEVEL;
2138
2139 context_set_address_root(context, virt_to_phys(pgd));
2140 context_set_address_width(context, agaw);
2141 } else {
2142 /*
2143 * In pass through mode, AW must be programmed to
2144 * indicate the largest AGAW value supported by
2145 * hardware. And ASR is ignored by hardware.
2146 */
2147 context_set_address_width(context, iommu->msagaw);
2148 }
2149
2150 context_set_translation_type(context, translation);
2151 }
2152
2153 context_set_fault_enable(context);
2154 context_set_present(context);
2155 if (!ecap_coherent(iommu->ecap))
2156 clflush_cache_range(context, sizeof(*context));
2157
2158 /*
2159 * It's a non-present to present mapping. If hardware doesn't cache
2160 * non-present entries, we only need to flush the write-buffer. If it
2161 * _does_ cache non-present entries, then it does so in the special
2162 * domain #0, which we have to flush:
2163 */
2164 if (cap_caching_mode(iommu->cap)) {
2165 iommu->flush.flush_context(iommu, 0,
2166 (((u16)bus) << 8) | devfn,
2167 DMA_CCMD_MASK_NOBIT,
2168 DMA_CCMD_DEVICE_INVL);
2169 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2170 } else {
2171 iommu_flush_write_buffer(iommu);
2172 }
2173 iommu_enable_dev_iotlb(info);
2174
2175 ret = 0;
2176
2177out_unlock:
2178 spin_unlock(&iommu->lock);
2179 spin_unlock_irqrestore(&device_domain_lock, flags);
2180
2181 return ret;
2182}
2183
2184struct domain_context_mapping_data {
2185 struct dmar_domain *domain;
2186 struct intel_iommu *iommu;
2187 struct pasid_table *table;
2188};
2189
2190static int domain_context_mapping_cb(struct pci_dev *pdev,
2191 u16 alias, void *opaque)
2192{
2193 struct domain_context_mapping_data *data = opaque;
2194
2195 return domain_context_mapping_one(data->domain, data->iommu,
2196 data->table, PCI_BUS_NUM(alias),
2197 alias & 0xff);
2198}
2199
2200static int
2201domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2202{
2203 struct domain_context_mapping_data data;
2204 struct pasid_table *table;
2205 struct intel_iommu *iommu;
2206 u8 bus, devfn;
2207
2208 iommu = device_to_iommu(dev, &bus, &devfn);
2209 if (!iommu)
2210 return -ENODEV;
2211
2212 table = intel_pasid_get_table(dev);
2213
2214 if (!dev_is_pci(dev))
2215 return domain_context_mapping_one(domain, iommu, table,
2216 bus, devfn);
2217
2218 data.domain = domain;
2219 data.iommu = iommu;
2220 data.table = table;
2221
2222 return pci_for_each_dma_alias(to_pci_dev(dev),
2223 &domain_context_mapping_cb, &data);
2224}
2225
2226static int domain_context_mapped_cb(struct pci_dev *pdev,
2227 u16 alias, void *opaque)
2228{
2229 struct intel_iommu *iommu = opaque;
2230
2231 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2232}
2233
2234static int domain_context_mapped(struct device *dev)
2235{
2236 struct intel_iommu *iommu;
2237 u8 bus, devfn;
2238
2239 iommu = device_to_iommu(dev, &bus, &devfn);
2240 if (!iommu)
2241 return -ENODEV;
2242
2243 if (!dev_is_pci(dev))
2244 return device_context_mapped(iommu, bus, devfn);
2245
2246 return !pci_for_each_dma_alias(to_pci_dev(dev),
2247 domain_context_mapped_cb, iommu);
2248}
2249
2250/* Returns the number of VT-d pages, but aligned to the MM page size */
2251static inline unsigned long aligned_nrpages(unsigned long host_addr,
2252 size_t size)
2253{
2254 host_addr &= ~PAGE_MASK;
2255 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2256}
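/*
 * For example, with 4KiB MM and VT-d pages, a 100-byte buffer starting
 * at offset 0xff0 within a page crosses into the next MM page and so
 * yields two VT-d pages.
 */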
2257
2258/* Return largest possible superpage level for a given mapping */
2259static inline int hardware_largepage_caps(struct dmar_domain *domain,
2260 unsigned long iov_pfn,
2261 unsigned long phy_pfn,
2262 unsigned long pages)
2263{
2264 int support, level = 1;
2265 unsigned long pfnmerge;
2266
2267 support = domain->iommu_superpage;
2268
2269 /* To use a large page, the virtual *and* physical addresses
2270 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2271 of them will mean we have to use smaller pages. So just
2272 merge them and check both at once. */
2273 pfnmerge = iov_pfn | phy_pfn;
2274
2275 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2276 pages >>= VTD_STRIDE_SHIFT;
2277 if (!pages)
2278 break;
2279 pfnmerge >>= VTD_STRIDE_SHIFT;
2280 level++;
2281 support--;
2282 }
2283 return level;
2284}
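/*
 * For example, assuming VTD_STRIDE_SHIFT is 9 (512 entries per level),
 * a mapping whose IOVA and physical PFNs are both 512-PFN aligned and
 * which covers at least 512 pages can use a level-2 (2MiB) superpage,
 * provided domain->iommu_superpage >= 1.
 */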
2285
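/*
 * Install PTEs for a run of pages described either by a scatterlist
 * (@sg) or by a contiguous physical range starting at @phys_pfn, using
 * superpages where alignment and size allow.  The caller is responsible
 * for the subsequent IOTLB or write-buffer flush.
 */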
2286static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2287 struct scatterlist *sg, unsigned long phys_pfn,
2288 unsigned long nr_pages, int prot)
2289{
2290 struct dma_pte *first_pte = NULL, *pte = NULL;
2291 phys_addr_t pteval;
2292 unsigned long sg_res = 0;
2293 unsigned int largepage_lvl = 0;
2294 unsigned long lvl_pages = 0;
2295 u64 attr;
2296
2297 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2298
2299 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2300 return -EINVAL;
2301
2302 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2303 if (domain_use_first_level(domain))
2304 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD | DMA_FL_PTE_US;
2305
2306 if (!sg) {
2307 sg_res = nr_pages;
2308 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2309 }
2310
2311 while (nr_pages > 0) {
2312 uint64_t tmp;
2313
2314 if (!sg_res) {
2315 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2316
2317 sg_res = aligned_nrpages(sg->offset, sg->length);
2318 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2319 sg->dma_length = sg->length;
2320 pteval = (sg_phys(sg) - pgoff) | attr;
2321 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2322 }
2323
2324 if (!pte) {
2325 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2326
2327 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2328 if (!pte)
2329 return -ENOMEM;
2330 /* It is a large page */
2331 if (largepage_lvl > 1) {
2332 unsigned long nr_superpages, end_pfn;
2333
2334 pteval |= DMA_PTE_LARGE_PAGE;
2335 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2336
2337 nr_superpages = sg_res / lvl_pages;
2338 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2339
2340 /*
2341 * Ensure that old small page tables are
2342 * removed to make room for superpage(s).
2343 * We're adding new large pages, so make sure
2344 * we don't remove their parent tables.
2345 */
2346 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2347 largepage_lvl + 1);
2348 } else {
2349 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2350 }
2351
2352 }
2353 /* We don't need lock here, nobody else
2354 * touches the iova range
2355 */
2356 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2357 if (tmp) {
2358 static int dumps = 5;
2359 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2360 iov_pfn, tmp, (unsigned long long)pteval);
2361 if (dumps) {
2362 dumps--;
2363 debug_dma_dump_mappings(NULL);
2364 }
2365 WARN_ON(1);
2366 }
2367
2368 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2369
2370 BUG_ON(nr_pages < lvl_pages);
2371 BUG_ON(sg_res < lvl_pages);
2372
2373 nr_pages -= lvl_pages;
2374 iov_pfn += lvl_pages;
2375 phys_pfn += lvl_pages;
2376 pteval += lvl_pages * VTD_PAGE_SIZE;
2377 sg_res -= lvl_pages;
2378
2379 /* If the next PTE would be the first in a new page, then we
2380 need to flush the cache on the entries we've just written.
2381 And then we'll need to recalculate 'pte', so clear it and
2382 let it get set again in the if (!pte) block above.
2383
2384 If we're done (!nr_pages) we need to flush the cache too.
2385
2386 Also if we've been setting superpages, we may need to
2387 recalculate 'pte' and switch back to smaller pages for the
2388 end of the mapping, if the trailing size is not enough to
2389 use another superpage (i.e. sg_res < lvl_pages). */
2390 pte++;
2391 if (!nr_pages || first_pte_in_page(pte) ||
2392 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2393 domain_flush_cache(domain, first_pte,
2394 (void *)pte - (void *)first_pte);
2395 pte = NULL;
2396 }
2397
2398 if (!sg_res && nr_pages)
2399 sg = sg_next(sg);
2400 }
2401 return 0;
2402}
2403
2404static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2405 struct scatterlist *sg, unsigned long phys_pfn,
2406 unsigned long nr_pages, int prot)
2407{
2408 int iommu_id, ret;
2409 struct intel_iommu *iommu;
2410
2411 /* Do the real mapping first */
2412 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2413 if (ret)
2414 return ret;
2415
2416 for_each_domain_iommu(iommu_id, domain) {
2417 iommu = g_iommus[iommu_id];
2418 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2419 }
2420
2421 return 0;
2422}
2423
2424static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425 struct scatterlist *sg, unsigned long nr_pages,
2426 int prot)
2427{
2428 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2429}
2430
2431static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2432 unsigned long phys_pfn, unsigned long nr_pages,
2433 int prot)
2434{
2435 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2436}
2437
2438static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2439{
2440 unsigned long flags;
2441 struct context_entry *context;
2442 u16 did_old;
2443
2444 if (!iommu)
2445 return;
2446
2447 spin_lock_irqsave(&iommu->lock, flags);
2448 context = iommu_context_addr(iommu, bus, devfn, 0);
2449 if (!context) {
2450 spin_unlock_irqrestore(&iommu->lock, flags);
2451 return;
2452 }
2453 did_old = context_domain_id(context);
2454 context_clear_entry(context);
2455 __iommu_flush_cache(iommu, context, sizeof(*context));
2456 spin_unlock_irqrestore(&iommu->lock, flags);
2457 iommu->flush.flush_context(iommu,
2458 did_old,
2459 (((u16)bus) << 8) | devfn,
2460 DMA_CCMD_MASK_NOBIT,
2461 DMA_CCMD_DEVICE_INVL);
2462 iommu->flush.flush_iotlb(iommu,
2463 did_old,
2464 0,
2465 0,
2466 DMA_TLB_DSI_FLUSH);
2467}
2468
2469static inline void unlink_domain_info(struct device_domain_info *info)
2470{
2471 assert_spin_locked(&device_domain_lock);
2472 list_del(&info->link);
2473 list_del(&info->global);
2474 if (info->dev)
2475 dev_iommu_priv_set(info->dev, NULL);
2476}
2477
2478static void domain_remove_dev_info(struct dmar_domain *domain)
2479{
2480 struct device_domain_info *info, *tmp;
2481 unsigned long flags;
2482
2483 spin_lock_irqsave(&device_domain_lock, flags);
2484 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2485 __dmar_remove_one_dev_info(info);
2486 spin_unlock_irqrestore(&device_domain_lock, flags);
2487}
2488
2489struct dmar_domain *find_domain(struct device *dev)
2490{
2491 struct device_domain_info *info;
2492
2493 if (unlikely(attach_deferred(dev)))
2494 return NULL;
2495
2496 /* No lock here, assumes no domain exit in normal case */
2497 info = get_domain_info(dev);
2498 if (likely(info))
2499 return info->domain;
2500
2501 return NULL;
2502}
2503
2504static void do_deferred_attach(struct device *dev)
2505{
2506 struct iommu_domain *domain;
2507
2508 dev_iommu_priv_set(dev, NULL);
2509 domain = iommu_get_domain_for_dev(dev);
2510 if (domain)
2511 intel_iommu_attach_device(domain, dev);
2512}
2513
2514static inline struct device_domain_info *
2515dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2516{
2517 struct device_domain_info *info;
2518
2519 list_for_each_entry(info, &device_domain_list, global)
2520 if (info->segment == segment && info->bus == bus &&
2521 info->devfn == devfn)
2522 return info;
2523
2524 return NULL;
2525}
2526
2527static int domain_setup_first_level(struct intel_iommu *iommu,
2528 struct dmar_domain *domain,
2529 struct device *dev,
2530 int pasid)
2531{
2532 int flags = PASID_FLAG_SUPERVISOR_MODE;
2533 struct dma_pte *pgd = domain->pgd;
2534 int agaw, level;
2535
2536 /*
2537 * Skip top levels of page tables for iommu which has
2538 * less agaw than default. Unnecessary for PT mode.
2539 */
2540 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2541 pgd = phys_to_virt(dma_pte_addr(pgd));
2542 if (!dma_pte_present(pgd))
2543 return -ENOMEM;
2544 }
2545
2546 level = agaw_to_level(agaw);
2547 if (level != 4 && level != 5)
2548 return -EINVAL;
2549
2550 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2551
2552 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2553 domain->iommu_did[iommu->seq_id],
2554 flags);
2555}
2556
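/*
 * A "real DMA subdevice" is a device whose DMA is actually issued with
 * the requester ID of another device, as reported by pci_real_dma_dev().
 */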
2557static bool dev_is_real_dma_subdevice(struct device *dev)
2558{
2559 return dev && dev_is_pci(dev) &&
2560 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2561}
2562
2563static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2564 int bus, int devfn,
2565 struct device *dev,
2566 struct dmar_domain *domain)
2567{
2568 struct dmar_domain *found = NULL;
2569 struct device_domain_info *info;
2570 unsigned long flags;
2571 int ret;
2572
2573 info = alloc_devinfo_mem();
2574 if (!info)
2575 return NULL;
2576
2577 if (!dev_is_real_dma_subdevice(dev)) {
2578 info->bus = bus;
2579 info->devfn = devfn;
2580 info->segment = iommu->segment;
2581 } else {
2582 struct pci_dev *pdev = to_pci_dev(dev);
2583
2584 info->bus = pdev->bus->number;
2585 info->devfn = pdev->devfn;
2586 info->segment = pci_domain_nr(pdev->bus);
2587 }
2588
2589 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2590 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2591 info->ats_qdep = 0;
2592 info->dev = dev;
2593 info->domain = domain;
2594 info->iommu = iommu;
2595 info->pasid_table = NULL;
2596 info->auxd_enabled = 0;
2597 INIT_LIST_HEAD(&info->auxiliary_domains);
2598
2599 if (dev && dev_is_pci(dev)) {
2600 struct pci_dev *pdev = to_pci_dev(info->dev);
2601
2602 if (ecap_dev_iotlb_support(iommu->ecap) &&
2603 pci_ats_supported(pdev) &&
2604 dmar_find_matched_atsr_unit(pdev))
2605 info->ats_supported = 1;
2606
2607 if (sm_supported(iommu)) {
2608 if (pasid_supported(iommu)) {
2609 int features = pci_pasid_features(pdev);
2610 if (features >= 0)
2611 info->pasid_supported = features | 1;
2612 }
2613
2614 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2615 pci_pri_supported(pdev))
2616 info->pri_supported = 1;
2617 }
2618 }
2619
2620 spin_lock_irqsave(&device_domain_lock, flags);
2621 if (dev)
2622 found = find_domain(dev);
2623
2624 if (!found) {
2625 struct device_domain_info *info2;
2626 info2 = dmar_search_domain_by_dev_info(info->segment, info->bus,
2627 info->devfn);
2628 if (info2) {
2629 found = info2->domain;
2630 info2->dev = dev;
2631 }
2632 }
2633
2634 if (found) {
2635 spin_unlock_irqrestore(&device_domain_lock, flags);
2636 free_devinfo_mem(info);
2637 /* Caller must free the original domain */
2638 return found;
2639 }
2640
2641 spin_lock(&iommu->lock);
2642 ret = domain_attach_iommu(domain, iommu);
2643 spin_unlock(&iommu->lock);
2644
2645 if (ret) {
2646 spin_unlock_irqrestore(&device_domain_lock, flags);
2647 free_devinfo_mem(info);
2648 return NULL;
2649 }
2650
2651 list_add(&info->link, &domain->devices);
2652 list_add(&info->global, &device_domain_list);
2653 if (dev)
2654 dev_iommu_priv_set(dev, info);
2655 spin_unlock_irqrestore(&device_domain_lock, flags);
2656
2657 /* PASID table is mandatory for a PCI device in scalable mode. */
2658 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2659 ret = intel_pasid_alloc_table(dev);
2660 if (ret) {
2661 dev_err(dev, "PASID table allocation failed\n");
2662 dmar_remove_one_dev_info(dev);
2663 return NULL;
2664 }
2665
2666 /* Setup the PASID entry for requests without PASID: */
2667 spin_lock_irqsave(&iommu->lock, flags);
2668 if (hw_pass_through && domain_type_is_si(domain))
2669 ret = intel_pasid_setup_pass_through(iommu, domain,
2670 dev, PASID_RID2PASID);
2671 else if (domain_use_first_level(domain))
2672 ret = domain_setup_first_level(iommu, domain, dev,
2673 PASID_RID2PASID);
2674 else
2675 ret = intel_pasid_setup_second_level(iommu, domain,
2676 dev, PASID_RID2PASID);
2677 spin_unlock_irqrestore(&iommu->lock, flags);
2678 if (ret) {
2679 dev_err(dev, "Setup RID2PASID failed\n");
2680 dmar_remove_one_dev_info(dev);
2681 return NULL;
2682 }
2683 }
2684
2685 if (dev && domain_context_mapping(domain, dev)) {
2686 dev_err(dev, "Domain context map failed\n");
2687 dmar_remove_one_dev_info(dev);
2688 return NULL;
2689 }
2690
2691 return domain;
2692}
2693
2694static int iommu_domain_identity_map(struct dmar_domain *domain,
2695 unsigned long first_vpfn,
2696 unsigned long last_vpfn)
2697{
2698 /*
2699 * The RMRR range might overlap with the physical memory range;
2700 * clear it first.
2701 */
2702 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2703
2704 return __domain_mapping(domain, first_vpfn, NULL,
2705 first_vpfn, last_vpfn - first_vpfn + 1,
2706 DMA_PTE_READ|DMA_PTE_WRITE);
2707}
2708
2709static int md_domain_init(struct dmar_domain *domain, int guest_width);
2710
2711static int __init si_domain_init(int hw)
2712{
2713 struct dmar_rmrr_unit *rmrr;
2714 struct device *dev;
2715 int i, nid, ret;
2716
2717 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2718 if (!si_domain)
2719 return -EFAULT;
2720
2721 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2722 domain_exit(si_domain);
2723 return -EFAULT;
2724 }
2725
2726 if (hw)
2727 return 0;
2728
2729 for_each_online_node(nid) {
2730 unsigned long start_pfn, end_pfn;
2731 int i;
2732
2733 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2734 ret = iommu_domain_identity_map(si_domain,
2735 mm_to_dma_pfn(start_pfn),
2736 mm_to_dma_pfn(end_pfn));
2737 if (ret)
2738 return ret;
2739 }
2740 }
2741
2742 /*
2743 * Identity map the RMRRs so that devices with RMRRs can also use
2744 * the si_domain.
2745 */
2746 for_each_rmrr_units(rmrr) {
2747 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2748 i, dev) {
2749 unsigned long long start = rmrr->base_address;
2750 unsigned long long end = rmrr->end_address;
2751
2752 if (WARN_ON(end < start ||
2753 end >> agaw_to_width(si_domain->agaw)))
2754 continue;
2755
2756 ret = iommu_domain_identity_map(si_domain,
2757 mm_to_dma_pfn(start >> PAGE_SHIFT),
2758 mm_to_dma_pfn(end >> PAGE_SHIFT));
2759 if (ret)
2760 return ret;
2761 }
2762 }
2763
2764 return 0;
2765}
2766
2767static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2768{
2769 struct dmar_domain *ndomain;
2770 struct intel_iommu *iommu;
2771 u8 bus, devfn;
2772
2773 iommu = device_to_iommu(dev, &bus, &devfn);
2774 if (!iommu)
2775 return -ENODEV;
2776
2777 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2778 if (ndomain != domain)
2779 return -EBUSY;
2780
2781 return 0;
2782}
2783
2784static bool device_has_rmrr(struct device *dev)
2785{
2786 struct dmar_rmrr_unit *rmrr;
2787 struct device *tmp;
2788 int i;
2789
2790 rcu_read_lock();
2791 for_each_rmrr_units(rmrr) {
2792 /*
2793 * Return TRUE if this RMRR contains the device that
2794 * is passed in.
2795 */
2796 for_each_active_dev_scope(rmrr->devices,
2797 rmrr->devices_cnt, i, tmp)
2798 if (tmp == dev ||
2799 is_downstream_to_pci_bridge(dev, tmp)) {
2800 rcu_read_unlock();
2801 return true;
2802 }
2803 }
2804 rcu_read_unlock();
2805 return false;
2806}
2807
2808/**
2809 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2810 * is relaxable (ie. is allowed to be not enforced under some conditions)
2811 * @dev: device handle
2812 *
2813 * We assume that PCI USB devices with RMRRs have them largely
2814 * for historical reasons and that the RMRR space is not actively used post
2815 * boot. This exclusion may change if vendors begin to abuse it.
2816 *
2817 * The same exception is made for graphics devices, with the requirement that
2818 * any use of the RMRR regions will be torn down before assigning the device
2819 * to a guest.
2820 *
2821 * Return: true if the RMRR is relaxable, false otherwise
2822 */
2823static bool device_rmrr_is_relaxable(struct device *dev)
2824{
2825 struct pci_dev *pdev;
2826
2827 if (!dev_is_pci(dev))
2828 return false;
2829
2830 pdev = to_pci_dev(dev);
2831 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832 return true;
2833 else
2834 return false;
2835}
2836
2837/*
2838 * There are a couple cases where we need to restrict the functionality of
2839 * devices associated with RMRRs. The first is when evaluating a device for
2840 * identity mapping because problems exist when devices are moved in and out
2841 * of domains and their respective RMRR information is lost. This means that
2842 * a device with associated RMRRs will never be in a "passthrough" domain.
2843 * The second is use of the device through the IOMMU API. This interface
2844 * expects to have full control of the IOVA space for the device. We cannot
2845 * satisfy both the requirement that RMRR access is maintained and have an
2846 * unencumbered IOVA space. We also have no ability to quiesce the device's
2847 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2848 * We therefore prevent devices associated with an RMRR from participating in
2849 * the IOMMU API, which eliminates them from device assignment.
2850 *
2851 * In both cases, devices with relaxable RMRRs are not affected by this
2852 * restriction. See the device_rmrr_is_relaxable comment.
2853 */
2854static bool device_is_rmrr_locked(struct device *dev)
2855{
2856 if (!device_has_rmrr(dev))
2857 return false;
2858
2859 if (device_rmrr_is_relaxable(dev))
2860 return false;
2861
2862 return true;
2863}
2864
2865/*
2866 * Return the required default domain type for a specific device.
2867 *
2868 * @dev: the device in query
2869 * @startup: true if this is during early boot
2870 *
2871 * Returns:
2872 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2873 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2874 * - 0: both identity and dynamic domains work for this device
2875 */
2876static int device_def_domain_type(struct device *dev)
2877{
2878 if (dev_is_pci(dev)) {
2879 struct pci_dev *pdev = to_pci_dev(dev);
2880
2881 /*
2882 * Prevent any device marked as untrusted from getting
2883 * placed into the static identity mapping domain.
2884 */
2885 if (pdev->untrusted)
2886 return IOMMU_DOMAIN_DMA;
2887
2888 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889 return IOMMU_DOMAIN_IDENTITY;
2890
2891 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892 return IOMMU_DOMAIN_IDENTITY;
2893 }
2894
2895 return 0;
2896}
2897
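/*
 * Set up the invalidation interface for @iommu: prefer queued
 * invalidation (QI) and fall back to register-based invalidation if
 * QI cannot be enabled.
 */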
2898static void intel_iommu_init_qi(struct intel_iommu *iommu)
2899{
2900 /*
2901 * Start from a sane IOMMU hardware state.
2902 * If queued invalidation was already initialized by us
2903 * (for example, while enabling interrupt remapping) then
2904 * things are already rolling from a sane state.
2905 */
2906 if (!iommu->qi) {
2907 /*
2908 * Clear any previous faults.
2909 */
2910 dmar_fault(-1, iommu);
2911 /*
2912 * Disable queued invalidation if supported and already enabled
2913 * before OS handover.
2914 */
2915 dmar_disable_qi(iommu);
2916 }
2917
2918 if (dmar_enable_qi(iommu)) {
2919 /*
2920 * Queued Invalidate not enabled, use Register Based Invalidate
2921 */
2922 iommu->flush.flush_context = __iommu_flush_context;
2923 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2924 pr_info("%s: Using Register based invalidation\n",
2925 iommu->name);
2926 } else {
2927 iommu->flush.flush_context = qi_flush_context;
2928 iommu->flush.flush_iotlb = qi_flush_iotlb;
2929 pr_info("%s: Using Queued invalidation\n", iommu->name);
2930 }
2931}
2932
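/*
 * Copy the context entries of one bus from the old kernel's tables
 * (referenced by @old_re) into newly allocated pages stored in @tbl.
 * PASID use is cleared and each entry is marked as copied so that
 * faults on these entries can be recognized later.
 */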
2933static int copy_context_table(struct intel_iommu *iommu,
2934 struct root_entry *old_re,
2935 struct context_entry **tbl,
2936 int bus, bool ext)
2937{
2938 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2939 struct context_entry *new_ce = NULL, ce;
2940 struct context_entry *old_ce = NULL;
2941 struct root_entry re;
2942 phys_addr_t old_ce_phys;
2943
2944 tbl_idx = ext ? bus * 2 : bus;
2945 memcpy(&re, old_re, sizeof(re));
2946
2947 for (devfn = 0; devfn < 256; devfn++) {
2948 /* First calculate the correct index */
2949 idx = (ext ? devfn * 2 : devfn) % 256;
2950
2951 if (idx == 0) {
2952 /* First save what we may have and clean up */
2953 if (new_ce) {
2954 tbl[tbl_idx] = new_ce;
2955 __iommu_flush_cache(iommu, new_ce,
2956 VTD_PAGE_SIZE);
2957 pos = 1;
2958 }
2959
2960 if (old_ce)
2961 memunmap(old_ce);
2962
2963 ret = 0;
2964 if (devfn < 0x80)
2965 old_ce_phys = root_entry_lctp(&re);
2966 else
2967 old_ce_phys = root_entry_uctp(&re);
2968
2969 if (!old_ce_phys) {
2970 if (ext && devfn == 0) {
2971 /* No LCTP, try UCTP */
2972 devfn = 0x7f;
2973 continue;
2974 } else {
2975 goto out;
2976 }
2977 }
2978
2979 ret = -ENOMEM;
2980 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2981 MEMREMAP_WB);
2982 if (!old_ce)
2983 goto out;
2984
2985 new_ce = alloc_pgtable_page(iommu->node);
2986 if (!new_ce)
2987 goto out_unmap;
2988
2989 ret = 0;
2990 }
2991
2992 /* Now copy the context entry */
2993 memcpy(&ce, old_ce + idx, sizeof(ce));
2994
2995 if (!__context_present(&ce))
2996 continue;
2997
2998 did = context_domain_id(&ce);
2999 if (did >= 0 && did < cap_ndoms(iommu->cap))
3000 set_bit(did, iommu->domain_ids);
3001
3002 /*
3003 * We need a marker for copied context entries. This
3004 * marker needs to work for the old format as well as
3005 * for extended context entries.
3006 *
3007 * Bit 67 of the context entry is used. In the old
3008 * format this bit is available to software, in the
3009 * extended format it is the PGE bit, but PGE is ignored
3010 * by HW if PASIDs are disabled (and thus still
3011 * available).
3012 *
3013 * So disable PASIDs first and then mark the entry
3014 * copied. This means that we don't copy PASID
3015 * translations from the old kernel, but this is fine as
3016 * faults there are not fatal.
3017 */
3018 context_clear_pasid_enable(&ce);
3019 context_set_copied(&ce);
3020
3021 new_ce[idx] = ce;
3022 }
3023
3024 tbl[tbl_idx + pos] = new_ce;
3025
3026 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3027
3028out_unmap:
3029 memunmap(old_ce);
3030
3031out:
3032 return ret;
3033}
3034
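/*
 * When the previous kernel left translation enabled (e.g. in a kdump
 * kernel), copy its context tables and hook them into the new
 * root-entry table so that in-flight DMA keeps working.
 */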
3035static int copy_translation_tables(struct intel_iommu *iommu)
3036{
3037 struct context_entry **ctxt_tbls;
3038 struct root_entry *old_rt;
3039 phys_addr_t old_rt_phys;
3040 int ctxt_table_entries;
3041 unsigned long flags;
3042 u64 rtaddr_reg;
3043 int bus, ret;
3044 bool new_ext, ext;
3045
3046 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3047 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3048 new_ext = !!ecap_ecs(iommu->ecap);
3049
3050 /*
3051 * The RTT bit can only be changed when translation is disabled,
3052 * but disabling translation would open a window for data
3053 * corruption. So bail out and don't copy anything if we would
3054 * have to change the bit.
3055 */
3056 if (new_ext != ext)
3057 return -EINVAL;
3058
3059 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3060 if (!old_rt_phys)
3061 return -EINVAL;
3062
3063 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3064 if (!old_rt)
3065 return -ENOMEM;
3066
3067 /* This is too big for the stack - allocate it from slab */
3068 ctxt_table_entries = ext ? 512 : 256;
3069 ret = -ENOMEM;
3070 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3071 if (!ctxt_tbls)
3072 goto out_unmap;
3073
3074 for (bus = 0; bus < 256; bus++) {
3075 ret = copy_context_table(iommu, &old_rt[bus],
3076 ctxt_tbls, bus, ext);
3077 if (ret) {
3078 pr_err("%s: Failed to copy context table for bus %d\n",
3079 iommu->name, bus);
3080 continue;
3081 }
3082 }
3083
3084 spin_lock_irqsave(&iommu->lock, flags);
3085
3086 /* Context tables are copied, now write them to the root_entry table */
3087 for (bus = 0; bus < 256; bus++) {
3088 int idx = ext ? bus * 2 : bus;
3089 u64 val;
3090
3091 if (ctxt_tbls[idx]) {
3092 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3093 iommu->root_entry[bus].lo = val;
3094 }
3095
3096 if (!ext || !ctxt_tbls[idx + 1])
3097 continue;
3098
3099 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3100 iommu->root_entry[bus].hi = val;
3101 }
3102
3103 spin_unlock_irqrestore(&iommu->lock, flags);
3104
3105 kfree(ctxt_tbls);
3106
3107 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3108
3109 ret = 0;
3110
3111out_unmap:
3112 memunmap(old_rt);
3113
3114 return ret;
3115}
3116
3117#ifdef CONFIG_INTEL_IOMMU_SVM
3118static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
3119{
3120 struct intel_iommu *iommu = data;
3121 ioasid_t ioasid;
3122
3123 if (!iommu)
3124 return INVALID_IOASID;
3125 /*
3126 * The VT-d virtual command interface always uses the full 20-bit
3127 * PASID range. The host can partition the guest PASID range based
3128 * on policies, but this is out of the guest's control.
3129 */
3130 if (min < PASID_MIN || max > intel_pasid_max_id)
3131 return INVALID_IOASID;
3132
3133 if (vcmd_alloc_pasid(iommu, &ioasid))
3134 return INVALID_IOASID;
3135
3136 return ioasid;
3137}
3138
3139static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
3140{
3141 struct intel_iommu *iommu = data;
3142
3143 if (!iommu)
3144 return;
3145 /*
3146 * The sanity check of the ioasid owner is done at the upper layer,
3147 * e.g. VFIO. We can only free the PASID when all devices are unbound.
3148 */
3149 if (ioasid_find(NULL, ioasid, NULL)) {
3150 pr_alert("Cannot free active IOASID %d\n", ioasid);
3151 return;
3152 }
3153 vcmd_free_pasid(iommu, ioasid);
3154}
3155
3156static void register_pasid_allocator(struct intel_iommu *iommu)
3157{
3158 /*
3159 * If we are running in the host, there is no need for a custom
3160 * allocator because PASIDs are allocated host system-wide.
3161 */
3162 if (!cap_caching_mode(iommu->cap))
3163 return;
3164
3165 if (!sm_supported(iommu)) {
3166 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
3167 return;
3168 }
3169
3170 /*
3171 * Register a custom PASID allocator if we are running in a guest;
3172 * guest PASIDs must be obtained via the virtual command interface.
3173 * There can be multiple vIOMMUs in each guest but only one allocator
3174 * is active. All vIOMMU allocators will eventually call the same
3175 * host allocator.
3176 */
3177 if (!ecap_vcs(iommu->ecap) || !vccap_pasid(iommu->vccap))
3178 return;
3179
3180 pr_info("Register custom PASID allocator\n");
3181 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
3182 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
3183 iommu->pasid_allocator.pdata = (void *)iommu;
3184 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
3185 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
3186 /*
3187 * Disable scalable mode on this IOMMU if there
3188 * is no custom allocator. Mixing SM-capable vIOMMUs
3189 * and non-SM vIOMMUs is not supported.
3190 */
3191 intel_iommu_sm = 0;
3192 }
3193}
3194#endif
3195
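/*
 * Boot-time initialization of all DMAR units: allocate the global
 * iommu array, set up invalidation, domain IDs and root entries,
 * build the static identity domain and enable fault reporting.
 */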
3196static int __init init_dmars(void)
3197{
3198 struct dmar_drhd_unit *drhd;
3199 struct intel_iommu *iommu;
3200 int ret;
3201
3202 /*
3203 * for each drhd
3204 * allocate root
3205 * initialize and program root entry to not present
3206 * endfor
3207 */
3208 for_each_drhd_unit(drhd) {
3209 /*
3210 * Lock not needed as this is only incremented in the
3211 * single-threaded kernel __init code path; all other
3212 * accesses are read-only.
3213 */
3214 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3215 g_num_of_iommus++;
3216 continue;
3217 }
3218 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3219 }
3220
3221 /* Preallocate enough resources for IOMMU hot-addition */
3222 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3223 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3224
3225 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3226 GFP_KERNEL);
3227 if (!g_iommus) {
3228 pr_err("Allocating global iommu array failed\n");
3229 ret = -ENOMEM;
3230 goto error;
3231 }
3232
3233 for_each_iommu(iommu, drhd) {
3234 if (drhd->ignored) {
3235 iommu_disable_translation(iommu);
3236 continue;
3237 }
3238
3239 /*
3240 * Find the max pasid size of all IOMMUs in the system.
3241 * We need to ensure the system pasid table is no bigger
3242 * than the smallest supported.
3243 */
3244 if (pasid_supported(iommu)) {
3245 u32 temp = 2 << ecap_pss(iommu->ecap);
3246
3247 intel_pasid_max_id = min_t(u32, temp,
3248 intel_pasid_max_id);
3249 }
3250
3251 g_iommus[iommu->seq_id] = iommu;
3252
3253 intel_iommu_init_qi(iommu);
3254
3255 ret = iommu_init_domains(iommu);
3256 if (ret)
3257 goto free_iommu;
3258
3259 init_translation_status(iommu);
3260
3261 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3262 iommu_disable_translation(iommu);
3263 clear_translation_pre_enabled(iommu);
3264 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3265 iommu->name);
3266 }
3267
3268 /*
3269 * TBD:
3270 * we could share the same root & context tables
3271 * among all IOMMUs. Need to split them later.
3272 */
3273 ret = iommu_alloc_root_entry(iommu);
3274 if (ret)
3275 goto free_iommu;
3276
3277 if (translation_pre_enabled(iommu)) {
3278 pr_info("Translation already enabled - trying to copy translation structures\n");
3279
3280 ret = copy_translation_tables(iommu);
3281 if (ret) {
3282 /*
3283 * We found the IOMMU with translation
3284 * enabled - but failed to copy over the
3285 * old root-entry table. Try to proceed
3286 * by disabling translation now and
3287 * allocating a clean root-entry table.
3288 * This might cause DMAR faults, but
3289 * probably the dump will still succeed.
3290 */
3291 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3292 iommu->name);
3293 iommu_disable_translation(iommu);
3294 clear_translation_pre_enabled(iommu);
3295 } else {
3296 pr_info("Copied translation tables from previous kernel for %s\n",
3297 iommu->name);
3298 }
3299 }
3300
3301 if (!ecap_pass_through(iommu->ecap))
3302 hw_pass_through = 0;
3303 intel_svm_check(iommu);
3304 }
3305
3306 /*
3307 * Now that qi is enabled on all iommus, set the root entry and flush
3308 * caches. This is required on some Intel X58 chipsets, otherwise the
3309 * flush_context function will loop forever and the boot hangs.
3310 */
3311 for_each_active_iommu(iommu, drhd) {
3312 iommu_flush_write_buffer(iommu);
3313#ifdef CONFIG_INTEL_IOMMU_SVM
3314 register_pasid_allocator(iommu);
3315#endif
3316 iommu_set_root_entry(iommu);
3317 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3318 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3319 }
3320
3321#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3322 dmar_map_gfx = 0;
3323#endif
3324
3325 if (!dmar_map_gfx)
3326 iommu_identity_mapping |= IDENTMAP_GFX;
3327
3328 check_tylersburg_isoch();
3329
3330 ret = si_domain_init(hw_pass_through);
3331 if (ret)
3332 goto free_iommu;
3333
3334 /*
3335 * for each drhd
3336 * enable fault log
3337 * global invalidate context cache
3338 * global invalidate iotlb
3339 * enable translation
3340 */
3341 for_each_iommu(iommu, drhd) {
3342 if (drhd->ignored) {
3343 /*
3344 * we always have to disable PMRs or DMA may fail on
3345 * this device
3346 */
3347 if (force_on)
3348 iommu_disable_protect_mem_regions(iommu);
3349 continue;
3350 }
3351
3352 iommu_flush_write_buffer(iommu);
3353
3354#ifdef CONFIG_INTEL_IOMMU_SVM
3355 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3356 /*
3357 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3358 * could cause a lock race condition, so drop the lock here.
3359 */
3360 up_write(&dmar_global_lock);
3361 ret = intel_svm_enable_prq(iommu);
3362 down_write(&dmar_global_lock);
3363 if (ret)
3364 goto free_iommu;
3365 }
3366#endif
3367 ret = dmar_set_interrupt(iommu);
3368 if (ret)
3369 goto free_iommu;
3370 }
3371
3372 return 0;
3373
3374free_iommu:
3375 for_each_active_iommu(iommu, drhd) {
3376 disable_dmar_iommu(iommu);
3377 free_dmar_iommu(iommu);
3378 }
3379
3380 kfree(g_iommus);
3381
3382error:
3383 return ret;
3384}
3385
3386/* This takes a number of _MM_ pages, not VTD pages */
3387static unsigned long intel_alloc_iova(struct device *dev,
3388 struct dmar_domain *domain,
3389 unsigned long nrpages, uint64_t dma_mask)
3390{
3391 unsigned long iova_pfn;
3392
3393 /*
3394 * Restrict dma_mask to the width that the iommu can handle.
3395 * First-level translation restricts the input-address to a
3396 * canonical address (i.e., address bits 63:N have the same
3397 * value as address bit [N-1], where N is 48-bits with 4-level
3398 * paging and 57-bits with 5-level paging). Hence, skip bit
3399 * [N-1].
3400 */
3401 if (domain_use_first_level(domain))
3402 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3403 dma_mask);
3404 else
3405 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3406 dma_mask);
3407
3408 /* Ensure we reserve the whole size-aligned region */
3409 nrpages = __roundup_pow_of_two(nrpages);
3410
3411 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3412 /*
3413 * First try to allocate an io virtual address in
3414 * DMA_BIT_MASK(32) and if that fails then try allocating
3415 * from higher range
3416 */
3417 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3418 IOVA_PFN(DMA_BIT_MASK(32)), false);
3419 if (iova_pfn)
3420 return iova_pfn;
3421 }
3422 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3423 IOVA_PFN(dma_mask), true);
3424 if (unlikely(!iova_pfn)) {
3425 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3426 nrpages);
3427 return 0;
3428 }
3429
3430 return iova_pfn;
3431}
3432
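/*
 * Map @size bytes starting at @paddr into the DMA domain of @dev and
 * return the resulting bus address, or DMA_MAPPING_ERROR on failure.
 */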
3433static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3434 size_t size, int dir, u64 dma_mask)
3435{
3436 struct dmar_domain *domain;
3437 phys_addr_t start_paddr;
3438 unsigned long iova_pfn;
3439 int prot = 0;
3440 int ret;
3441 struct intel_iommu *iommu;
3442 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3443
3444 BUG_ON(dir == DMA_NONE);
3445
3446 if (unlikely(attach_deferred(dev)))
3447 do_deferred_attach(dev);
3448
3449 domain = find_domain(dev);
3450 if (!domain)
3451 return DMA_MAPPING_ERROR;
3452
3453 iommu = domain_get_iommu(domain);
3454 size = aligned_nrpages(paddr, size);
3455
3456 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3457 if (!iova_pfn)
3458 goto error;
3459
3460 /*
3461 * Check if DMAR supports zero-length reads on write-only
3462 * mappings.
3463 */
3464 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3465 !cap_zlr(iommu->cap))
3466 prot |= DMA_PTE_READ;
3467 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3468 prot |= DMA_PTE_WRITE;
3469 /*
3470 * paddr to (paddr + size) might span a partial page, so we map the
3471 * whole page. Note: if two parts of one page are mapped separately,
3472 * we might have two guest addresses mapping to the same host paddr,
3473 * but this is not a big problem.
3474 */
3475 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3476 mm_to_dma_pfn(paddr_pfn), size, prot);
3477 if (ret)
3478 goto error;
3479
3480 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3481 start_paddr += paddr & ~PAGE_MASK;
3482
3483 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3484
3485 return start_paddr;
3486
3487error:
3488 if (iova_pfn)
3489 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3490 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3491 size, (unsigned long long)paddr, dir);
3492 return DMA_MAPPING_ERROR;
3493}
3494
3495static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3496 unsigned long offset, size_t size,
3497 enum dma_data_direction dir,
3498 unsigned long attrs)
3499{
3500 return __intel_map_single(dev, page_to_phys(page) + offset,
3501 size, dir, *dev->dma_mask);
3502}
3503
3504static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3505 size_t size, enum dma_data_direction dir,
3506 unsigned long attrs)
3507{
3508 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3509}
3510
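/*
 * Tear down the mapping at @dev_addr. The IOTLB is flushed
 * synchronously in strict mode, for untrusted devices and when no
 * flush queue exists; otherwise the flush and the freeing of the
 * page-table pages are deferred to the IOVA flush queue.
 */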
3511static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3512{
3513 struct dmar_domain *domain;
3514 unsigned long start_pfn, last_pfn;
3515 unsigned long nrpages;
3516 unsigned long iova_pfn;
3517 struct intel_iommu *iommu;
3518 struct page *freelist;
3519 struct pci_dev *pdev = NULL;
3520
3521 domain = find_domain(dev);
3522 BUG_ON(!domain);
3523
3524 iommu = domain_get_iommu(domain);
3525
3526 iova_pfn = IOVA_PFN(dev_addr);
3527
3528 nrpages = aligned_nrpages(dev_addr, size);
3529 start_pfn = mm_to_dma_pfn(iova_pfn);
3530 last_pfn = start_pfn + nrpages - 1;
3531
3532 if (dev_is_pci(dev))
3533 pdev = to_pci_dev(dev);
3534
3535 freelist = domain_unmap(domain, start_pfn, last_pfn);
3536 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3537 !has_iova_flush_queue(&domain->iovad)) {
3538 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3539 nrpages, !freelist, 0);
3540 /* free iova */
3541 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3542 dma_free_pagelist(freelist);
3543 } else {
3544 queue_iova(&domain->iovad, iova_pfn, nrpages,
3545 (unsigned long)freelist);
3546 /*
3547 * Queue up the release of the unmap to save the 1/6th of the
3548 * CPU time used up by the iotlb flush operation.
3549 */
3550 }
3551
3552 trace_unmap_single(dev, dev_addr, size);
3553}
3554
3555static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3556 size_t size, enum dma_data_direction dir,
3557 unsigned long attrs)
3558{
3559 intel_unmap(dev, dev_addr, size);
3560}
3561
3562static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3563 size_t size, enum dma_data_direction dir, unsigned long attrs)
3564{
3565 intel_unmap(dev, dev_addr, size);
3566}
3567
3568static void *intel_alloc_coherent(struct device *dev, size_t size,
3569 dma_addr_t *dma_handle, gfp_t flags,
3570 unsigned long attrs)
3571{
3572 struct page *page = NULL;
3573 int order;
3574
3575 if (unlikely(attach_deferred(dev)))
3576 do_deferred_attach(dev);
3577
3578 size = PAGE_ALIGN(size);
3579 order = get_order(size);
3580
3581 if (gfpflags_allow_blocking(flags)) {
3582 unsigned int count = size >> PAGE_SHIFT;
3583
3584 page = dma_alloc_from_contiguous(dev, count, order,
3585 flags & __GFP_NOWARN);
3586 }
3587
3588 if (!page)
3589 page = alloc_pages(flags, order);
3590 if (!page)
3591 return NULL;
3592 memset(page_address(page), 0, size);
3593
3594 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3595 DMA_BIDIRECTIONAL,
3596 dev->coherent_dma_mask);
3597 if (*dma_handle != DMA_MAPPING_ERROR)
3598 return page_address(page);
3599 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3600 __free_pages(page, order);
3601
3602 return NULL;
3603}
3604
3605static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3606 dma_addr_t dma_handle, unsigned long attrs)
3607{
3608 int order;
3609 struct page *page = virt_to_page(vaddr);
3610
3611 size = PAGE_ALIGN(size);
3612 order = get_order(size);
3613
3614 intel_unmap(dev, dma_handle, size);
3615 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3616 __free_pages(page, order);
3617}
3618
3619static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3620 int nelems, enum dma_data_direction dir,
3621 unsigned long attrs)
3622{
3623 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3624 unsigned long nrpages = 0;
3625 struct scatterlist *sg;
3626 int i;
3627
3628 for_each_sg(sglist, sg, nelems, i) {
3629 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3630 }
3631
3632 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3633
3634 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3635}
3636
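/*
 * Map a scatterlist for @dev: allocate one IOVA range large enough
 * for all segments, map them back to back and return the number of
 * segments mapped, or 0 on failure.
 */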
3637static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3638 enum dma_data_direction dir, unsigned long attrs)
3639{
3640 int i;
3641 struct dmar_domain *domain;
3642 size_t size = 0;
3643 int prot = 0;
3644 unsigned long iova_pfn;
3645 int ret;
3646 struct scatterlist *sg;
3647 unsigned long start_vpfn;
3648 struct intel_iommu *iommu;
3649
3650 BUG_ON(dir == DMA_NONE);
3651
3652 if (unlikely(attach_deferred(dev)))
3653 do_deferred_attach(dev);
3654
3655 domain = find_domain(dev);
3656 if (!domain)
3657 return 0;
3658
3659 iommu = domain_get_iommu(domain);
3660
3661 for_each_sg(sglist, sg, nelems, i)
3662 size += aligned_nrpages(sg->offset, sg->length);
3663
3664 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3665 *dev->dma_mask);
3666 if (!iova_pfn) {
3667 sglist->dma_length = 0;
3668 return 0;
3669 }
3670
3671 /*
3672 * Check if DMAR supports zero-length reads on write-only
3673 * mappings.
3674 */
3675 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3676 !cap_zlr(iommu->cap))
3677 prot |= DMA_PTE_READ;
3678 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3679 prot |= DMA_PTE_WRITE;
3680
3681 start_vpfn = mm_to_dma_pfn(iova_pfn);
3682
3683 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3684 if (unlikely(ret)) {
3685 dma_pte_free_pagetable(domain, start_vpfn,
3686 start_vpfn + size - 1,
3687 agaw_to_level(domain->agaw) + 1);
3688 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3689 return 0;
3690 }
3691
3692 for_each_sg(sglist, sg, nelems, i)
3693 trace_map_sg(dev, i + 1, nelems, sg);
3694
3695 return nelems;
3696}
3697
3698static u64 intel_get_required_mask(struct device *dev)
3699{
3700 return DMA_BIT_MASK(32);
3701}
3702
3703static const struct dma_map_ops intel_dma_ops = {
3704 .alloc = intel_alloc_coherent,
3705 .free = intel_free_coherent,
3706 .map_sg = intel_map_sg,
3707 .unmap_sg = intel_unmap_sg,
3708 .map_page = intel_map_page,
3709 .unmap_page = intel_unmap_page,
3710 .map_resource = intel_map_resource,
3711 .unmap_resource = intel_unmap_resource,
3712 .dma_supported = dma_direct_supported,
3713 .mmap = dma_common_mmap,
3714 .get_sgtable = dma_common_get_sgtable,
3715 .get_required_mask = intel_get_required_mask,
3716};
3717
3718static void
3719bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3720 enum dma_data_direction dir, enum dma_sync_target target)
3721{
3722 struct dmar_domain *domain;
3723 phys_addr_t tlb_addr;
3724
3725 domain = find_domain(dev);
3726 if (WARN_ON(!domain))
3727 return;
3728
3729 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3730 if (is_swiotlb_buffer(tlb_addr))
3731 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3732}
3733
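/*
 * Map @paddr for a device that needs bounce buffering: buffers that
 * are not VTD_PAGE_SIZE aligned are routed through swiotlb so the
 * device cannot reach memory outside the requested range.
 */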
3734static dma_addr_t
3735bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3736 enum dma_data_direction dir, unsigned long attrs,
3737 u64 dma_mask)
3738{
3739 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3740 struct dmar_domain *domain;
3741 struct intel_iommu *iommu;
3742 unsigned long iova_pfn;
3743 unsigned long nrpages;
3744 phys_addr_t tlb_addr;
3745 int prot = 0;
3746 int ret;
3747
3748 if (unlikely(attach_deferred(dev)))
3749 do_deferred_attach(dev);
3750
3751 domain = find_domain(dev);
3752
3753 if (WARN_ON(dir == DMA_NONE || !domain))
3754 return DMA_MAPPING_ERROR;
3755
3756 iommu = domain_get_iommu(domain);
3757 if (WARN_ON(!iommu))
3758 return DMA_MAPPING_ERROR;
3759
3760 nrpages = aligned_nrpages(0, size);
3761 iova_pfn = intel_alloc_iova(dev, domain,
3762 dma_to_mm_pfn(nrpages), dma_mask);
3763 if (!iova_pfn)
3764 return DMA_MAPPING_ERROR;
3765
3766 /*
3767 * Check if DMAR supports zero-length reads on write-only
3768 * mappings.
3769 */
3770 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3771 !cap_zlr(iommu->cap))
3772 prot |= DMA_PTE_READ;
3773 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3774 prot |= DMA_PTE_WRITE;
3775
3776 /*
3777 * If both the physical buffer start address and size are
3778 * page aligned, we don't need to use a bounce page.
3779 */
3780 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3781 tlb_addr = swiotlb_tbl_map_single(dev,
3782 __phys_to_dma(dev, io_tlb_start),
3783 paddr, size, aligned_size, dir, attrs);
3784 if (tlb_addr == DMA_MAPPING_ERROR) {
3785 goto swiotlb_error;
3786 } else {
3787 /* Cleanup the padding area. */
3788 void *padding_start = phys_to_virt(tlb_addr);
3789 size_t padding_size = aligned_size;
3790
3791 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3792 (dir == DMA_TO_DEVICE ||
3793 dir == DMA_BIDIRECTIONAL)) {
3794 padding_start += size;
3795 padding_size -= size;
3796 }
3797
3798 memset(padding_start, 0, padding_size);
3799 }
3800 } else {
3801 tlb_addr = paddr;
3802 }
3803
3804 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3805 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3806 if (ret)
3807 goto mapping_error;
3808
3809 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3810
3811 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3812
3813mapping_error:
3814 if (is_swiotlb_buffer(tlb_addr))
3815 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3816 aligned_size, dir, attrs);
3817swiotlb_error:
3818 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3819 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3820 size, (unsigned long long)paddr, dir);
3821
3822 return DMA_MAPPING_ERROR;
3823}
3824
3825static void
3826bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3827 enum dma_data_direction dir, unsigned long attrs)
3828{
3829 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3830 struct dmar_domain *domain;
3831 phys_addr_t tlb_addr;
3832
3833 domain = find_domain(dev);
3834 if (WARN_ON(!domain))
3835 return;
3836
3837 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3838 if (WARN_ON(!tlb_addr))
3839 return;
3840
3841 intel_unmap(dev, dev_addr, size);
3842 if (is_swiotlb_buffer(tlb_addr))
3843 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3844 aligned_size, dir, attrs);
3845
3846 trace_bounce_unmap_single(dev, dev_addr, size);
3847}
3848
3849static dma_addr_t
3850bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3851 size_t size, enum dma_data_direction dir, unsigned long attrs)
3852{
3853 return bounce_map_single(dev, page_to_phys(page) + offset,
3854 size, dir, attrs, *dev->dma_mask);
3855}
3856
3857static dma_addr_t
3858bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3859 enum dma_data_direction dir, unsigned long attrs)
3860{
3861 return bounce_map_single(dev, phys_addr, size,
3862 dir, attrs, *dev->dma_mask);
3863}
3864
3865static void
3866bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3867 enum dma_data_direction dir, unsigned long attrs)
3868{
3869 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3870}
3871
3872static void
3873bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3874 enum dma_data_direction dir, unsigned long attrs)
3875{
3876 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3877}
3878
3879static void
3880bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3881 enum dma_data_direction dir, unsigned long attrs)
3882{
3883 struct scatterlist *sg;
3884 int i;
3885
3886 for_each_sg(sglist, sg, nelems, i)
3887 bounce_unmap_page(dev, sg->dma_address,
3888 sg_dma_len(sg), dir, attrs);
3889}
3890
3891static int
3892bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3893 enum dma_data_direction dir, unsigned long attrs)
3894{
3895 int i;
3896 struct scatterlist *sg;
3897
3898 for_each_sg(sglist, sg, nelems, i) {
3899 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3900 sg->offset, sg->length,
3901 dir, attrs);
3902 if (sg->dma_address == DMA_MAPPING_ERROR)
3903 goto out_unmap;
3904 sg_dma_len(sg) = sg->length;
3905 }
3906
3907 for_each_sg(sglist, sg, nelems, i)
3908 trace_bounce_map_sg(dev, i + 1, nelems, sg);
3909
3910 return nelems;
3911
3912out_unmap:
3913 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3914 return 0;
3915}
3916
3917static void
3918bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3919 size_t size, enum dma_data_direction dir)
3920{
3921 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3922}
3923
3924static void
3925bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
3926 size_t size, enum dma_data_direction dir)
3927{
3928 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
3929}
3930
3931static void
3932bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
3933 int nelems, enum dma_data_direction dir)
3934{
3935 struct scatterlist *sg;
3936 int i;
3937
3938 for_each_sg(sglist, sg, nelems, i)
3939 bounce_sync_single(dev, sg_dma_address(sg),
3940 sg_dma_len(sg), dir, SYNC_FOR_CPU);
3941}
3942
3943static void
3944bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
3945 int nelems, enum dma_data_direction dir)
3946{
3947 struct scatterlist *sg;
3948 int i;
3949
3950 for_each_sg(sglist, sg, nelems, i)
3951 bounce_sync_single(dev, sg_dma_address(sg),
3952 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
3953}
3954
3955static const struct dma_map_ops bounce_dma_ops = {
3956 .alloc = intel_alloc_coherent,
3957 .free = intel_free_coherent,
3958 .map_sg = bounce_map_sg,
3959 .unmap_sg = bounce_unmap_sg,
3960 .map_page = bounce_map_page,
3961 .unmap_page = bounce_unmap_page,
3962 .sync_single_for_cpu = bounce_sync_single_for_cpu,
3963 .sync_single_for_device = bounce_sync_single_for_device,
3964 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
3965 .sync_sg_for_device = bounce_sync_sg_for_device,
3966 .map_resource = bounce_map_resource,
3967 .unmap_resource = bounce_unmap_resource,
3968 .dma_supported = dma_direct_supported,
3969};
3970
3971static inline int iommu_domain_cache_init(void)
3972{
3973 int ret = 0;
3974
3975 iommu_domain_cache = kmem_cache_create("iommu_domain",
3976 sizeof(struct dmar_domain),
3977 0,
3978 SLAB_HWCACHE_ALIGN,
3980 NULL);
3981 if (!iommu_domain_cache) {
3982 pr_err("Couldn't create iommu_domain cache\n");
3983 ret = -ENOMEM;
3984 }
3985
3986 return ret;
3987}
3988
3989static inline int iommu_devinfo_cache_init(void)
3990{
3991 int ret = 0;
3992
3993 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3994 sizeof(struct device_domain_info),
3995 0,
3996 SLAB_HWCACHE_ALIGN,
3997 NULL);
3998 if (!iommu_devinfo_cache) {
3999 pr_err("Couldn't create devinfo cache\n");
4000 ret = -ENOMEM;
4001 }
4002
4003 return ret;
4004}
4005
4006static int __init iommu_init_mempool(void)
4007{
4008 int ret;

4009 ret = iova_cache_get();
4010 if (ret)
4011 return ret;
4012
4013 ret = iommu_domain_cache_init();
4014 if (ret)
4015 goto domain_error;
4016
4017 ret = iommu_devinfo_cache_init();
4018 if (!ret)
4019 return ret;
4020
4021 kmem_cache_destroy(iommu_domain_cache);
4022domain_error:
4023 iova_cache_put();
4024
4025 return -ENOMEM;
4026}
4027
4028static void __init iommu_exit_mempool(void)
4029{
4030 kmem_cache_destroy(iommu_devinfo_cache);
4031 kmem_cache_destroy(iommu_domain_cache);
4032 iova_cache_put();
4033}
4034
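/*
 * Mark DRHD units as ignored when their device scope is empty, and
 * mark graphics-only units as gfx-dedicated (ignoring them entirely
 * if gfx mapping is disabled).
 */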
4035static void __init init_no_remapping_devices(void)
4036{
4037 struct dmar_drhd_unit *drhd;
4038 struct device *dev;
4039 int i;
4040
4041 for_each_drhd_unit(drhd) {
4042 if (!drhd->include_all) {
4043 for_each_active_dev_scope(drhd->devices,
4044 drhd->devices_cnt, i, dev)
4045 break;
4046 /* ignore DMAR unit if no devices exist */
4047 if (i == drhd->devices_cnt)
4048 drhd->ignored = 1;
4049 }
4050 }
4051
4052 for_each_active_drhd_unit(drhd) {
4053 if (drhd->include_all)
4054 continue;
4055
4056 for_each_active_dev_scope(drhd->devices,
4057 drhd->devices_cnt, i, dev)
4058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4059 break;
4060 if (i < drhd->devices_cnt)
4061 continue;
4062
4063 /* This IOMMU has *only* gfx devices. Mark it as gfx-dedicated
4064 and bypass it entirely if gfx mapping is disabled */
4065 drhd->gfx_dedicated = 1;
4066 if (!dmar_map_gfx)
4067 drhd->ignored = 1;
4068 }
4069}
4070
4071#ifdef CONFIG_SUSPEND
4072static int init_iommu_hw(void)
4073{
4074 struct dmar_drhd_unit *drhd;
4075 struct intel_iommu *iommu = NULL;
4076
4077 for_each_active_iommu(iommu, drhd)
4078 if (iommu->qi)
4079 dmar_reenable_qi(iommu);
4080
4081 for_each_iommu(iommu, drhd) {
4082 if (drhd->ignored) {
4083 /*
4084 * we always have to disable PMRs or DMA may fail on
4085 * this device
4086 */
4087 if (force_on)
4088 iommu_disable_protect_mem_regions(iommu);
4089 continue;
4090 }
4091
4092 iommu_flush_write_buffer(iommu);
4093
4094 iommu_set_root_entry(iommu);
4095
4096 iommu->flush.flush_context(iommu, 0, 0, 0,
4097 DMA_CCMD_GLOBAL_INVL);
4098 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4099 iommu_enable_translation(iommu);
4100 iommu_disable_protect_mem_regions(iommu);
4101 }
4102
4103 return 0;
4104}
4105
4106static void iommu_flush_all(void)
4107{
4108 struct dmar_drhd_unit *drhd;
4109 struct intel_iommu *iommu;
4110
4111 for_each_active_iommu(iommu, drhd) {
4112 iommu->flush.flush_context(iommu, 0, 0, 0,
4113 DMA_CCMD_GLOBAL_INVL);
4114 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4115 DMA_TLB_GLOBAL_FLUSH);
4116 }
4117}
4118
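/*
 * Save the fault-event registers of each IOMMU and disable
 * translation across suspend; iommu_resume() re-initializes the
 * hardware and restores the saved registers.
 */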
4119static int iommu_suspend(void)
4120{
4121 struct dmar_drhd_unit *drhd;
4122 struct intel_iommu *iommu = NULL;
4123 unsigned long flag;
4124
4125 for_each_active_iommu(iommu, drhd) {
4126 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4127 GFP_ATOMIC);
4128 if (!iommu->iommu_state)
4129 goto nomem;
4130 }
4131
4132 iommu_flush_all();
4133
4134 for_each_active_iommu(iommu, drhd) {
4135 iommu_disable_translation(iommu);
4136
4137 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4138
4139 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4140 readl(iommu->reg + DMAR_FECTL_REG);
4141 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4142 readl(iommu->reg + DMAR_FEDATA_REG);
4143 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4144 readl(iommu->reg + DMAR_FEADDR_REG);
4145 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4146 readl(iommu->reg + DMAR_FEUADDR_REG);
4147
4148 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4149 }
4150 return 0;
4151
4152nomem:
4153 for_each_active_iommu(iommu, drhd)
4154 kfree(iommu->iommu_state);
4155
4156 return -ENOMEM;
4157}
4158
4159static void iommu_resume(void)
4160{
4161 struct dmar_drhd_unit *drhd;
4162 struct intel_iommu *iommu = NULL;
4163 unsigned long flag;
4164
4165 if (init_iommu_hw()) {
4166 if (force_on)
4167 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4168 else
4169 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4170 return;
4171 }
4172
4173 for_each_active_iommu(iommu, drhd) {
4174
4175 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4176
4177 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4178 iommu->reg + DMAR_FECTL_REG);
4179 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4180 iommu->reg + DMAR_FEDATA_REG);
4181 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4182 iommu->reg + DMAR_FEADDR_REG);
4183 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4184 iommu->reg + DMAR_FEUADDR_REG);
4185
4186 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4187 }
4188
4189 for_each_active_iommu(iommu, drhd)
4190 kfree(iommu->iommu_state);
4191}
4192
4193static struct syscore_ops iommu_syscore_ops = {
4194 .resume = iommu_resume,
4195 .suspend = iommu_suspend,
4196};
4197
4198static void __init init_iommu_pm_ops(void)
4199{
4200 register_syscore_ops(&iommu_syscore_ops);
4201}
4202
4203#else
4204static inline void init_iommu_pm_ops(void) {}
4205#endif /* CONFIG_SUSPEND */
4206
4207static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4208{
4209 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4210 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4211 rmrr->end_address <= rmrr->base_address ||
4212 arch_rmrr_sanity_check(rmrr))
4213 return -EINVAL;
4214
4215 return 0;
4216}
4217
4218int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4219{
4220 struct acpi_dmar_reserved_memory *rmrr;
4221 struct dmar_rmrr_unit *rmrru;
4222
4223 rmrr = (struct acpi_dmar_reserved_memory *)header;
4224 if (rmrr_sanity_check(rmrr)) {
4225 pr_warn(FW_BUG
4226 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4227 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4228 rmrr->base_address, rmrr->end_address,
4229 dmi_get_system_info(DMI_BIOS_VENDOR),
4230 dmi_get_system_info(DMI_BIOS_VERSION),
4231 dmi_get_system_info(DMI_PRODUCT_VERSION));
4232 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4233 }
4234
4235 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4236 if (!rmrru)
4237 goto out;
4238
4239 rmrru->hdr = header;
4240
4241 rmrru->base_address = rmrr->base_address;
4242 rmrru->end_address = rmrr->end_address;
4243
4244 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4245 ((void *)rmrr) + rmrr->header.length,
4246 &rmrru->devices_cnt);
4247 if (rmrru->devices_cnt && rmrru->devices == NULL)
4248 goto free_rmrru;
4249
4250 list_add(&rmrru->list, &dmar_rmrr_units);
4251
4252 return 0;
4253free_rmrru:
4254 kfree(rmrru);
4255out:
4256 return -ENOMEM;
4257}
4258
4259static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4260{
4261 struct dmar_atsr_unit *atsru;
4262 struct acpi_dmar_atsr *tmp;
4263
4264 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
4265 dmar_rcu_check()) {
4266 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4267 if (atsr->segment != tmp->segment)
4268 continue;
4269 if (atsr->header.length != tmp->header.length)
4270 continue;
4271 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4272 return atsru;
4273 }
4274
4275 return NULL;
4276}
4277
4278int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4279{
4280 struct acpi_dmar_atsr *atsr;
4281 struct dmar_atsr_unit *atsru;
4282
4283 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4284 return 0;
4285
4286 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4287 atsru = dmar_find_atsr(atsr);
4288 if (atsru)
4289 return 0;
4290
4291 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4292 if (!atsru)
4293 return -ENOMEM;
4294
4295 /*
4296 * If memory is allocated from slab by ACPI _DSM method, we need to
4297 * copy the memory content because the memory buffer will be freed
4298 * on return.
4299 */
4300 atsru->hdr = (void *)(atsru + 1);
4301 memcpy(atsru->hdr, hdr, hdr->length);
4302 atsru->include_all = atsr->flags & 0x1;
4303 if (!atsru->include_all) {
4304 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4305 (void *)atsr + atsr->header.length,
4306 &atsru->devices_cnt);
4307 if (atsru->devices_cnt && atsru->devices == NULL) {
4308 kfree(atsru);
4309 return -ENOMEM;
4310 }
4311 }
4312
4313 list_add_rcu(&atsru->list, &dmar_atsr_units);
4314
4315 return 0;
4316}
4317
4318static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4319{
4320 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4321 kfree(atsru);
4322}
4323
4324int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325{
4326 struct acpi_dmar_atsr *atsr;
4327 struct dmar_atsr_unit *atsru;
4328
4329 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4330 atsru = dmar_find_atsr(atsr);
4331 if (atsru) {
4332 list_del_rcu(&atsru->list);
4333 synchronize_rcu();
4334 intel_iommu_free_atsr(atsru);
4335 }
4336
4337 return 0;
4338}
4339
4340int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4341{
4342 int i;
4343 struct device *dev;
4344 struct acpi_dmar_atsr *atsr;
4345 struct dmar_atsr_unit *atsru;
4346
4347 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4348 atsru = dmar_find_atsr(atsr);
4349 if (!atsru)
4350 return 0;
4351
4352 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4353 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4354 i, dev)
4355 return -EBUSY;
4356 }
4357
4358 return 0;
4359}
4360
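/*
 * Bring up a hot-added DMAR unit: check its capabilities against the
 * running configuration, initialize domains, the root entry,
 * invalidation and the fault interrupt, then enable translation.
 */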
4361static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4362{
4363 int sp, ret;
4364 struct intel_iommu *iommu = dmaru->iommu;
4365
4366 if (g_iommus[iommu->seq_id])
4367 return 0;
4368
4369 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4370 pr_warn("%s: Doesn't support hardware pass through.\n",
4371 iommu->name);
4372 return -ENXIO;
4373 }
4374 if (!ecap_sc_support(iommu->ecap) &&
4375 domain_update_iommu_snooping(iommu)) {
4376 pr_warn("%s: Doesn't support snooping.\n",
4377 iommu->name);
4378 return -ENXIO;
4379 }
4380 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4381 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4382 pr_warn("%s: Doesn't support large page.\n",
4383 iommu->name);
4384 return -ENXIO;
4385 }
4386
4387 /*
4388 * Disable translation if already enabled prior to OS handover.
4389 */
4390 if (iommu->gcmd & DMA_GCMD_TE)
4391 iommu_disable_translation(iommu);
4392
4393 g_iommus[iommu->seq_id] = iommu;
4394 ret = iommu_init_domains(iommu);
4395 if (ret == 0)
4396 ret = iommu_alloc_root_entry(iommu);
4397 if (ret)
4398 goto out;
4399
4400 intel_svm_check(iommu);
4401
4402 if (dmaru->ignored) {
4403 /*
4404 * we always have to disable PMRs or DMA may fail on this device
4405 */
4406 if (force_on)
4407 iommu_disable_protect_mem_regions(iommu);
4408 return 0;
4409 }
4410
4411 intel_iommu_init_qi(iommu);
4412 iommu_flush_write_buffer(iommu);
4413
4414#ifdef CONFIG_INTEL_IOMMU_SVM
4415 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4416 ret = intel_svm_enable_prq(iommu);
4417 if (ret)
4418 goto disable_iommu;
4419 }
4420#endif
4421 ret = dmar_set_interrupt(iommu);
4422 if (ret)
4423 goto disable_iommu;
4424
4425 iommu_set_root_entry(iommu);
4426 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4427 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4428 iommu_enable_translation(iommu);
4429
4430 iommu_disable_protect_mem_regions(iommu);
4431 return 0;
4432
4433disable_iommu:
4434 disable_dmar_iommu(iommu);
4435out:
4436 free_dmar_iommu(iommu);
4437 return ret;
4438}
4439
4440int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4441{
4442 int ret = 0;
4443 struct intel_iommu *iommu = dmaru->iommu;
4444
4445 if (!intel_iommu_enabled)
4446 return 0;
4447 if (iommu == NULL)
4448 return -EINVAL;
4449
4450 if (insert) {
4451 ret = intel_iommu_add(dmaru);
4452 } else {
4453 disable_dmar_iommu(iommu);
4454 free_dmar_iommu(iommu);
4455 }
4456
4457 return ret;
4458}
4459
4460static void intel_iommu_free_dmars(void)
4461{
4462 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4463 struct dmar_atsr_unit *atsru, *atsr_n;
4464
4465 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4466 list_del(&rmrru->list);
4467 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4468 kfree(rmrru);
4469 }
4470
4471 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4472 list_del(&atsru->list);
4473 intel_iommu_free_atsr(atsru);
4474 }
4475}
4476
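/*
 * Return 1 if ATS is allowed for @dev: the device is either
 * integrated (no parent bridge) or its root port is covered by an
 * ATSR unit; return 0 otherwise.
 */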
4477int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4478{
4479 int i, ret = 1;
4480 struct pci_bus *bus;
4481 struct pci_dev *bridge = NULL;
4482 struct device *tmp;
4483 struct acpi_dmar_atsr *atsr;
4484 struct dmar_atsr_unit *atsru;
4485
4486 dev = pci_physfn(dev);
4487 for (bus = dev->bus; bus; bus = bus->parent) {
4488 bridge = bus->self;
4489 /* If it's an integrated device, allow ATS */
4490 if (!bridge)
4491 return 1;
4492 /* Connected via non-PCIe: no ATS */
4493 if (!pci_is_pcie(bridge) ||
4494 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4495 return 0;
4496 /* If we found the root port, look it up in the ATSR */
4497 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4498 break;
4499 }
4500
4501 rcu_read_lock();
4502 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4503 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4504 if (atsr->segment != pci_domain_nr(dev->bus))
4505 continue;
4506
4507 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4508 if (tmp == &bridge->dev)
4509 goto out;
4510
4511 if (atsru->include_all)
4512 goto out;
4513 }
4514 ret = 0;
4515out:
4516 rcu_read_unlock();
4517
4518 return ret;
4519}
4520
4521int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4522{
4523 int ret;
4524 struct dmar_rmrr_unit *rmrru;
4525 struct dmar_atsr_unit *atsru;
4526 struct acpi_dmar_atsr *atsr;
4527 struct acpi_dmar_reserved_memory *rmrr;
4528
4529 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4530 return 0;
4531
4532 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4533 rmrr = container_of(rmrru->hdr,
4534 struct acpi_dmar_reserved_memory, header);
4535 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4536 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4537 ((void *)rmrr) + rmrr->header.length,
4538 rmrr->segment, rmrru->devices,
4539 rmrru->devices_cnt);
4540 if (ret < 0)
4541 return ret;
4542 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4543 dmar_remove_dev_scope(info, rmrr->segment,
4544 rmrru->devices, rmrru->devices_cnt);
4545 }
4546 }
4547
4548 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4549 if (atsru->include_all)
4550 continue;
4551
4552 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4553 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4554 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4555 (void *)atsr + atsr->header.length,
4556 atsr->segment, atsru->devices,
4557 atsru->devices_cnt);
4558 if (ret > 0)
4559 break;
4560 else if (ret < 0)
4561 return ret;
4562 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4563 if (dmar_remove_dev_scope(info, atsr->segment,
4564 atsru->devices, atsru->devices_cnt))
4565 break;
4566 }
4567 }
4568
4569 return 0;
4570}
4571
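/*
 * Keep the static identity domain in sync with memory hotplug: map
 * ranges that are going online and unmap (and flush) ranges that go
 * offline.
 */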
4572static int intel_iommu_memory_notifier(struct notifier_block *nb,
4573 unsigned long val, void *v)
4574{
4575 struct memory_notify *mhp = v;
4576 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4577 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
4578 mhp->nr_pages - 1);
4579
4580 switch (val) {
4581 case MEM_GOING_ONLINE:
4582 if (iommu_domain_identity_map(si_domain,
4583 start_vpfn, last_vpfn)) {
4584 pr_warn("Failed to build identity map for [%lx-%lx]\n",
4585 start_vpfn, last_vpfn);
4586 return NOTIFY_BAD;
4587 }
4588 break;
4589
4590 case MEM_OFFLINE:
4591 case MEM_CANCEL_ONLINE:
4592 {
4593 struct dmar_drhd_unit *drhd;
4594 struct intel_iommu *iommu;
4595 struct page *freelist;
4596
4597 freelist = domain_unmap(si_domain,
4598 start_vpfn, last_vpfn);
4599
4600 rcu_read_lock();
4601 for_each_active_iommu(iommu, drhd)
4602 iommu_flush_iotlb_psi(iommu, si_domain,
4603 start_vpfn, mhp->nr_pages,
4604 !freelist, 0);
4605 rcu_read_unlock();
4606 dma_free_pagelist(freelist);
4607 }
4608 break;
4609 }
4610
4611 return NOTIFY_OK;
4612}
4613
4614static struct notifier_block intel_iommu_memory_nb = {
4615 .notifier_call = intel_iommu_memory_notifier,
4616 .priority = 0
4617};
4618
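/*
 * Release the per-CPU IOVA caches of every DMA domain on every IOMMU
 * for @cpu, called when that CPU goes offline.
 */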
4619static void free_all_cpu_cached_iovas(unsigned int cpu)
4620{
4621 int i;
4622
4623 for (i = 0; i < g_num_of_iommus; i++) {
4624 struct intel_iommu *iommu = g_iommus[i];
4625 struct dmar_domain *domain;
4626 int did;
4627
4628 if (!iommu)
4629 continue;
4630
4631 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4632 domain = get_iommu_domain(iommu, (u16)did);
4633
4634 if (!domain || domain->domain.type != IOMMU_DOMAIN_DMA)
4635 continue;
4636
4637 free_cpu_cached_iovas(cpu, &domain->iovad);
4638 }
4639 }
4640}
4641
4642static int intel_iommu_cpu_dead(unsigned int cpu)
4643{
4644 free_all_cpu_cached_iovas(cpu);
4645 return 0;
4646}
4647
4648static void intel_disable_iommus(void)
4649{
4650 struct intel_iommu *iommu = NULL;
4651 struct dmar_drhd_unit *drhd;
4652
4653 for_each_iommu(iommu, drhd)
4654 iommu_disable_translation(iommu);
4655}
4656
4657void intel_iommu_shutdown(void)
4658{
4659 struct dmar_drhd_unit *drhd;
4660 struct intel_iommu *iommu = NULL;
4661
4662 if (no_iommu || dmar_disabled)
4663 return;
4664
4665 down_write(&dmar_global_lock);
4666
4667 /* Disable PMRs explicitly here. */
4668 for_each_iommu(iommu, drhd)
4669 iommu_disable_protect_mem_regions(iommu);
4670
4671 /* Make sure the IOMMUs are switched off */
4672 intel_disable_iommus();
4673
4674 up_write(&dmar_global_lock);
4675}
4676
4677static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4678{
4679 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4680
4681 return container_of(iommu_dev, struct intel_iommu, iommu);
4682}
4683
4684static ssize_t intel_iommu_show_version(struct device *dev,
4685 struct device_attribute *attr,
4686 char *buf)
4687{
4688 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4689 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4690 return sprintf(buf, "%d:%d\n",
4691 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4692}
4693static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4694
4695static ssize_t intel_iommu_show_address(struct device *dev,
4696 struct device_attribute *attr,
4697 char *buf)
4698{
4699 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4700 return sprintf(buf, "%llx\n", iommu->reg_phys);
4701}
4702static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4703
4704static ssize_t intel_iommu_show_cap(struct device *dev,
4705 struct device_attribute *attr,
4706 char *buf)
4707{
4708 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4709 return sprintf(buf, "%llx\n", iommu->cap);
4710}
4711static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4712
4713static ssize_t intel_iommu_show_ecap(struct device *dev,
4714 struct device_attribute *attr,
4715 char *buf)
4716{
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%llx\n", iommu->ecap);
4719}
4720static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4721
4722static ssize_t intel_iommu_show_ndoms(struct device *dev,
4723 struct device_attribute *attr,
4724 char *buf)
4725{
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4728}
4729static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4730
4731static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4732 struct device_attribute *attr,
4733 char *buf)
4734{
4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4737 cap_ndoms(iommu->cap)));
4738}
4739static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4740
4741static struct attribute *intel_iommu_attrs[] = {
4742 &dev_attr_version.attr,
4743 &dev_attr_address.attr,
4744 &dev_attr_cap.attr,
4745 &dev_attr_ecap.attr,
4746 &dev_attr_domains_supported.attr,
4747 &dev_attr_domains_used.attr,
4748 NULL,
4749};
4750
4751static struct attribute_group intel_iommu_group = {
4752 .name = "intel-iommu",
4753 .attrs = intel_iommu_attrs,
4754};
4755
4756const struct attribute_group *intel_iommu_groups[] = {
4757 &intel_iommu_group,
4758 NULL,
4759};
4760
4761static inline bool has_external_pci(void)
4762{
4763 struct pci_dev *pdev = NULL;
4764
4765 for_each_pci_dev(pdev)
4766 if (pdev->external_facing)
4767 return true;
4768
4769 return false;
4770}
4771
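/*
 * If the firmware opted in to DMA protection and an external-facing
 * PCI device is present, force the IOMMU on even when it was disabled
 * on the command line.
 */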
4772static int __init platform_optin_force_iommu(void)
4773{
4774 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
4775 return 0;
4776
4777 if (no_iommu || dmar_disabled)
4778 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4779
4780 /*
4781 * If Intel-IOMMU is disabled by default, we will apply identity
4782 * map for all devices except those marked as being untrusted.
4783 */
4784 if (dmar_disabled)
4785 iommu_set_default_passthrough(false);
4786
4787 dmar_disabled = 0;
4788 no_iommu = 0;
4789
4790 return 1;
4791}
4792
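/*
 * Probe the ACPI namespace devices listed in the DRHD device scopes
 * so that their physical companion devices are attached to the IOMMU
 * as well.
 */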
4793static int __init probe_acpi_namespace_devices(void)
4794{
4795 struct dmar_drhd_unit *drhd;
4796 /* To avoid a -Wunused-but-set-variable warning. */
4797 struct intel_iommu *iommu __maybe_unused;
4798 struct device *dev;
4799 int i, ret = 0;
4800
4801 for_each_active_iommu(iommu, drhd) {
4802 for_each_active_dev_scope(drhd->devices,
4803 drhd->devices_cnt, i, dev) {
4804 struct acpi_device_physical_node *pn;
4805 struct iommu_group *group;
4806 struct acpi_device *adev;
4807
4808 if (dev->bus != &acpi_bus_type)
4809 continue;
4810
4811 adev = to_acpi_device(dev);
4812 mutex_lock(&adev->physical_node_lock);
4813 list_for_each_entry(pn,
4814 &adev->physical_node_list, node) {
4815 group = iommu_group_get(pn->dev);
4816 if (group) {
4817 iommu_group_put(group);
4818 continue;
4819 }
4820
4821 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4822 ret = iommu_probe_device(pn->dev);
4823 if (ret)
4824 break;
4825 }
4826 mutex_unlock(&adev->physical_node_lock);
4827
4828 if (ret)
4829 return ret;
4830 }
4831 }
4832
4833 return 0;
4834}
4835
4836int __init intel_iommu_init(void)
4837{
4838 int ret = -ENODEV;
4839 struct dmar_drhd_unit *drhd;
4840 struct intel_iommu *iommu;
4841
4842 /*
4843 * Intel IOMMU is required for a TXT/tboot launch or platform
4844 * opt in, so enforce that.
4845 */
4846 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4847
4848 if (iommu_init_mempool()) {
4849 if (force_on)
4850 panic("tboot: Failed to initialize iommu memory\n");
4851 return -ENOMEM;
4852 }
4853
4854 down_write(&dmar_global_lock);
4855 if (dmar_table_init()) {
4856 if (force_on)
4857 panic("tboot: Failed to initialize DMAR table\n");
4858 goto out_free_dmar;
4859 }
4860
4861 if (dmar_dev_scope_init() < 0) {
4862 if (force_on)
4863 panic("tboot: Failed to initialize DMAR device scope\n");
4864 goto out_free_dmar;
4865 }
4866
4867 up_write(&dmar_global_lock);
4868
4869 /*
4870 * The bus notifier takes the dmar_global_lock, so lockdep will
4871 * complain later when we register it under the lock.
4872 */
4873 dmar_register_bus_notifier();
4874
4875 down_write(&dmar_global_lock);
4876
4877 if (!no_iommu)
4878 intel_iommu_debugfs_init();
4879
4880 if (no_iommu || dmar_disabled) {
4881 /*
4882 * We exit the function here to ensure IOMMU's remapping and
4883 * mempool aren't setup, which means that the IOMMU's PMRs
4884 * won't be disabled via the call to init_dmars(). So disable
4885 * it explicitly here. The PMRs were setup by tboot prior to
4886 * calling SENTER, but the kernel is expected to reset/tear
4887 * down the PMRs.
4888 */
4889 if (intel_iommu_tboot_noforce) {
4890 for_each_iommu(iommu, drhd)
4891 iommu_disable_protect_mem_regions(iommu);
4892 }
4893
4894 /*
4895 * Make sure the IOMMUs are switched off, even when we
4896 * boot into a kexec kernel and the previous kernel left
4897 * them enabled
4898 */
4899 intel_disable_iommus();
4900 goto out_free_dmar;
4901 }
4902
4903 if (list_empty(&dmar_rmrr_units))
4904 pr_info("No RMRR found\n");
4905
4906 if (list_empty(&dmar_atsr_units))
4907 pr_info("No ATSR found\n");
4908
4909 if (dmar_init_reserved_ranges()) {
4910 if (force_on)
4911 panic("tboot: Failed to reserve iommu ranges\n");
4912 goto out_free_reserved_range;
4913 }
4914
4915 if (dmar_map_gfx)
4916 intel_iommu_gfx_mapped = 1;
4917
4918 init_no_remapping_devices();
4919
4920 ret = init_dmars();
4921 if (ret) {
4922 if (force_on)
4923 panic("tboot: Failed to initialize DMARs\n");
4924 pr_err("Initialization failed\n");
4925 goto out_free_reserved_range;
4926 }
4927 up_write(&dmar_global_lock);
4928
4929 init_iommu_pm_ops();
4930
4931 down_read(&dmar_global_lock);
4932 for_each_active_iommu(iommu, drhd) {
4933 iommu_device_sysfs_add(&iommu->iommu, NULL,
4934 intel_iommu_groups,
4935 "%s", iommu->name);
4936 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4937 iommu_device_register(&iommu->iommu);
4938 }
4939 up_read(&dmar_global_lock);
4940
4941 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4942 if (si_domain && !hw_pass_through)
4943 register_memory_notifier(&intel_iommu_memory_nb);
4944 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4945 intel_iommu_cpu_dead);
4946
4947 down_read(&dmar_global_lock);
4948 if (probe_acpi_namespace_devices())
4949 pr_warn("ACPI name space devices didn't probe correctly\n");
4950
4951 /* Finally, we enable the DMA remapping hardware. */
4952 for_each_iommu(iommu, drhd) {
4953 if (!drhd->ignored && !translation_pre_enabled(iommu))
4954 iommu_enable_translation(iommu);
4955
4956 iommu_disable_protect_mem_regions(iommu);
4957 }
4958 up_read(&dmar_global_lock);
4959
4960 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4961
4962 intel_iommu_enabled = 1;
4963
4964 return 0;
4965
4966out_free_reserved_range:
4967 put_iova_domain(&reserved_iova_list);
4968out_free_dmar:
4969 intel_iommu_free_dmars();
4970 up_write(&dmar_global_lock);
4971 iommu_exit_mempool();
4972 return ret;
4973}
4974
4975static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4976{
4977 struct intel_iommu *iommu = opaque;
4978
4979 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4980 return 0;
4981}
4982
4983/*
4984 * NB - intel-iommu lacks any sort of reference counting for the users of
4985 * dependent devices. If multiple endpoints have intersecting dependent
4986 * devices, unbinding the driver from any one of them will possibly leave
4987 * the others unable to operate.
4988 */
4989static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4990{
4991 if (!iommu || !dev || !dev_is_pci(dev))
4992 return;
4993
4994 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4995}
4996
4997static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4998{
4999 struct dmar_domain *domain;
5000 struct intel_iommu *iommu;
5001 unsigned long flags;
5002
5003 assert_spin_locked(&device_domain_lock);
5004
5005 if (WARN_ON(!info))
5006 return;
5007
5008 iommu = info->iommu;
5009 domain = info->domain;
5010
5011 if (info->dev) {
5012 if (dev_is_pci(info->dev) && sm_supported(iommu))
5013 intel_pasid_tear_down_entry(iommu, info->dev,
5014 PASID_RID2PASID, false);
5015
5016 iommu_disable_dev_iotlb(info);
5017 if (!dev_is_real_dma_subdevice(info->dev))
5018 domain_context_clear(iommu, info->dev);
5019 intel_pasid_free_table(info->dev);
5020 }
5021
5022 unlink_domain_info(info);
5023
5024 spin_lock_irqsave(&iommu->lock, flags);
5025 domain_detach_iommu(domain, iommu);
5026 spin_unlock_irqrestore(&iommu->lock, flags);
5027
5028 free_devinfo_mem(info);
5029}
5030
5031static void dmar_remove_one_dev_info(struct device *dev)
5032{
5033 struct device_domain_info *info;
5034 unsigned long flags;
5035
5036 spin_lock_irqsave(&device_domain_lock, flags);
5037 info = get_domain_info(dev);
5038 if (info)
5039 __dmar_remove_one_dev_info(info);
5040 spin_unlock_irqrestore(&device_domain_lock, flags);
5041}
5042
5043static int md_domain_init(struct dmar_domain *domain, int guest_width)
5044{
5045 int adjust_width;
5046
5047 /* calculate AGAW */
5048 domain->gaw = guest_width;
5049 adjust_width = guestwidth_to_adjustwidth(guest_width);
5050 domain->agaw = width_to_agaw(adjust_width);
5051
5052 domain->iommu_coherency = 0;
5053 domain->iommu_snooping = 0;
5054 domain->iommu_superpage = 0;
5055 domain->max_addr = 0;
5056
5057 /* always allocate the top pgd */
5058 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5059 if (!domain->pgd)
5060 return -ENOMEM;
5061 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5062 return 0;
5063}
5064
5065static void intel_init_iova_domain(struct dmar_domain *dmar_domain)
5066{
5067 init_iova_domain(&dmar_domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5068 copy_reserved_iova(&reserved_iova_list, &dmar_domain->iovad);
5069
5070 if (!intel_iommu_strict &&
5071 init_iova_flush_queue(&dmar_domain->iovad,
5072 iommu_flush_iova, iova_entry_free))
5073 pr_info("iova flush queue initialization failed\n");
5074}
5075
5076static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5077{
5078 struct dmar_domain *dmar_domain;
5079 struct iommu_domain *domain;
5080
5081 switch (type) {
5082 case IOMMU_DOMAIN_DMA:
5083 case IOMMU_DOMAIN_UNMANAGED:
5084 dmar_domain = alloc_domain(0);
5085 if (!dmar_domain) {
5086 pr_err("Can't allocate dmar_domain\n");
5087 return NULL;
5088 }
5089 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5090 pr_err("Domain initialization failed\n");
5091 domain_exit(dmar_domain);
5092 return NULL;
5093 }
5094
5095 if (type == IOMMU_DOMAIN_DMA)
5096 intel_init_iova_domain(dmar_domain);
5097
5098 domain_update_iommu_cap(dmar_domain);
5099
5100 domain = &dmar_domain->domain;
5101 domain->geometry.aperture_start = 0;
5102 domain->geometry.aperture_end =
5103 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5104 domain->geometry.force_aperture = true;
5105
5106 return domain;
5107 case IOMMU_DOMAIN_IDENTITY:
5108 return &si_domain->domain;
5109 default:
5110 return NULL;
5111 }
5112
5113 return NULL;
5114}
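/*
 * Note on the aperture set up above (summary, not new behaviour): at
 * allocation time the geometry spans every address the freshly allocated
 * page tables can express, i.e. [0, __DOMAIN_MAX_ADDR(dmar_domain->gaw)].
 * If the domain is later attached to an IOMMU with a narrower MGAW,
 * prepare_domain_attach_device() shrinks dmar_domain->gaw accordingly.
 */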
5115
5116static void intel_iommu_domain_free(struct iommu_domain *domain)
5117{
5118 if (domain != &si_domain->domain)
5119 domain_exit(to_dmar_domain(domain));
5120}
5121
5122/*
5123 * Check whether a @domain could be attached to the @dev through the
5124 * aux-domain attach/detach APIs.
5125 */
5126static inline bool
5127is_aux_domain(struct device *dev, struct iommu_domain *domain)
5128{
5129 struct device_domain_info *info = get_domain_info(dev);
5130
5131 return info && info->auxd_enabled &&
5132 domain->type == IOMMU_DOMAIN_UNMANAGED;
5133}
5134
5135static void auxiliary_link_device(struct dmar_domain *domain,
5136 struct device *dev)
5137{
5138 struct device_domain_info *info = get_domain_info(dev);
5139
5140 assert_spin_locked(&device_domain_lock);
5141 if (WARN_ON(!info))
5142 return;
5143
5144 domain->auxd_refcnt++;
5145 list_add(&domain->auxd, &info->auxiliary_domains);
5146}
5147
5148static void auxiliary_unlink_device(struct dmar_domain *domain,
5149 struct device *dev)
5150{
5151 struct device_domain_info *info = get_domain_info(dev);
5152
5153 assert_spin_locked(&device_domain_lock);
5154 if (WARN_ON(!info))
5155 return;
5156
5157 list_del(&domain->auxd);
5158 domain->auxd_refcnt--;
5159
5160 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5161 ioasid_free(domain->default_pasid);
5162}
5163
5164static int aux_domain_add_dev(struct dmar_domain *domain,
5165 struct device *dev)
5166{
5167 int ret;
5168 unsigned long flags;
5169 struct intel_iommu *iommu;
5170
5171 iommu = device_to_iommu(dev, NULL, NULL);
5172 if (!iommu)
5173 return -ENODEV;
5174
5175 if (domain->default_pasid <= 0) {
5176 int pasid;
5177
5178 /* No private data needed for the default pasid */
5179 pasid = ioasid_alloc(NULL, PASID_MIN,
5180 pci_max_pasids(to_pci_dev(dev)) - 1,
5181 NULL);
5182 if (pasid == INVALID_IOASID) {
5183 pr_err("Can't allocate default pasid\n");
5184 return -ENODEV;
5185 }
5186 domain->default_pasid = pasid;
5187 }
5188
5189 spin_lock_irqsave(&device_domain_lock, flags);
5190 /*
5191 * iommu->lock must be held to attach the domain to the iommu and to set
5192 * up the PASID entry for second-level translation.
5193 */
5194 spin_lock(&iommu->lock);
5195 ret = domain_attach_iommu(domain, iommu);
5196 if (ret)
5197 goto attach_failed;
5198
5199 /* Set up the PASID entry for mediated devices: */
5200 if (domain_use_first_level(domain))
5201 ret = domain_setup_first_level(iommu, domain, dev,
5202 domain->default_pasid);
5203 else
5204 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5205 domain->default_pasid);
5206 if (ret)
5207 goto table_failed;
5208 spin_unlock(&iommu->lock);
5209
5210 auxiliary_link_device(domain, dev);
5211
5212 spin_unlock_irqrestore(&device_domain_lock, flags);
5213
5214 return 0;
5215
5216table_failed:
5217 domain_detach_iommu(domain, iommu);
5218attach_failed:
5219 spin_unlock(&iommu->lock);
5220 spin_unlock_irqrestore(&device_domain_lock, flags);
5221 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5222 ioasid_free(domain->default_pasid);
5223
5224 return ret;
5225}
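/*
 * Sketch of the aux-attach sequence above (descriptive only):
 *   1. Allocate a per-domain default PASID on first use (ioasid_alloc()).
 *   2. Under device_domain_lock and iommu->lock, attach the domain to the
 *      IOMMU and program the PASID table entry - first-level translation
 *      when domain_use_first_level(), second-level otherwise.
 *   3. Link the domain into the device's auxiliary_domains list.
 * On error the default PASID is freed again once the last auxd reference
 * is dropped.
 */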
5226
5227static void aux_domain_remove_dev(struct dmar_domain *domain,
5228 struct device *dev)
5229{
5230 struct device_domain_info *info;
5231 struct intel_iommu *iommu;
5232 unsigned long flags;
5233
5234 if (!is_aux_domain(dev, &domain->domain))
5235 return;
5236
5237 spin_lock_irqsave(&device_domain_lock, flags);
5238 info = get_domain_info(dev);
5239 iommu = info->iommu;
5240
5241 auxiliary_unlink_device(domain, dev);
5242
5243 spin_lock(&iommu->lock);
5244 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid, false);
5245 domain_detach_iommu(domain, iommu);
5246 spin_unlock(&iommu->lock);
5247
5248 spin_unlock_irqrestore(&device_domain_lock, flags);
5249}
5250
5251static int prepare_domain_attach_device(struct iommu_domain *domain,
5252 struct device *dev)
5253{
5254 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5255 struct intel_iommu *iommu;
5256 int addr_width;
5257
5258 iommu = device_to_iommu(dev, NULL, NULL);
5259 if (!iommu)
5260 return -ENODEV;
5261
5262 /* check if this iommu agaw is sufficient for max mapped address */
5263 addr_width = agaw_to_width(iommu->agaw);
5264 if (addr_width > cap_mgaw(iommu->cap))
5265 addr_width = cap_mgaw(iommu->cap);
5266
5267 if (dmar_domain->max_addr > (1LL << addr_width)) {
5268 dev_err(dev, "%s: iommu width (%d) is not "
5269 "sufficient for the mapped address (%llx)\n",
5270 __func__, addr_width, dmar_domain->max_addr);
5271 return -EFAULT;
5272 }
5273 dmar_domain->gaw = addr_width;
5274
5275 /*
5276 * Knock out extra levels of page tables if necessary
5277 */
5278 while (iommu->agaw < dmar_domain->agaw) {
5279 struct dma_pte *pte;
5280
5281 pte = dmar_domain->pgd;
5282 if (dma_pte_present(pte)) {
5283 dmar_domain->pgd = (struct dma_pte *)
5284 phys_to_virt(dma_pte_addr(pte));
5285 free_pgtable_page(pte);
5286 }
5287 dmar_domain->agaw--;
5288 }
5289
5290 return 0;
5291}
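/*
 * Example of the "knock out extra levels" loop above (illustrative only):
 * if the domain was built with more page-table levels than this IOMMU can
 * walk (dmar_domain->agaw > iommu->agaw), each iteration promotes the table
 * referenced by the first top-level entry to become the new pgd, frees the
 * old top-level page, and decrements agaw - dropping one translation level
 * per pass until the widths match.
 */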
5292
5293static int intel_iommu_attach_device(struct iommu_domain *domain,
5294 struct device *dev)
5295{
5296 int ret;
5297
5298 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5299 device_is_rmrr_locked(dev)) {
5300 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5301 return -EPERM;
5302 }
5303
5304 if (is_aux_domain(dev, domain))
5305 return -EPERM;
5306
5307 /* Normally the device is not yet mapped; if it is, tear down the old mapping first. */
5308 if (unlikely(domain_context_mapped(dev))) {
5309 struct dmar_domain *old_domain;
5310
5311 old_domain = find_domain(dev);
5312 if (old_domain)
5313 dmar_remove_one_dev_info(dev);
5314 }
5315
5316 ret = prepare_domain_attach_device(domain, dev);
5317 if (ret)
5318 return ret;
5319
5320 return domain_add_dev_info(to_dmar_domain(domain), dev);
5321}
5322
5323static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5324 struct device *dev)
5325{
5326 int ret;
5327
5328 if (!is_aux_domain(dev, domain))
5329 return -EPERM;
5330
5331 ret = prepare_domain_attach_device(domain, dev);
5332 if (ret)
5333 return ret;
5334
5335 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5336}
5337
5338static void intel_iommu_detach_device(struct iommu_domain *domain,
5339 struct device *dev)
5340{
5341 dmar_remove_one_dev_info(dev);
5342}
5343
5344static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5345 struct device *dev)
5346{
5347 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5348}
5349
5350/*
5351 * 2D array for converting and sanitizing IOMMU generic TLB granularity to
5352 * VT-d granularity. Invalidation is typically included in the unmap operation
5353 * as a result of DMA or VFIO unmap. However, for assigned devices guest
5354 * owns the first level page tables. Invalidations of translation caches in the
5355 * guest are trapped and passed down to the host.
5356 *
5357 * vIOMMU in the guest only exposes first-level page tables, therefore
5358 * we do not support IOTLB granularity for requests without PASID (second level).
5359 *
5360 * For example, to find the VT-d granularity encoding for IOTLB
5361 * type and page selective granularity within PASID:
5362 * X: indexed by iommu cache type
5363 * Y: indexed by enum iommu_inv_granularity
5364 * [IOMMU_CACHE_INV_TYPE_IOTLB][IOMMU_INV_GRANU_ADDR]
5365 */
5366
5367static const int
5368inv_type_granu_table[IOMMU_CACHE_INV_TYPE_NR][IOMMU_INV_GRANU_NR] = {
5369 /*
5370 * PASID based IOTLB invalidation: PASID selective (per PASID),
5371 * page selective (address granularity)
5372 */
5373 {-EINVAL, QI_GRAN_NONG_PASID, QI_GRAN_PSI_PASID},
5374 /* PASID based dev TLBs */
5375 {-EINVAL, -EINVAL, QI_DEV_IOTLB_GRAN_PASID_SEL},
5376 /* PASID cache */
5377 {-EINVAL, -EINVAL, -EINVAL}
5378};
5379
5380static inline int to_vtd_granularity(int type, int granu)
5381{
5382 return inv_type_granu_table[type][granu];
5383}
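/*
 * Example lookups (illustrative only): for cache_type bit 0 (IOTLB) with
 * IOMMU_INV_GRANU_ADDR the table yields QI_GRAN_PSI_PASID (page selective
 * within PASID), while the same cache with IOMMU_INV_GRANU_DOMAIN yields
 * -EINVAL and intel_iommu_sva_invalidate() skips that combination with a
 * rate-limited error.
 */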
5384
5385static inline u64 to_vtd_size(u64 granu_size, u64 nr_granules)
5386{
5387 u64 nr_pages = (granu_size * nr_granules) >> VTD_PAGE_SHIFT;
5388
5389 /* VT-d size is encoded as 2^size of 4K pages: 0 for 4K, 9 for 2MB, etc.
5390 * The IOMMU cache invalidate API passes granu_size in bytes and the number
5391 * of contiguous granules of that size (nr_granules).
5392 */
5393 return order_base_2(nr_pages);
5394}
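/*
 * Worked example (illustrative, assuming VTD_PAGE_SHIFT == 12): a single
 * 2MiB granule gives nr_pages = SZ_2M >> 12 = 512, so the function returns
 * order_base_2(512) = 9, i.e. the QI descriptor addresses 2^9 contiguous
 * 4KiB pages. Eight 4KiB granules give nr_pages = 8 and an order of 3.
 */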
5395
5396#ifdef CONFIG_INTEL_IOMMU_SVM
5397static int
5398intel_iommu_sva_invalidate(struct iommu_domain *domain, struct device *dev,
5399 struct iommu_cache_invalidate_info *inv_info)
5400{
5401 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5402 struct device_domain_info *info;
5403 struct intel_iommu *iommu;
5404 unsigned long flags;
5405 int cache_type;
5406 u8 bus, devfn;
5407 u16 did, sid;
5408 int ret = 0;
5409 u64 size = 0;
5410
5411 if (!inv_info || !dmar_domain ||
5412 inv_info->version != IOMMU_CACHE_INVALIDATE_INFO_VERSION_1)
5413 return -EINVAL;
5414
5415 if (!dev || !dev_is_pci(dev))
5416 return -ENODEV;
5417
5418 iommu = device_to_iommu(dev, &bus, &devfn);
5419 if (!iommu)
5420 return -ENODEV;
5421
5422 if (!(dmar_domain->flags & DOMAIN_FLAG_NESTING_MODE))
5423 return -EINVAL;
5424
5425 spin_lock_irqsave(&device_domain_lock, flags);
5426 spin_lock(&iommu->lock);
5427 info = get_domain_info(dev);
5428 if (!info) {
5429 ret = -EINVAL;
5430 goto out_unlock;
5431 }
5432 did = dmar_domain->iommu_did[iommu->seq_id];
5433 sid = PCI_DEVID(bus, devfn);
5434
5435 /* Size is only valid in address selective invalidation */
5436 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR)
5437 size = to_vtd_size(inv_info->addr_info.granule_size,
5438 inv_info->addr_info.nb_granules);
5439
5440 for_each_set_bit(cache_type,
5441 (unsigned long *)&inv_info->cache,
5442 IOMMU_CACHE_INV_TYPE_NR) {
5443 int granu = 0;
5444 u64 pasid = 0;
5445 u64 addr = 0;
5446
5447 granu = to_vtd_granularity(cache_type, inv_info->granularity);
5448 if (granu == -EINVAL) {
5449 pr_err_ratelimited("Invalid cache type and granu combination %d/%d\n",
5450 cache_type, inv_info->granularity);
5451 break;
5452 }
5453
5454 /*
5455 * PASID is stored in different locations based on the
5456 * granularity.
5457 */
5458 if (inv_info->granularity == IOMMU_INV_GRANU_PASID &&
5459 (inv_info->pasid_info.flags & IOMMU_INV_PASID_FLAGS_PASID))
5460 pasid = inv_info->pasid_info.pasid;
5461 else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5462 (inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_PASID))
5463 pasid = inv_info->addr_info.pasid;
5464
5465 switch (BIT(cache_type)) {
5466 case IOMMU_CACHE_INV_TYPE_IOTLB:
5467 /* HW will ignore LSB bits based on address mask */
5468 if (inv_info->granularity == IOMMU_INV_GRANU_ADDR &&
5469 size &&
5470 (inv_info->addr_info.addr & ((BIT(VTD_PAGE_SHIFT + size)) - 1))) {
5471 pr_err_ratelimited("User address not aligned, 0x%llx, size order %llu\n",
5472 inv_info->addr_info.addr, size);
5473 }
5474
5475 /*
5476 * If granu is PASID-selective, address is ignored.
5477 * We use npages = -1 to indicate that.
5478 */
5479 qi_flush_piotlb(iommu, did, pasid,
5480 mm_to_dma_pfn(inv_info->addr_info.addr),
5481 (granu == QI_GRAN_NONG_PASID) ? -1 : 1 << size,
5482 inv_info->addr_info.flags & IOMMU_INV_ADDR_FLAGS_LEAF);
5483
5484 if (!info->ats_enabled)
5485 break;
5486 /*
5487 * Always flush device IOTLB if ATS is enabled. vIOMMU
5488 * in the guest may assume IOTLB flush is inclusive,
5489 * which is more efficient.
5490 */
5491 fallthrough;
5492 case IOMMU_CACHE_INV_TYPE_DEV_IOTLB:
5493 /*
5494 * PASID based device TLB invalidation does not support
5495 * IOMMU_INV_GRANU_PASID granularity but only supports
5496 * IOMMU_INV_GRANU_ADDR.
5497 * The equivalent of that is setting the size to cover the
5498 * entire 64-bit address range. The user only provides PASID
5499 * info without address info, so we set addr to 0.
5500 */
5501 if (inv_info->granularity == IOMMU_INV_GRANU_PASID) {
5502 size = 64 - VTD_PAGE_SHIFT;
5503 addr = 0;
5504 } else if (inv_info->granularity == IOMMU_INV_GRANU_ADDR) {
5505 addr = inv_info->addr_info.addr;
5506 }
5507
5508 if (info->ats_enabled)
5509 qi_flush_dev_iotlb_pasid(iommu, sid,
5510 info->pfsid, pasid,
5511 info->ats_qdep, addr,
5512 size);
5513 else
5514 pr_warn_ratelimited("Passdown device IOTLB flush w/o ATS!\n");
5515 break;
5516 default:
5517 dev_err_ratelimited(dev, "Unsupported IOMMU invalidation type %d\n",
5518 cache_type);
5519 ret = -EINVAL;
5520 }
5521 }
5522out_unlock:
5523 spin_unlock(&iommu->lock);
5524 spin_unlock_irqrestore(&device_domain_lock, flags);
5525
5526 return ret;
5527}
5528#endif
5529
5530static int intel_iommu_map(struct iommu_domain *domain,
5531 unsigned long iova, phys_addr_t hpa,
5532 size_t size, int iommu_prot, gfp_t gfp)
5533{
5534 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5535 u64 max_addr;
5536 int prot = 0;
5537 int ret;
5538
5539 if (iommu_prot & IOMMU_READ)
5540 prot |= DMA_PTE_READ;
5541 if (iommu_prot & IOMMU_WRITE)
5542 prot |= DMA_PTE_WRITE;
5543 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5544 prot |= DMA_PTE_SNP;
5545
5546 max_addr = iova + size;
5547 if (dmar_domain->max_addr < max_addr) {
5548 u64 end;
5549
5550 /* check if minimum agaw is sufficient for mapped address */
5551 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5552 if (end < max_addr) {
5553 pr_err("%s: iommu width (%d) is not "
5554 "sufficient for the mapped address (%llx)\n",
5555 __func__, dmar_domain->gaw, max_addr);
5556 return -EFAULT;
5557 }
5558 dmar_domain->max_addr = max_addr;
5559 }
5560 /* Round up size to next multiple of PAGE_SIZE, if it and
5561 the low bits of hpa would take us onto the next page */
5562 size = aligned_nrpages(hpa, size);
5563 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5564 hpa >> VTD_PAGE_SHIFT, size, prot);
5565 return ret;
5566}
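/*
 * Worked example for the rounding above (illustrative, assuming 4KiB VT-d
 * pages): mapping hpa = 0x12345234 with size = 0x2000 covers parts of three
 * VT-d pages (0x12345000, 0x12346000, 0x12347000), so aligned_nrpages()
 * returns 3 and domain_pfn_mapping() is asked for three PFNs starting at
 * hpa >> VTD_PAGE_SHIFT.
 */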
5567
5568static size_t intel_iommu_unmap(struct iommu_domain *domain,
5569 unsigned long iova, size_t size,
5570 struct iommu_iotlb_gather *gather)
5571{
5572 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5573 struct page *freelist = NULL;
5574 unsigned long start_pfn, last_pfn;
5575 unsigned int npages;
5576 int iommu_id, level = 0;
5577
5578 /* Cope with horrid API which requires us to unmap more than the
5579 size argument if it happens to be a large-page mapping. */
5580 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5581
5582 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5583 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5584
5585 start_pfn = iova >> VTD_PAGE_SHIFT;
5586 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5587
5588 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5589
5590 npages = last_pfn - start_pfn + 1;
5591
5592 for_each_domain_iommu(iommu_id, dmar_domain)
5593 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5594 start_pfn, npages, !freelist, 0);
5595
5596 dma_free_pagelist(freelist);
5597
5598 if (dmar_domain->max_addr == iova + size)
5599 dmar_domain->max_addr = iova;
5600
5601 return size;
5602}
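/*
 * Example of the superpage behaviour above (illustrative): asking to unmap
 * 4KiB that lies inside a 2MiB superpage finds a level-2 PTE, so the size
 * is rounded up to VTD_PAGE_SIZE << level_to_offset_bits(2) = 2MiB and the
 * whole superpage is torn down; the caller learns the real amount from the
 * returned size.
 */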
5603
5604static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5605 dma_addr_t iova)
5606{
5607 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5608 struct dma_pte *pte;
5609 int level = 0;
5610 u64 phys = 0;
5611
5612 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5613 if (pte && dma_pte_present(pte))
5614 phys = dma_pte_addr(pte) +
5615 (iova & (BIT_MASK(level_to_offset_bits(level) +
5616 VTD_PAGE_SHIFT) - 1));
5617
5618 return phys;
5619}
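/*
 * Worked example (illustrative): if the IOVA resolves to a level-2 (2MiB)
 * PTE, dma_pte_addr() returns the 2MiB-aligned base and the low 21 bits of
 * the IOVA are added back, so phys = base + (iova & (SZ_2M - 1)). For an
 * ordinary level-1 (4KiB) PTE the offset mask is 4KiB - 1 instead.
 */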
5620
5621static inline bool scalable_mode_support(void)
5622{
5623 struct dmar_drhd_unit *drhd;
5624 struct intel_iommu *iommu;
5625 bool ret = true;
5626
5627 rcu_read_lock();
5628 for_each_active_iommu(iommu, drhd) {
5629 if (!sm_supported(iommu)) {
5630 ret = false;
5631 break;
5632 }
5633 }
5634 rcu_read_unlock();
5635
5636 return ret;
5637}
5638
5639static inline bool iommu_pasid_support(void)
5640{
5641 struct dmar_drhd_unit *drhd;
5642 struct intel_iommu *iommu;
5643 bool ret = true;
5644
5645 rcu_read_lock();
5646 for_each_active_iommu(iommu, drhd) {
5647 if (!pasid_supported(iommu)) {
5648 ret = false;
5649 break;
5650 }
5651 }
5652 rcu_read_unlock();
5653
5654 return ret;
5655}
5656
5657static inline bool nested_mode_support(void)
5658{
5659 struct dmar_drhd_unit *drhd;
5660 struct intel_iommu *iommu;
5661 bool ret = true;
5662
5663 rcu_read_lock();
5664 for_each_active_iommu(iommu, drhd) {
5665 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5666 ret = false;
5667 break;
5668 }
5669 }
5670 rcu_read_unlock();
5671
5672 return ret;
5673}
5674
5675static bool intel_iommu_capable(enum iommu_cap cap)
5676{
5677 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5678 return domain_update_iommu_snooping(NULL) == 1;
5679 if (cap == IOMMU_CAP_INTR_REMAP)
5680 return irq_remapping_enabled == 1;
5681
5682 return false;
5683}
5684
5685static struct iommu_device *intel_iommu_probe_device(struct device *dev)
5686{
5687 struct intel_iommu *iommu;
5688
5689 iommu = device_to_iommu(dev, NULL, NULL);
5690 if (!iommu)
5691 return ERR_PTR(-ENODEV);
5692
5693 if (translation_pre_enabled(iommu))
5694 dev_iommu_priv_set(dev, DEFER_DEVICE_DOMAIN_INFO);
5695
5696 return &iommu->iommu;
5697}
5698
5699static void intel_iommu_release_device(struct device *dev)
5700{
5701 struct intel_iommu *iommu;
5702
5703 iommu = device_to_iommu(dev, NULL, NULL);
5704 if (!iommu)
5705 return;
5706
5707 dmar_remove_one_dev_info(dev);
5708
5709 set_dma_ops(dev, NULL);
5710}
5711
5712static void intel_iommu_probe_finalize(struct device *dev)
5713{
5714 struct iommu_domain *domain;
5715
5716 domain = iommu_get_domain_for_dev(dev);
5717 if (device_needs_bounce(dev))
5718 set_dma_ops(dev, &bounce_dma_ops);
5719 else if (domain && domain->type == IOMMU_DOMAIN_DMA)
5720 set_dma_ops(dev, &intel_dma_ops);
5721 else
5722 set_dma_ops(dev, NULL);
5723}
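/*
 * Summary of the DMA-ops selection above (descriptive only): devices that
 * need bounce buffering (typically untrusted ones) get bounce_dma_ops,
 * devices sitting in a DMA-type default domain get intel_dma_ops so their
 * DMA-API calls go through this driver's IOVA allocator, and everything
 * else (e.g. identity/passthrough domains) falls back to the architecture's
 * direct-mapping ops via set_dma_ops(dev, NULL).
 */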
5724
5725static void intel_iommu_get_resv_regions(struct device *device,
5726 struct list_head *head)
5727{
5728 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5729 struct iommu_resv_region *reg;
5730 struct dmar_rmrr_unit *rmrr;
5731 struct device *i_dev;
5732 int i;
5733
5734 down_read(&dmar_global_lock);
5735 for_each_rmrr_units(rmrr) {
5736 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5737 i, i_dev) {
5738 struct iommu_resv_region *resv;
5739 enum iommu_resv_type type;
5740 size_t length;
5741
5742 if (i_dev != device &&
5743 !is_downstream_to_pci_bridge(device, i_dev))
5744 continue;
5745
5746 length = rmrr->end_address - rmrr->base_address + 1;
5747
5748 type = device_rmrr_is_relaxable(device) ?
5749 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5750
5751 resv = iommu_alloc_resv_region(rmrr->base_address,
5752 length, prot, type);
5753 if (!resv)
5754 break;
5755
5756 list_add_tail(&resv->list, head);
5757 }
5758 }
5759 up_read(&dmar_global_lock);
5760
5761#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5762 if (dev_is_pci(device)) {
5763 struct pci_dev *pdev = to_pci_dev(device);
5764
5765 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5766 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5767 IOMMU_RESV_DIRECT_RELAXABLE);
5768 if (reg)
5769 list_add_tail(&reg->list, head);
5770 }
5771 }
5772#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5773
5774 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5775 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5776 0, IOMMU_RESV_MSI);
5777 if (!reg)
5778 return;
5779 list_add_tail(&reg->list, head);
5780}
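/*
 * Example of the regions reported above (illustrative): a device covered by
 * an RMRR gets an IOMMU_RESV_DIRECT (or _RELAXABLE) region spanning the
 * RMRR's [base_address, end_address]; with CONFIG_INTEL_IOMMU_FLOPPY_WA an
 * ISA bridge additionally gets a relaxable direct region for the first
 * 16MiB; and every device gets the IOAPIC window as an IOMMU_RESV_MSI
 * region so MSI addresses are never handed out as IOVAs.
 */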
5781
5782int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5783{
5784 struct device_domain_info *info;
5785 struct context_entry *context;
5786 struct dmar_domain *domain;
5787 unsigned long flags;
5788 u64 ctx_lo;
5789 int ret;
5790
5791 domain = find_domain(dev);
5792 if (!domain)
5793 return -EINVAL;
5794
5795 spin_lock_irqsave(&device_domain_lock, flags);
5796 spin_lock(&iommu->lock);
5797
5798 ret = -EINVAL;
5799 info = get_domain_info(dev);
5800 if (!info || !info->pasid_supported)
5801 goto out;
5802
5803 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5804 if (WARN_ON(!context))
5805 goto out;
5806
5807 ctx_lo = context[0].lo;
5808
5809 if (!(ctx_lo & CONTEXT_PASIDE)) {
5810 ctx_lo |= CONTEXT_PASIDE;
5811 context[0].lo = ctx_lo;
5812 wmb();
5813 iommu->flush.flush_context(iommu,
5814 domain->iommu_did[iommu->seq_id],
5815 PCI_DEVID(info->bus, info->devfn),
5816 DMA_CCMD_MASK_NOBIT,
5817 DMA_CCMD_DEVICE_INVL);
5818 }
5819
5820 /* Enable PASID support in the device, if it wasn't already */
5821 if (!info->pasid_enabled)
5822 iommu_enable_dev_iotlb(info);
5823
5824 ret = 0;
5825
5826 out:
5827 spin_unlock(&iommu->lock);
5828 spin_unlock_irqrestore(&device_domain_lock, flags);
5829
5830 return ret;
5831}
5832
5833static void intel_iommu_apply_resv_region(struct device *dev,
5834 struct iommu_domain *domain,
5835 struct iommu_resv_region *region)
5836{
5837 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5838 unsigned long start, end;
5839
5840 start = IOVA_PFN(region->start);
5841 end = IOVA_PFN(region->start + region->length - 1);
5842
5843 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5844}
5845
5846static struct iommu_group *intel_iommu_device_group(struct device *dev)
5847{
5848 if (dev_is_pci(dev))
5849 return pci_device_group(dev);
5850 return generic_device_group(dev);
5851}
5852
5853static int intel_iommu_enable_auxd(struct device *dev)
5854{
5855 struct device_domain_info *info;
5856 struct intel_iommu *iommu;
5857 unsigned long flags;
5858 int ret;
5859
5860 iommu = device_to_iommu(dev, NULL, NULL);
5861 if (!iommu || dmar_disabled)
5862 return -EINVAL;
5863
5864 if (!sm_supported(iommu) || !pasid_supported(iommu))
5865 return -EINVAL;
5866
5867 ret = intel_iommu_enable_pasid(iommu, dev);
5868 if (ret)
5869 return -ENODEV;
5870
5871 spin_lock_irqsave(&device_domain_lock, flags);
5872 info = get_domain_info(dev);
5873 info->auxd_enabled = 1;
5874 spin_unlock_irqrestore(&device_domain_lock, flags);
5875
5876 return 0;
5877}
5878
5879static int intel_iommu_disable_auxd(struct device *dev)
5880{
5881 struct device_domain_info *info;
5882 unsigned long flags;
5883
5884 spin_lock_irqsave(&device_domain_lock, flags);
5885 info = get_domain_info(dev);
5886 if (!WARN_ON(!info))
5887 info->auxd_enabled = 0;
5888 spin_unlock_irqrestore(&device_domain_lock, flags);
5889
5890 return 0;
5891}
5892
5893/*
5894 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5895 * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5896 * spec so that system software and tools can detect endpoint devices that
5897 * support Intel Scalable I/O Virtualization without a host driver dependency.
5898 *
5899 * Returns the address of the matching extended capability structure within
5900 * the device's PCI configuration space or 0 if the device does not support
5901 * it.
5902 */
5903static int siov_find_pci_dvsec(struct pci_dev *pdev)
5904{
5905 int pos;
5906 u16 vendor, id;
5907
5908 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5909 while (pos) {
5910 pci_read_config_word(pdev, pos + 4, &vendor);
5911 pci_read_config_word(pdev, pos + 8, &id);
5912 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5913 return pos;
5914
5915 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5916 }
5917
5918 return 0;
5919}
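/*
 * Config-space layout assumed above (illustrative): each DVSEC extended
 * capability (ID 0x23) is followed by the DVSEC vendor ID at pos + 4 and
 * the DVSEC ID at pos + 8; a vendor of PCI_VENDOR_ID_INTEL with DVSEC ID 5
 * identifies the Scalable IOV capability. A non-zero return, e.g.
 *
 *	if (siov_find_pci_dvsec(to_pci_dev(dev)))
 *		... device advertises Intel Scalable IOV ...
 *
 * is what intel_iommu_dev_has_feat() checks for IOMMU_DEV_FEAT_AUX below.
 */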
5920
5921static bool
5922intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5923{
5924 if (feat == IOMMU_DEV_FEAT_AUX) {
5925 int ret;
5926
5927 if (!dev_is_pci(dev) || dmar_disabled ||
5928 !scalable_mode_support() || !iommu_pasid_support())
5929 return false;
5930
5931 ret = pci_pasid_features(to_pci_dev(dev));
5932 if (ret < 0)
5933 return false;
5934
5935 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5936 }
5937
5938 if (feat == IOMMU_DEV_FEAT_SVA) {
5939 struct device_domain_info *info = get_domain_info(dev);
5940
5941 return info && (info->iommu->flags & VTD_FLAG_SVM_CAPABLE) &&
5942 info->pasid_supported && info->pri_supported &&
5943 info->ats_supported;
5944 }
5945
5946 return false;
5947}
5948
5949static int
5950intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5951{
5952 if (feat == IOMMU_DEV_FEAT_AUX)
5953 return intel_iommu_enable_auxd(dev);
5954
5955 if (feat == IOMMU_DEV_FEAT_SVA) {
5956 struct device_domain_info *info = get_domain_info(dev);
5957
5958 if (!info)
5959 return -EINVAL;
5960
5961 if (info->iommu->flags & VTD_FLAG_SVM_CAPABLE)
5962 return 0;
5963 }
5964
5965 return -ENODEV;
5966}
5967
5968static int
5969intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5970{
5971 if (feat == IOMMU_DEV_FEAT_AUX)
5972 return intel_iommu_disable_auxd(dev);
5973
5974 return -ENODEV;
5975}
5976
5977static bool
5978intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5979{
5980 struct device_domain_info *info = get_domain_info(dev);
5981
5982 if (feat == IOMMU_DEV_FEAT_AUX)
5983 return scalable_mode_support() && info && info->auxd_enabled;
5984
5985 return false;
5986}
5987
5988static int
5989intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5990{
5991 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5992
5993 return dmar_domain->default_pasid > 0 ?
5994 dmar_domain->default_pasid : -EINVAL;
5995}
5996
5997static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5998 struct device *dev)
5999{
6000 return attach_deferred(dev);
6001}
6002
6003static int
6004intel_iommu_domain_set_attr(struct iommu_domain *domain,
6005 enum iommu_attr attr, void *data)
6006{
6007 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6008 unsigned long flags;
6009 int ret = 0;
6010
6011 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6012 return -EINVAL;
6013
6014 switch (attr) {
6015 case DOMAIN_ATTR_NESTING:
6016 spin_lock_irqsave(&device_domain_lock, flags);
6017 if (nested_mode_support() &&
6018 list_empty(&dmar_domain->devices)) {
6019 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6020 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6021 } else {
6022 ret = -ENODEV;
6023 }
6024 spin_unlock_irqrestore(&device_domain_lock, flags);
6025 break;
6026 default:
6027 ret = -EINVAL;
6028 break;
6029 }
6030
6031 return ret;
6032}
6033
6034/*
6035 * Check that the device does not live on an external facing PCI port that is
6036 * marked as untrusted. Such devices should not be able to apply quirks and
6037 * thus not be able to bypass the IOMMU restrictions.
6038 */
6039static bool risky_device(struct pci_dev *pdev)
6040{
6041 if (pdev->untrusted) {
6042 pci_info(pdev,
6043 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
6044 pdev->vendor, pdev->device);
6045 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
6046 return true;
6047 }
6048 return false;
6049}
6050
6051const struct iommu_ops intel_iommu_ops = {
6052 .capable = intel_iommu_capable,
6053 .domain_alloc = intel_iommu_domain_alloc,
6054 .domain_free = intel_iommu_domain_free,
6055 .domain_set_attr = intel_iommu_domain_set_attr,
6056 .attach_dev = intel_iommu_attach_device,
6057 .detach_dev = intel_iommu_detach_device,
6058 .aux_attach_dev = intel_iommu_aux_attach_device,
6059 .aux_detach_dev = intel_iommu_aux_detach_device,
6060 .aux_get_pasid = intel_iommu_aux_get_pasid,
6061 .map = intel_iommu_map,
6062 .unmap = intel_iommu_unmap,
6063 .iova_to_phys = intel_iommu_iova_to_phys,
6064 .probe_device = intel_iommu_probe_device,
6065 .probe_finalize = intel_iommu_probe_finalize,
6066 .release_device = intel_iommu_release_device,
6067 .get_resv_regions = intel_iommu_get_resv_regions,
6068 .put_resv_regions = generic_iommu_put_resv_regions,
6069 .apply_resv_region = intel_iommu_apply_resv_region,
6070 .device_group = intel_iommu_device_group,
6071 .dev_has_feat = intel_iommu_dev_has_feat,
6072 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6073 .dev_enable_feat = intel_iommu_dev_enable_feat,
6074 .dev_disable_feat = intel_iommu_dev_disable_feat,
6075 .is_attach_deferred = intel_iommu_is_attach_deferred,
6076 .def_domain_type = device_def_domain_type,
6077 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6078#ifdef CONFIG_INTEL_IOMMU_SVM
6079 .cache_invalidate = intel_iommu_sva_invalidate,
6080 .sva_bind_gpasid = intel_svm_bind_gpasid,
6081 .sva_unbind_gpasid = intel_svm_unbind_gpasid,
6082 .sva_bind = intel_svm_bind,
6083 .sva_unbind = intel_svm_unbind,
6084 .sva_get_pasid = intel_svm_get_pasid,
6085 .page_response = intel_svm_page_response,
6086#endif
6087};
6088
6089static void quirk_iommu_igfx(struct pci_dev *dev)
6090{
6091 if (risky_device(dev))
6092 return;
6093
6094 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6095 dmar_map_gfx = 0;
6096}
6097
6098/* G4x/GM45 integrated gfx dmar support is totally busted. */
6099DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6100DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6101DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6102DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6103DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6104DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6105DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6106
6107/* Broadwell igfx malfunctions with dmar */
6108DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6109DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6110DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6111DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6112DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6113DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6114DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6115DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6116DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6117DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6118DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6119DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6120DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6121DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6122DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6123DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6124DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6125DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6126DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6127DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6128DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6129DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6130DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6131DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6132
6133static void quirk_iommu_rwbf(struct pci_dev *dev)
6134{
6135 if (risky_device(dev))
6136 return;
6137
6138 /*
6139 * Mobile 4 Series Chipset neglects to set RWBF capability,
6140 * but needs it. Same seems to hold for the desktop versions.
6141 */
6142 pci_info(dev, "Forcing write-buffer flush capability\n");
6143 rwbf_quirk = 1;
6144}
6145
6146DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6147DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6148DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6149DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6150DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6151DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6153
6154#define GGC 0x52
6155#define GGC_MEMORY_SIZE_MASK (0xf << 8)
6156#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6157#define GGC_MEMORY_SIZE_1M (0x1 << 8)
6158#define GGC_MEMORY_SIZE_2M (0x3 << 8)
6159#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6160#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6161#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6162#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
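/*
 * GGC is the integrated-graphics control register at config offset 0x52;
 * as the masks above suggest, bits 11:8 encode how much graphics translation
 * table (shadow GTT) space the BIOS set aside, with bit 11
 * (GGC_MEMORY_VT_ENABLED) marking the VT-capable layouts.
 * quirk_calpella_no_shadow_gtt() below disables IOMMU translation for the
 * graphics device (dmar_map_gfx = 0) when no VT-enabled GTT space was
 * allocated.
 */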
6163
6164static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6165{
6166 unsigned short ggc;
6167
6168 if (risky_device(dev))
6169 return;
6170
6171 if (pci_read_config_word(dev, GGC, &ggc))
6172 return;
6173
6174 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6175 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6176 dmar_map_gfx = 0;
6177 } else if (dmar_map_gfx) {
6178 /* we have to ensure the gfx device is idle before we flush */
6179 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6180 intel_iommu_strict = 1;
6181 }
6182}
6183DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6184DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6185DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6186DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6187
6188static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
6189{
6190 unsigned short ver;
6191
6192 if (!IS_GFX_DEVICE(dev))
6193 return;
6194
6195 ver = (dev->device >> 8) & 0xff;
6196 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
6197 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
6198 ver != 0x9a)
6199 return;
6200
6201 if (risky_device(dev))
6202 return;
6203
6204 pci_info(dev, "Skip IOMMU disabling for graphics\n");
6205 iommu_skip_te_disable = 1;
6206}
6207DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
6208
6209/* On Tylersburg chipsets, some BIOSes have been known to enable the
6210 ISOCH DMAR unit for the Azalia sound device, but not give it any
6211 TLB entries, which causes it to deadlock. Check for that. We do
6212 this in a function called from init_dmars(), instead of in a PCI
6213 quirk, because we don't want to print the obnoxious "BIOS broken"
6214 message if VT-d is actually disabled.
6215*/
6216static void __init check_tylersburg_isoch(void)
6217{
6218 struct pci_dev *pdev;
6219 uint32_t vtisochctrl;
6220
6221 /* If there's no Azalia in the system anyway, forget it. */
6222 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6223 if (!pdev)
6224 return;
6225
6226 if (risky_device(pdev)) {
6227 pci_dev_put(pdev);
6228 return;
6229 }
6230
6231 pci_dev_put(pdev);
6232
6233 /* System Management Registers. Might be hidden, in which case
6234 we can't do the sanity check. But that's OK, because the
6235 known-broken BIOSes _don't_ actually hide it, so far. */
6236 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6237 if (!pdev)
6238 return;
6239
6240 if (risky_device(pdev)) {
6241 pci_dev_put(pdev);
6242 return;
6243 }
6244
6245 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6246 pci_dev_put(pdev);
6247 return;
6248 }
6249
6250 pci_dev_put(pdev);
6251
6252 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6253 if (vtisochctrl & 1)
6254 return;
6255
6256 /* Drop all bits other than the number of TLB entries */
6257 vtisochctrl &= 0x1c;
6258
6259 /* If we have the recommended number of TLB entries (16), fine. */
6260 if (vtisochctrl == 0x10)
6261 return;
6262
6263 /* Zero TLB entries? You get to ride the short bus to school. */
6264 if (!vtisochctrl) {
6265 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6266 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6267 dmi_get_system_info(DMI_BIOS_VENDOR),
6268 dmi_get_system_info(DMI_BIOS_VERSION),
6269 dmi_get_system_info(DMI_PRODUCT_VERSION));
6270 iommu_identity_mapping |= IDENTMAP_AZALIA;
6271 return;
6272 }
6273
6274 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6275 vtisochctrl);
6276}