1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
35#define ROOT_SIZE VTD_PAGE_SIZE
36#define CONTEXT_SIZE VTD_PAGE_SIZE
37
38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43#define IOAPIC_RANGE_START (0xfee00000)
44#define IOAPIC_RANGE_END (0xfeefffff)
45#define IOVA_START_ADDR (0x1000)
46
47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
57
58/* IO virtual address start page frame number */
59#define IOVA_START_PFN (1)
60
61#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
62
63static void __init check_tylersburg_isoch(void);
64static int rwbf_quirk;
65
66/*
 67 * set to 1 to panic the kernel if VT-d can't be enabled successfully
68 * (used when kernel is launched w/ TXT)
69 */
70static int force_on = 0;
71static int intel_iommu_tboot_noforce;
72static int no_platform_optin;
73
74#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75
76/*
77 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
78 * if marked present.
79 */
80static phys_addr_t root_entry_lctp(struct root_entry *re)
81{
82 if (!(re->lo & 1))
83 return 0;
84
85 return re->lo & VTD_PAGE_MASK;
86}
87
88/*
89 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
90 * if marked present.
91 */
92static phys_addr_t root_entry_uctp(struct root_entry *re)
93{
94 if (!(re->hi & 1))
95 return 0;
96
97 return re->hi & VTD_PAGE_MASK;
98}
99
100/*
 101 * This domain is a static identity mapping domain.
 102 * 1. This domain creates a static 1:1 mapping of all usable memory.
 103 * 2. It maps to each iommu if successful.
 104 * 3. Each iommu maps to this domain if successful.
105 */
106static struct dmar_domain *si_domain;
107static int hw_pass_through = 1;
108
109struct dmar_rmrr_unit {
110 struct list_head list; /* list of rmrr units */
111 struct acpi_dmar_header *hdr; /* ACPI header */
112 u64 base_address; /* reserved base address*/
113 u64 end_address; /* reserved end address */
114 struct dmar_dev_scope *devices; /* target devices */
115 int devices_cnt; /* target device count */
116};
117
118struct dmar_atsr_unit {
119 struct list_head list; /* list of ATSR units */
120 struct acpi_dmar_header *hdr; /* ACPI header */
121 struct dmar_dev_scope *devices; /* target devices */
122 int devices_cnt; /* target device count */
123 u8 include_all:1; /* include all ports */
124};
125
126struct dmar_satc_unit {
127 struct list_head list; /* list of SATC units */
128 struct acpi_dmar_header *hdr; /* ACPI header */
129 struct dmar_dev_scope *devices; /* target devices */
130 struct intel_iommu *iommu; /* the corresponding iommu */
131 int devices_cnt; /* target device count */
132 u8 atc_required:1; /* ATS is required */
133};
134
135static LIST_HEAD(dmar_atsr_units);
136static LIST_HEAD(dmar_rmrr_units);
137static LIST_HEAD(dmar_satc_units);
138
139#define for_each_rmrr_units(rmrr) \
140 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
141
142static void intel_iommu_domain_free(struct iommu_domain *domain);
143
144int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
145int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
146
147int intel_iommu_enabled = 0;
148EXPORT_SYMBOL_GPL(intel_iommu_enabled);
149
150static int dmar_map_gfx = 1;
151static int intel_iommu_superpage = 1;
152static int iommu_identity_mapping;
153static int iommu_skip_te_disable;
154
155#define IDENTMAP_GFX 2
156#define IDENTMAP_AZALIA 4
157
158const struct iommu_ops intel_iommu_ops;
159static const struct iommu_dirty_ops intel_dirty_ops;
160
161static bool translation_pre_enabled(struct intel_iommu *iommu)
162{
163 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
164}
165
166static void clear_translation_pre_enabled(struct intel_iommu *iommu)
167{
168 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
169}
170
171static void init_translation_status(struct intel_iommu *iommu)
172{
173 u32 gsts;
174
175 gsts = readl(iommu->reg + DMAR_GSTS_REG);
176 if (gsts & DMA_GSTS_TES)
177 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
178}
179
180static int __init intel_iommu_setup(char *str)
181{
182 if (!str)
183 return -EINVAL;
184
185 while (*str) {
186 if (!strncmp(str, "on", 2)) {
187 dmar_disabled = 0;
188 pr_info("IOMMU enabled\n");
189 } else if (!strncmp(str, "off", 3)) {
190 dmar_disabled = 1;
191 no_platform_optin = 1;
192 pr_info("IOMMU disabled\n");
193 } else if (!strncmp(str, "igfx_off", 8)) {
194 dmar_map_gfx = 0;
195 pr_info("Disable GFX device mapping\n");
196 } else if (!strncmp(str, "forcedac", 8)) {
197 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
198 iommu_dma_forcedac = true;
199 } else if (!strncmp(str, "strict", 6)) {
200 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
201 iommu_set_dma_strict();
202 } else if (!strncmp(str, "sp_off", 6)) {
203 pr_info("Disable supported super page\n");
204 intel_iommu_superpage = 0;
205 } else if (!strncmp(str, "sm_on", 5)) {
206 pr_info("Enable scalable mode if hardware supports\n");
207 intel_iommu_sm = 1;
208 } else if (!strncmp(str, "sm_off", 6)) {
209 pr_info("Scalable mode is disallowed\n");
210 intel_iommu_sm = 0;
211 } else if (!strncmp(str, "tboot_noforce", 13)) {
212 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
213 intel_iommu_tboot_noforce = 1;
214 } else {
215 pr_notice("Unknown option - '%s'\n", str);
216 }
217
218 str += strcspn(str, ",");
219 while (*str == ',')
220 str++;
221 }
222
223 return 1;
224}
225__setup("intel_iommu=", intel_iommu_setup);
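/*
 * Example boot parameters accepted by the parser above; options are
 * comma-separated and unrecognized ones only trigger a notice:
 *
 *   intel_iommu=on
 *   intel_iommu=on,sm_on
 *   intel_iommu=off
 *   intel_iommu=igfx_off,strict
 */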
226
227void *alloc_pgtable_page(int node, gfp_t gfp)
228{
229 struct page *page;
230 void *vaddr = NULL;
231
232 page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
233 if (page)
234 vaddr = page_address(page);
235 return vaddr;
236}
237
238void free_pgtable_page(void *vaddr)
239{
240 free_page((unsigned long)vaddr);
241}
242
243static int domain_type_is_si(struct dmar_domain *domain)
244{
245 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
246}
247
248static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
249{
250 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
251
252 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
253}
254
255/*
256 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
257 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
258 * the returned SAGAW.
259 */
260static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
261{
262 unsigned long fl_sagaw, sl_sagaw;
263
264 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
265 sl_sagaw = cap_sagaw(iommu->cap);
266
267 /* Second level only. */
268 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
269 return sl_sagaw;
270
271 /* First level only. */
272 if (!ecap_slts(iommu->ecap))
273 return fl_sagaw;
274
275 return fl_sagaw & sl_sagaw;
276}
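/*
 * A small decoding sketch for the bitmap returned above (the
 * example_dump_sagaw() helper is hypothetical and not part of the
 * driver). Assuming the usual agaw_to_width() encoding of
 * 30 + 9 * agaw bits, bit 1 means 3-level (39-bit), bit 2 means
 * 4-level (48-bit) and bit 3 means 5-level (57-bit) paging; the
 * first-level mask above therefore always advertises 4-level and
 * adds 5-level only when cap_fl5lp_support() says so.
 */
#if 0
static void example_dump_sagaw(struct intel_iommu *iommu)
{
	unsigned long sagaw = __iommu_calculate_sagaw(iommu);
	int agaw;

	for_each_set_bit(agaw, &sagaw, 5)
		pr_info("%s: %d-bit AGAW (%d-level paging) supported\n",
			iommu->name, agaw_to_width(agaw), agaw + 2);
}
#endif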
277
278static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
279{
280 unsigned long sagaw;
281 int agaw;
282
283 sagaw = __iommu_calculate_sagaw(iommu);
284 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
285 if (test_bit(agaw, &sagaw))
286 break;
287 }
288
289 return agaw;
290}
291
292/*
293 * Calculate max SAGAW for each iommu.
294 */
295int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
296{
297 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
298}
299
300/*
 301 * Calculate the agaw for each iommu.
 302 * "SAGAW" may differ across iommus, so use a default agaw and fall
 303 * back to a smaller supported agaw for iommus that don't support it.
304 */
305int iommu_calculate_agaw(struct intel_iommu *iommu)
306{
307 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
308}
309
310static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
311{
312 return sm_supported(iommu) ?
313 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
314}
315
316static void domain_update_iommu_coherency(struct dmar_domain *domain)
317{
318 struct iommu_domain_info *info;
319 struct dmar_drhd_unit *drhd;
320 struct intel_iommu *iommu;
321 bool found = false;
322 unsigned long i;
323
324 domain->iommu_coherency = true;
325 xa_for_each(&domain->iommu_array, i, info) {
326 found = true;
327 if (!iommu_paging_structure_coherency(info->iommu)) {
328 domain->iommu_coherency = false;
329 break;
330 }
331 }
332 if (found)
333 return;
334
335 /* No hardware attached; use lowest common denominator */
336 rcu_read_lock();
337 for_each_active_iommu(iommu, drhd) {
338 if (!iommu_paging_structure_coherency(iommu)) {
339 domain->iommu_coherency = false;
340 break;
341 }
342 }
343 rcu_read_unlock();
344}
345
346static int domain_update_iommu_superpage(struct dmar_domain *domain,
347 struct intel_iommu *skip)
348{
349 struct dmar_drhd_unit *drhd;
350 struct intel_iommu *iommu;
351 int mask = 0x3;
352
353 if (!intel_iommu_superpage)
354 return 0;
355
356 /* set iommu_superpage to the smallest common denominator */
357 rcu_read_lock();
358 for_each_active_iommu(iommu, drhd) {
359 if (iommu != skip) {
360 if (domain && domain->use_first_level) {
361 if (!cap_fl1gp_support(iommu->cap))
362 mask = 0x1;
363 } else {
364 mask &= cap_super_page_val(iommu->cap);
365 }
366
367 if (!mask)
368 break;
369 }
370 }
371 rcu_read_unlock();
372
373 return fls(mask);
374}
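/*
 * Worked example for the return value above: cap_super_page_val()
 * reports bit 0 for 2MiB and bit 1 for 1GiB support, so if every unit
 * reports 0x3 the result is fls(0x3) == 2 (both 2MiB and 1GiB usable);
 * if one unit only reports 0x1 it drops to fls(0x1) == 1 (2MiB only),
 * and any unit reporting 0 disables superpages for the domain.
 */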
375
376static int domain_update_device_node(struct dmar_domain *domain)
377{
378 struct device_domain_info *info;
379 int nid = NUMA_NO_NODE;
380 unsigned long flags;
381
382 spin_lock_irqsave(&domain->lock, flags);
383 list_for_each_entry(info, &domain->devices, link) {
384 /*
 385 * There may be multiple device NUMA nodes, as devices within the
 386 * same domain can sit behind different IOMMUs. There is no perfect
 387 * answer in such a situation, so we use a first-come, first-served
 388 * policy.
389 */
390 nid = dev_to_node(info->dev);
391 if (nid != NUMA_NO_NODE)
392 break;
393 }
394 spin_unlock_irqrestore(&domain->lock, flags);
395
396 return nid;
397}
398
399/* Return the super pagesize bitmap if supported. */
400static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
401{
402 unsigned long bitmap = 0;
403
404 /*
405 * 1-level super page supports page size of 2MiB, 2-level super page
406 * supports page size of both 2MiB and 1GiB.
407 */
408 if (domain->iommu_superpage == 1)
409 bitmap |= SZ_2M;
410 else if (domain->iommu_superpage == 2)
411 bitmap |= SZ_2M | SZ_1G;
412
413 return bitmap;
414}
415
416/* Some capabilities may be different across iommus */
417void domain_update_iommu_cap(struct dmar_domain *domain)
418{
419 domain_update_iommu_coherency(domain);
420 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
421
422 /*
423 * If RHSA is missing, we should default to the device numa domain
424 * as fall back.
425 */
426 if (domain->nid == NUMA_NO_NODE)
427 domain->nid = domain_update_device_node(domain);
428
429 /*
430 * First-level translation restricts the input-address to a
431 * canonical address (i.e., address bits 63:N have the same
432 * value as address bit [N-1], where N is 48-bits with 4-level
433 * paging and 57-bits with 5-level paging). Hence, skip bit
434 * [N-1].
435 */
436 if (domain->use_first_level)
437 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
438 else
439 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
440
441 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
442 domain_update_iotlb(domain);
443}
444
445struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
446 u8 devfn, int alloc)
447{
448 struct root_entry *root = &iommu->root_entry[bus];
449 struct context_entry *context;
450 u64 *entry;
451
452 /*
 453 * Unless the caller requested to allocate a new entry,
454 * returning a copied context entry makes no sense.
455 */
456 if (!alloc && context_copied(iommu, bus, devfn))
457 return NULL;
458
459 entry = &root->lo;
460 if (sm_supported(iommu)) {
461 if (devfn >= 0x80) {
462 devfn -= 0x80;
463 entry = &root->hi;
464 }
465 devfn *= 2;
466 }
467 if (*entry & 1)
468 context = phys_to_virt(*entry & VTD_PAGE_MASK);
469 else {
470 unsigned long phy_addr;
471 if (!alloc)
472 return NULL;
473
474 context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
475 if (!context)
476 return NULL;
477
478 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
479 phy_addr = virt_to_phys((void *)context);
480 *entry = phy_addr | 1;
481 __iommu_flush_cache(iommu, entry, sizeof(*entry));
482 }
483 return &context[devfn];
484}
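/*
 * Indexing sketch for the lookup above. In legacy mode a single 4KB
 * table holds 256 context entries and root->lo points at it, so the
 * entry for a devfn is simply context[devfn]. In scalable mode each
 * entry is twice as large, so a bus is split across two tables:
 * root->lo covers devfn 0x00-0x7f and root->hi covers 0x80-0xff, and
 * the index is doubled, e.g. devfn 0x83 lands in slot
 * (0x83 - 0x80) * 2 == 6 of the upper table. A standalone sketch of
 * that math (example_sm_context_index() is hypothetical):
 */
#if 0
static int example_sm_context_index(u8 devfn, bool *upper_table)
{
	*upper_table = devfn >= 0x80;

	return (devfn & 0x7f) * 2;
}
#endif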
485
486/**
487 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
488 * sub-hierarchy of a candidate PCI-PCI bridge
489 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
490 * @bridge: the candidate PCI-PCI bridge
491 *
492 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
493 */
494static bool
495is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
496{
497 struct pci_dev *pdev, *pbridge;
498
499 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
500 return false;
501
502 pdev = to_pci_dev(dev);
503 pbridge = to_pci_dev(bridge);
504
505 if (pbridge->subordinate &&
506 pbridge->subordinate->number <= pdev->bus->number &&
507 pbridge->subordinate->busn_res.end >= pdev->bus->number)
508 return true;
509
510 return false;
511}
512
513static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
514{
515 struct dmar_drhd_unit *drhd;
516 u32 vtbar;
517 int rc;
518
519 /* We know that this device on this chipset has its own IOMMU.
520 * If we find it under a different IOMMU, then the BIOS is lying
521 * to us. Hope that the IOMMU for this device is actually
522 * disabled, and it needs no translation...
523 */
524 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
525 if (rc) {
526 /* "can't" happen */
527 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
528 return false;
529 }
530 vtbar &= 0xffff0000;
531
 532	/* we know that this iommu should be at offset 0xa000 from vtbar */
533 drhd = dmar_find_matched_drhd_unit(pdev);
534 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
535 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
536 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
537 return true;
538 }
539
540 return false;
541}
542
543static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
544{
545 if (!iommu || iommu->drhd->ignored)
546 return true;
547
548 if (dev_is_pci(dev)) {
549 struct pci_dev *pdev = to_pci_dev(dev);
550
551 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
552 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
553 quirk_ioat_snb_local_iommu(pdev))
554 return true;
555 }
556
557 return false;
558}
559
560static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
561{
562 struct dmar_drhd_unit *drhd = NULL;
563 struct pci_dev *pdev = NULL;
564 struct intel_iommu *iommu;
565 struct device *tmp;
566 u16 segment = 0;
567 int i;
568
569 if (!dev)
570 return NULL;
571
572 if (dev_is_pci(dev)) {
573 struct pci_dev *pf_pdev;
574
575 pdev = pci_real_dma_dev(to_pci_dev(dev));
576
577 /* VFs aren't listed in scope tables; we need to look up
578 * the PF instead to find the IOMMU. */
579 pf_pdev = pci_physfn(pdev);
580 dev = &pf_pdev->dev;
581 segment = pci_domain_nr(pdev->bus);
582 } else if (has_acpi_companion(dev))
583 dev = &ACPI_COMPANION(dev)->dev;
584
585 rcu_read_lock();
586 for_each_iommu(iommu, drhd) {
587 if (pdev && segment != drhd->segment)
588 continue;
589
590 for_each_active_dev_scope(drhd->devices,
591 drhd->devices_cnt, i, tmp) {
592 if (tmp == dev) {
593 /* For a VF use its original BDF# not that of the PF
594 * which we used for the IOMMU lookup. Strictly speaking
595 * we could do this for all PCI devices; we only need to
596 * get the BDF# from the scope table for ACPI matches. */
597 if (pdev && pdev->is_virtfn)
598 goto got_pdev;
599
600 if (bus && devfn) {
601 *bus = drhd->devices[i].bus;
602 *devfn = drhd->devices[i].devfn;
603 }
604 goto out;
605 }
606
607 if (is_downstream_to_pci_bridge(dev, tmp))
608 goto got_pdev;
609 }
610
611 if (pdev && drhd->include_all) {
612got_pdev:
613 if (bus && devfn) {
614 *bus = pdev->bus->number;
615 *devfn = pdev->devfn;
616 }
617 goto out;
618 }
619 }
620 iommu = NULL;
621out:
622 if (iommu_is_dummy(iommu, dev))
623 iommu = NULL;
624
625 rcu_read_unlock();
626
627 return iommu;
628}
629
630static void domain_flush_cache(struct dmar_domain *domain,
631 void *addr, int size)
632{
633 if (!domain->iommu_coherency)
634 clflush_cache_range(addr, size);
635}
636
637static void free_context_table(struct intel_iommu *iommu)
638{
639 struct context_entry *context;
640 int i;
641
642 if (!iommu->root_entry)
643 return;
644
645 for (i = 0; i < ROOT_ENTRY_NR; i++) {
646 context = iommu_context_addr(iommu, i, 0, 0);
647 if (context)
648 free_pgtable_page(context);
649
650 if (!sm_supported(iommu))
651 continue;
652
653 context = iommu_context_addr(iommu, i, 0x80, 0);
654 if (context)
655 free_pgtable_page(context);
656 }
657
658 free_pgtable_page(iommu->root_entry);
659 iommu->root_entry = NULL;
660}
661
662#ifdef CONFIG_DMAR_DEBUG
663static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
664 u8 bus, u8 devfn, struct dma_pte *parent, int level)
665{
666 struct dma_pte *pte;
667 int offset;
668
669 while (1) {
670 offset = pfn_level_offset(pfn, level);
671 pte = &parent[offset];
672 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
673 pr_info("PTE not present at level %d\n", level);
674 break;
675 }
676
677 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
678
679 if (level == 1)
680 break;
681
682 parent = phys_to_virt(dma_pte_addr(pte));
683 level--;
684 }
685}
686
687void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
688 unsigned long long addr, u32 pasid)
689{
690 struct pasid_dir_entry *dir, *pde;
691 struct pasid_entry *entries, *pte;
692 struct context_entry *ctx_entry;
693 struct root_entry *rt_entry;
694 int i, dir_index, index, level;
695 u8 devfn = source_id & 0xff;
696 u8 bus = source_id >> 8;
697 struct dma_pte *pgtable;
698
699 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
700
701 /* root entry dump */
702 rt_entry = &iommu->root_entry[bus];
703 if (!rt_entry) {
704 pr_info("root table entry is not present\n");
705 return;
706 }
707
708 if (sm_supported(iommu))
709 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
710 rt_entry->hi, rt_entry->lo);
711 else
 712		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
713
714 /* context entry dump */
715 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
716 if (!ctx_entry) {
717 pr_info("context table entry is not present\n");
718 return;
719 }
720
721 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
722 ctx_entry->hi, ctx_entry->lo);
723
724 /* legacy mode does not require PASID entries */
725 if (!sm_supported(iommu)) {
726 level = agaw_to_level(ctx_entry->hi & 7);
727 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
728 goto pgtable_walk;
729 }
730
731 /* get the pointer to pasid directory entry */
732 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
733 if (!dir) {
734 pr_info("pasid directory entry is not present\n");
735 return;
736 }
737 /* For request-without-pasid, get the pasid from context entry */
738 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
739 pasid = IOMMU_NO_PASID;
740
741 dir_index = pasid >> PASID_PDE_SHIFT;
742 pde = &dir[dir_index];
743 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
744
745 /* get the pointer to the pasid table entry */
746 entries = get_pasid_table_from_pde(pde);
747 if (!entries) {
748 pr_info("pasid table entry is not present\n");
749 return;
750 }
751 index = pasid & PASID_PTE_MASK;
752 pte = &entries[index];
753 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
754 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
755
756 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
757 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
758 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
759 } else {
760 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
761 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
762 }
763
764pgtable_walk:
765 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
766}
767#endif
768
769static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
770 unsigned long pfn, int *target_level,
771 gfp_t gfp)
772{
773 struct dma_pte *parent, *pte;
774 int level = agaw_to_level(domain->agaw);
775 int offset;
776
777 if (!domain_pfn_supported(domain, pfn))
778 /* Address beyond IOMMU's addressing capabilities. */
779 return NULL;
780
781 parent = domain->pgd;
782
783 while (1) {
784 void *tmp_page;
785
786 offset = pfn_level_offset(pfn, level);
787 pte = &parent[offset];
788 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
789 break;
790 if (level == *target_level)
791 break;
792
793 if (!dma_pte_present(pte)) {
794 uint64_t pteval;
795
796 tmp_page = alloc_pgtable_page(domain->nid, gfp);
797
798 if (!tmp_page)
799 return NULL;
800
801 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
802 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
803 if (domain->use_first_level)
804 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
805
806 if (cmpxchg64(&pte->val, 0ULL, pteval))
807 /* Someone else set it while we were thinking; use theirs. */
808 free_pgtable_page(tmp_page);
809 else
810 domain_flush_cache(domain, pte, sizeof(*pte));
811 }
812 if (level == 1)
813 break;
814
815 parent = phys_to_virt(dma_pte_addr(pte));
816 level--;
817 }
818
819 if (!*target_level)
820 *target_level = level;
821
822 return pte;
823}
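/*
 * A minimal sketch of how the walk above indexes each level, assuming
 * the usual 9-bit stride behind pfn_level_offset(): the index at a
 * given level is (pfn >> ((level - 1) * 9)) & 0x1ff, so level 1
 * entries map 4KiB, level 2 entries 2MiB and level 3 entries 1GiB.
 * With a 4-level table, pfn 0x12345 is reached through indexes
 * 0, 0, 0x91 and 0x145 from top to bottom.
 */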
824
 825/* return the pte of an address at a specific level */
826static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
827 unsigned long pfn,
828 int level, int *large_page)
829{
830 struct dma_pte *parent, *pte;
831 int total = agaw_to_level(domain->agaw);
832 int offset;
833
834 parent = domain->pgd;
835 while (level <= total) {
836 offset = pfn_level_offset(pfn, total);
837 pte = &parent[offset];
838 if (level == total)
839 return pte;
840
841 if (!dma_pte_present(pte)) {
842 *large_page = total;
843 break;
844 }
845
846 if (dma_pte_superpage(pte)) {
847 *large_page = total;
848 return pte;
849 }
850
851 parent = phys_to_virt(dma_pte_addr(pte));
852 total--;
853 }
854 return NULL;
855}
856
857/* clear last level pte, a tlb flush should be followed */
858static void dma_pte_clear_range(struct dmar_domain *domain,
859 unsigned long start_pfn,
860 unsigned long last_pfn)
861{
862 unsigned int large_page;
863 struct dma_pte *first_pte, *pte;
864
865 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
866 WARN_ON(start_pfn > last_pfn))
867 return;
868
869 /* we don't need lock here; nobody else touches the iova range */
870 do {
871 large_page = 1;
872 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
873 if (!pte) {
874 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
875 continue;
876 }
877 do {
878 dma_clear_pte(pte);
879 start_pfn += lvl_to_nr_pages(large_page);
880 pte++;
881 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
882
883 domain_flush_cache(domain, first_pte,
884 (void *)pte - (void *)first_pte);
885
886 } while (start_pfn && start_pfn <= last_pfn);
887}
888
889static void dma_pte_free_level(struct dmar_domain *domain, int level,
890 int retain_level, struct dma_pte *pte,
891 unsigned long pfn, unsigned long start_pfn,
892 unsigned long last_pfn)
893{
894 pfn = max(start_pfn, pfn);
895 pte = &pte[pfn_level_offset(pfn, level)];
896
897 do {
898 unsigned long level_pfn;
899 struct dma_pte *level_pte;
900
901 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
902 goto next;
903
904 level_pfn = pfn & level_mask(level);
905 level_pte = phys_to_virt(dma_pte_addr(pte));
906
907 if (level > 2) {
908 dma_pte_free_level(domain, level - 1, retain_level,
909 level_pte, level_pfn, start_pfn,
910 last_pfn);
911 }
912
913 /*
914 * Free the page table if we're below the level we want to
915 * retain and the range covers the entire table.
916 */
917 if (level < retain_level && !(start_pfn > level_pfn ||
918 last_pfn < level_pfn + level_size(level) - 1)) {
919 dma_clear_pte(pte);
920 domain_flush_cache(domain, pte, sizeof(*pte));
921 free_pgtable_page(level_pte);
922 }
923next:
924 pfn += level_size(level);
925 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
926}
927
928/*
929 * clear last level (leaf) ptes and free page table pages below the
930 * level we wish to keep intact.
931 */
932static void dma_pte_free_pagetable(struct dmar_domain *domain,
933 unsigned long start_pfn,
934 unsigned long last_pfn,
935 int retain_level)
936{
937 dma_pte_clear_range(domain, start_pfn, last_pfn);
938
939 /* We don't need lock here; nobody else touches the iova range */
940 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
941 domain->pgd, 0, start_pfn, last_pfn);
942
943 /* free pgd */
944 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
945 free_pgtable_page(domain->pgd);
946 domain->pgd = NULL;
947 }
948}
949
950/* When a page at a given level is being unlinked from its parent, we don't
951 need to *modify* it at all. All we need to do is make a list of all the
952 pages which can be freed just as soon as we've flushed the IOTLB and we
953 know the hardware page-walk will no longer touch them.
954 The 'pte' argument is the *parent* PTE, pointing to the page that is to
955 be freed. */
956static void dma_pte_list_pagetables(struct dmar_domain *domain,
957 int level, struct dma_pte *pte,
958 struct list_head *freelist)
959{
960 struct page *pg;
961
962 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
963 list_add_tail(&pg->lru, freelist);
964
965 if (level == 1)
966 return;
967
968 pte = page_address(pg);
969 do {
970 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
971 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
972 pte++;
973 } while (!first_pte_in_page(pte));
974}
975
976static void dma_pte_clear_level(struct dmar_domain *domain, int level,
977 struct dma_pte *pte, unsigned long pfn,
978 unsigned long start_pfn, unsigned long last_pfn,
979 struct list_head *freelist)
980{
981 struct dma_pte *first_pte = NULL, *last_pte = NULL;
982
983 pfn = max(start_pfn, pfn);
984 pte = &pte[pfn_level_offset(pfn, level)];
985
986 do {
987 unsigned long level_pfn = pfn & level_mask(level);
988
989 if (!dma_pte_present(pte))
990 goto next;
991
992 /* If range covers entire pagetable, free it */
993 if (start_pfn <= level_pfn &&
994 last_pfn >= level_pfn + level_size(level) - 1) {
 995			/* These subordinate page tables are going away entirely. Don't
996 bother to clear them; we're just going to *free* them. */
997 if (level > 1 && !dma_pte_superpage(pte))
998 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
999
1000 dma_clear_pte(pte);
1001 if (!first_pte)
1002 first_pte = pte;
1003 last_pte = pte;
1004 } else if (level > 1) {
1005 /* Recurse down into a level that isn't *entirely* obsolete */
1006 dma_pte_clear_level(domain, level - 1,
1007 phys_to_virt(dma_pte_addr(pte)),
1008 level_pfn, start_pfn, last_pfn,
1009 freelist);
1010 }
1011next:
1012 pfn = level_pfn + level_size(level);
1013 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1014
1015 if (first_pte)
1016 domain_flush_cache(domain, first_pte,
1017 (void *)++last_pte - (void *)first_pte);
1018}
1019
1020/* We can't just free the pages because the IOMMU may still be walking
1021 the page tables, and may have cached the intermediate levels. The
1022 pages can only be freed after the IOTLB flush has been done. */
1023static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1024 unsigned long last_pfn, struct list_head *freelist)
1025{
1026 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
1027 WARN_ON(start_pfn > last_pfn))
1028 return;
1029
1030 /* we don't need lock here; nobody else touches the iova range */
1031 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1032 domain->pgd, 0, start_pfn, last_pfn, freelist);
1033
1034 /* free pgd */
1035 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1036 struct page *pgd_page = virt_to_page(domain->pgd);
1037 list_add_tail(&pgd_page->lru, freelist);
1038 domain->pgd = NULL;
1039 }
1040}
1041
1042/* iommu handling */
1043static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1044{
1045 struct root_entry *root;
1046
1047 root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
1048 if (!root) {
1049 pr_err("Allocating root entry for %s failed\n",
1050 iommu->name);
1051 return -ENOMEM;
1052 }
1053
1054 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1055 iommu->root_entry = root;
1056
1057 return 0;
1058}
1059
1060static void iommu_set_root_entry(struct intel_iommu *iommu)
1061{
1062 u64 addr;
1063 u32 sts;
1064 unsigned long flag;
1065
1066 addr = virt_to_phys(iommu->root_entry);
1067 if (sm_supported(iommu))
1068 addr |= DMA_RTADDR_SMT;
1069
1070 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1071 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1072
1073 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1074
1075 /* Make sure hardware complete it */
1076 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1077 readl, (sts & DMA_GSTS_RTPS), sts);
1078
1079 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1080
1081 /*
1082 * Hardware invalidates all DMA remapping hardware translation
1083 * caches as part of SRTP flow.
1084 */
1085 if (cap_esrtps(iommu->cap))
1086 return;
1087
1088 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1089 if (sm_supported(iommu))
1090 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1091 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1092}
1093
1094void iommu_flush_write_buffer(struct intel_iommu *iommu)
1095{
1096 u32 val;
1097 unsigned long flag;
1098
1099 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1100 return;
1101
1102 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1103 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1104
1105 /* Make sure hardware complete it */
1106 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1107 readl, (!(val & DMA_GSTS_WBFS)), val);
1108
1109 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1110}
1111
 1112/* return value determines if we need a write buffer flush */
1113static void __iommu_flush_context(struct intel_iommu *iommu,
1114 u16 did, u16 source_id, u8 function_mask,
1115 u64 type)
1116{
1117 u64 val = 0;
1118 unsigned long flag;
1119
1120 switch (type) {
1121 case DMA_CCMD_GLOBAL_INVL:
1122 val = DMA_CCMD_GLOBAL_INVL;
1123 break;
1124 case DMA_CCMD_DOMAIN_INVL:
1125 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1126 break;
1127 case DMA_CCMD_DEVICE_INVL:
1128 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1129 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1130 break;
1131 default:
1132 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1133 iommu->name, type);
1134 return;
1135 }
1136 val |= DMA_CCMD_ICC;
1137
1138 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1139 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1140
1141 /* Make sure hardware complete it */
1142 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1143 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1144
1145 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1146}
1147
 1148/* return value determines if we need a write buffer flush */
1149static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1150 u64 addr, unsigned int size_order, u64 type)
1151{
1152 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1153 u64 val = 0, val_iva = 0;
1154 unsigned long flag;
1155
1156 switch (type) {
1157 case DMA_TLB_GLOBAL_FLUSH:
 1158		/* global flush doesn't need to set IVA_REG */
1159 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1160 break;
1161 case DMA_TLB_DSI_FLUSH:
1162 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1163 break;
1164 case DMA_TLB_PSI_FLUSH:
1165 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1166 /* IH bit is passed in as part of address */
1167 val_iva = size_order | addr;
1168 break;
1169 default:
1170 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1171 iommu->name, type);
1172 return;
1173 }
1174
1175 if (cap_write_drain(iommu->cap))
1176 val |= DMA_TLB_WRITE_DRAIN;
1177
1178 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1179 /* Note: Only uses first TLB reg currently */
1180 if (val_iva)
1181 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1182 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1183
1184 /* Make sure hardware complete it */
1185 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1186 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1187
1188 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1189
1190 /* check IOTLB invalidation granularity */
1191 if (DMA_TLB_IAIG(val) == 0)
1192 pr_err("Flush IOTLB failed\n");
1193 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1194 pr_debug("TLB flush request %Lx, actual %Lx\n",
1195 (unsigned long long)DMA_TLB_IIRG(type),
1196 (unsigned long long)DMA_TLB_IAIG(val));
1197}
1198
1199static struct device_domain_info *
1200domain_lookup_dev_info(struct dmar_domain *domain,
1201 struct intel_iommu *iommu, u8 bus, u8 devfn)
1202{
1203 struct device_domain_info *info;
1204 unsigned long flags;
1205
1206 spin_lock_irqsave(&domain->lock, flags);
1207 list_for_each_entry(info, &domain->devices, link) {
1208 if (info->iommu == iommu && info->bus == bus &&
1209 info->devfn == devfn) {
1210 spin_unlock_irqrestore(&domain->lock, flags);
1211 return info;
1212 }
1213 }
1214 spin_unlock_irqrestore(&domain->lock, flags);
1215
1216 return NULL;
1217}
1218
1219void domain_update_iotlb(struct dmar_domain *domain)
1220{
1221 struct dev_pasid_info *dev_pasid;
1222 struct device_domain_info *info;
1223 bool has_iotlb_device = false;
1224 unsigned long flags;
1225
1226 spin_lock_irqsave(&domain->lock, flags);
1227 list_for_each_entry(info, &domain->devices, link) {
1228 if (info->ats_enabled) {
1229 has_iotlb_device = true;
1230 break;
1231 }
1232 }
1233
1234 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1235 info = dev_iommu_priv_get(dev_pasid->dev);
1236 if (info->ats_enabled) {
1237 has_iotlb_device = true;
1238 break;
1239 }
1240 }
1241 domain->has_iotlb_device = has_iotlb_device;
1242 spin_unlock_irqrestore(&domain->lock, flags);
1243}
1244
1245/*
1246 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1247 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1248 * check because it applies only to the built-in QAT devices and it doesn't
1249 * grant additional privileges.
1250 */
1251#define BUGGY_QAT_DEVID_MASK 0x4940
1252static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1253{
1254 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1255 return false;
1256
1257 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1258 return false;
1259
1260 return true;
1261}
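/*
 * Quick check of the mask above: device IDs 0x4940-0x4943 differ only
 * in their two lowest bits, so (device & 0xfffc) == 0x4940 matches
 * exactly that range and nothing else.
 */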
1262
1263static void iommu_enable_pci_caps(struct device_domain_info *info)
1264{
1265 struct pci_dev *pdev;
1266
1267 if (!dev_is_pci(info->dev))
1268 return;
1269
1270 pdev = to_pci_dev(info->dev);
1271
1272 /* The PCIe spec, in its wisdom, declares that the behaviour of
 1273	   the device is undefined if you enable PASID support after ATS
 1274	   support. So always enable PASID support on devices which
1275 have it, even if we can't yet know if we're ever going to
1276 use it. */
1277 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1278 info->pasid_enabled = 1;
1279
1280 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1281 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1282 info->ats_enabled = 1;
1283 domain_update_iotlb(info->domain);
1284 }
1285}
1286
1287static void iommu_disable_pci_caps(struct device_domain_info *info)
1288{
1289 struct pci_dev *pdev;
1290
1291 if (!dev_is_pci(info->dev))
1292 return;
1293
1294 pdev = to_pci_dev(info->dev);
1295
1296 if (info->ats_enabled) {
1297 pci_disable_ats(pdev);
1298 info->ats_enabled = 0;
1299 domain_update_iotlb(info->domain);
1300 }
1301
1302 if (info->pasid_enabled) {
1303 pci_disable_pasid(pdev);
1304 info->pasid_enabled = 0;
1305 }
1306}
1307
1308static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1309 u64 addr, unsigned int mask)
1310{
1311 u16 sid, qdep;
1312
1313 if (!info || !info->ats_enabled)
1314 return;
1315
1316 sid = info->bus << 8 | info->devfn;
1317 qdep = info->ats_qdep;
1318 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1319 qdep, addr, mask);
1320 quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID, qdep);
1321}
1322
1323static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1324 u64 addr, unsigned mask)
1325{
1326 struct dev_pasid_info *dev_pasid;
1327 struct device_domain_info *info;
1328 unsigned long flags;
1329
1330 if (!domain->has_iotlb_device)
1331 return;
1332
1333 spin_lock_irqsave(&domain->lock, flags);
1334 list_for_each_entry(info, &domain->devices, link)
1335 __iommu_flush_dev_iotlb(info, addr, mask);
1336
1337 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain) {
1338 info = dev_iommu_priv_get(dev_pasid->dev);
1339
1340 if (!info->ats_enabled)
1341 continue;
1342
1343 qi_flush_dev_iotlb_pasid(info->iommu,
1344 PCI_DEVID(info->bus, info->devfn),
1345 info->pfsid, dev_pasid->pasid,
1346 info->ats_qdep, addr,
1347 mask);
1348 }
1349 spin_unlock_irqrestore(&domain->lock, flags);
1350}
1351
1352static void domain_flush_pasid_iotlb(struct intel_iommu *iommu,
1353 struct dmar_domain *domain, u64 addr,
1354 unsigned long npages, bool ih)
1355{
1356 u16 did = domain_id_iommu(domain, iommu);
1357 struct dev_pasid_info *dev_pasid;
1358 unsigned long flags;
1359
1360 spin_lock_irqsave(&domain->lock, flags);
1361 list_for_each_entry(dev_pasid, &domain->dev_pasids, link_domain)
1362 qi_flush_piotlb(iommu, did, dev_pasid->pasid, addr, npages, ih);
1363
1364 if (!list_empty(&domain->devices))
1365 qi_flush_piotlb(iommu, did, IOMMU_NO_PASID, addr, npages, ih);
1366 spin_unlock_irqrestore(&domain->lock, flags);
1367}
1368
1369static void __iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1370 unsigned long pfn, unsigned int pages,
1371 int ih)
1372{
1373 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1374 unsigned long bitmask = aligned_pages - 1;
1375 unsigned int mask = ilog2(aligned_pages);
1376 u64 addr = (u64)pfn << VTD_PAGE_SHIFT;
1377
1378 /*
1379 * PSI masks the low order bits of the base address. If the
1380 * address isn't aligned to the mask, then compute a mask value
1381 * needed to ensure the target range is flushed.
1382 */
1383 if (unlikely(bitmask & pfn)) {
1384 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1385
1386 /*
1387 * Since end_pfn <= pfn + bitmask, the only way bits
1388 * higher than bitmask can differ in pfn and end_pfn is
1389 * by carrying. This means after masking out bitmask,
1390 * high bits starting with the first set bit in
1391 * shared_bits are all equal in both pfn and end_pfn.
1392 */
1393 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1394 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1395 }
1396
1397 /*
 1398	 * Fall back to a domain-selective flush if there is no PSI
 1399	 * support or the size is too big.
1400 */
1401 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1402 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1403 DMA_TLB_DSI_FLUSH);
1404 else
1405 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1406 DMA_TLB_PSI_FLUSH);
1407}
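/*
 * Worked example for the unaligned case above: flushing 4 pages
 * starting at pfn 0x1003 gives aligned_pages == 4, bitmask == 0x3 and
 * 0x1003 & 0x3 != 0. end_pfn is 0x1006, pfn ^ end_pfn is 0x5, so
 * shared_bits == ~0x5 & ~0x3 ends in ...f8 and __ffs() returns 3.
 * The PSI therefore uses mask 3, i.e. 8 pages aligned at 0x1000,
 * which covers 0x1003-0x1006 at the cost of a slightly wider flush.
 */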
1408
1409static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1410 struct dmar_domain *domain,
1411 unsigned long pfn, unsigned int pages,
1412 int ih, int map)
1413{
1414 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1415 unsigned int mask = ilog2(aligned_pages);
1416 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1417 u16 did = domain_id_iommu(domain, iommu);
1418
1419 if (WARN_ON(!pages))
1420 return;
1421
1422 if (ih)
1423 ih = 1 << 6;
1424
1425 if (domain->use_first_level)
1426 domain_flush_pasid_iotlb(iommu, domain, addr, pages, ih);
1427 else
1428 __iommu_flush_iotlb_psi(iommu, did, pfn, pages, ih);
1429
1430 /*
 1431	 * In caching mode, changing pages from non-present to present requires
 1432	 * a flush. However, the device IOTLB doesn't need to be flushed here.
1433 */
1434 if (!cap_caching_mode(iommu->cap) || !map)
1435 iommu_flush_dev_iotlb(domain, addr, mask);
1436}
1437
1438/* Notification for newly created mappings */
1439static void __mapping_notify_one(struct intel_iommu *iommu, struct dmar_domain *domain,
1440 unsigned long pfn, unsigned int pages)
1441{
1442 /*
 1443	 * It's a non-present to present mapping. Only flush if caching mode
 1444	 * is enabled and second-level translation is in use.
1445 */
1446 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1447 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1448 else
1449 iommu_flush_write_buffer(iommu);
1450}
1451
1452/*
1453 * Flush the relevant caches in nested translation if the domain
1454 * also serves as a parent
1455 */
1456static void parent_domain_flush(struct dmar_domain *domain,
1457 unsigned long pfn,
1458 unsigned long pages, int ih)
1459{
1460 struct dmar_domain *s1_domain;
1461
1462 spin_lock(&domain->s1_lock);
1463 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
1464 struct device_domain_info *device_info;
1465 struct iommu_domain_info *info;
1466 unsigned long flags;
1467 unsigned long i;
1468
1469 xa_for_each(&s1_domain->iommu_array, i, info)
1470 __iommu_flush_iotlb_psi(info->iommu, info->did,
1471 pfn, pages, ih);
1472
1473 if (!s1_domain->has_iotlb_device)
1474 continue;
1475
1476 spin_lock_irqsave(&s1_domain->lock, flags);
1477 list_for_each_entry(device_info, &s1_domain->devices, link)
1478 /*
 1479			 * The address translation cache on the device side caches
 1480			 * the result of nested translation. There is no easy way
1481 * to identify the exact set of nested translations
1482 * affected by a change in S2. So just flush the entire
1483 * device cache.
1484 */
1485 __iommu_flush_dev_iotlb(device_info, 0,
1486 MAX_AGAW_PFN_WIDTH);
1487 spin_unlock_irqrestore(&s1_domain->lock, flags);
1488 }
1489 spin_unlock(&domain->s1_lock);
1490}
1491
1492static void intel_flush_iotlb_all(struct iommu_domain *domain)
1493{
1494 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1495 struct iommu_domain_info *info;
1496 unsigned long idx;
1497
1498 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1499 struct intel_iommu *iommu = info->iommu;
1500 u16 did = domain_id_iommu(dmar_domain, iommu);
1501
1502 if (dmar_domain->use_first_level)
1503 domain_flush_pasid_iotlb(iommu, dmar_domain, 0, -1, 0);
1504 else
1505 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1506 DMA_TLB_DSI_FLUSH);
1507
1508 if (!cap_caching_mode(iommu->cap))
1509 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1510 }
1511
1512 if (dmar_domain->nested_parent)
1513 parent_domain_flush(dmar_domain, 0, -1, 0);
1514}
1515
1516static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1517{
1518 u32 pmen;
1519 unsigned long flags;
1520
1521 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1522 return;
1523
1524 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1525 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1526 pmen &= ~DMA_PMEN_EPM;
1527 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1528
1529 /* wait for the protected region status bit to clear */
1530 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1531 readl, !(pmen & DMA_PMEN_PRS), pmen);
1532
1533 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1534}
1535
1536static void iommu_enable_translation(struct intel_iommu *iommu)
1537{
1538 u32 sts;
1539 unsigned long flags;
1540
1541 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1542 iommu->gcmd |= DMA_GCMD_TE;
1543 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1544
1545 /* Make sure hardware complete it */
1546 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1547 readl, (sts & DMA_GSTS_TES), sts);
1548
1549 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1550}
1551
1552static void iommu_disable_translation(struct intel_iommu *iommu)
1553{
1554 u32 sts;
1555 unsigned long flag;
1556
1557 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1558 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1559 return;
1560
1561 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1562 iommu->gcmd &= ~DMA_GCMD_TE;
1563 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1564
1565 /* Make sure hardware complete it */
1566 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1567 readl, (!(sts & DMA_GSTS_TES)), sts);
1568
1569 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1570}
1571
1572static int iommu_init_domains(struct intel_iommu *iommu)
1573{
1574 u32 ndomains;
1575
1576 ndomains = cap_ndoms(iommu->cap);
1577 pr_debug("%s: Number of Domains supported <%d>\n",
1578 iommu->name, ndomains);
1579
1580 spin_lock_init(&iommu->lock);
1581
1582 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1583 if (!iommu->domain_ids)
1584 return -ENOMEM;
1585
1586 /*
1587 * If Caching mode is set, then invalid translations are tagged
1588 * with domain-id 0, hence we need to pre-allocate it. We also
1589 * use domain-id 0 as a marker for non-allocated domain-id, so
1590 * make sure it is not used for a real domain.
1591 */
1592 set_bit(0, iommu->domain_ids);
1593
1594 /*
1595 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1596 * entry for first-level or pass-through translation modes should
1597 * be programmed with a domain id different from those used for
1598 * second-level or nested translation. We reserve a domain id for
1599 * this purpose.
1600 */
1601 if (sm_supported(iommu))
1602 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1603
1604 return 0;
1605}
1606
1607static void disable_dmar_iommu(struct intel_iommu *iommu)
1608{
1609 if (!iommu->domain_ids)
1610 return;
1611
1612 /*
1613 * All iommu domains must have been detached from the devices,
1614 * hence there should be no domain IDs in use.
1615 */
1616 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1617 > NUM_RESERVED_DID))
1618 return;
1619
1620 if (iommu->gcmd & DMA_GCMD_TE)
1621 iommu_disable_translation(iommu);
1622}
1623
1624static void free_dmar_iommu(struct intel_iommu *iommu)
1625{
1626 if (iommu->domain_ids) {
1627 bitmap_free(iommu->domain_ids);
1628 iommu->domain_ids = NULL;
1629 }
1630
1631 if (iommu->copied_tables) {
1632 bitmap_free(iommu->copied_tables);
1633 iommu->copied_tables = NULL;
1634 }
1635
1636 /* free context mapping */
1637 free_context_table(iommu);
1638
1639#ifdef CONFIG_INTEL_IOMMU_SVM
1640 if (pasid_supported(iommu)) {
1641 if (ecap_prs(iommu->ecap))
1642 intel_svm_finish_prq(iommu);
1643 }
1644#endif
1645}
1646
1647/*
1648 * Check and return whether first level is used by default for
1649 * DMA translation.
1650 */
1651static bool first_level_by_default(unsigned int type)
1652{
1653 /* Only SL is available in legacy mode */
1654 if (!scalable_mode_support())
1655 return false;
1656
 1657	/* Only one level (either FL or SL) is available, just use it */
1658 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1659 return intel_cap_flts_sanity();
1660
1661 /* Both levels are available, decide it based on domain type */
1662 return type != IOMMU_DOMAIN_UNMANAGED;
1663}
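/*
 * Decision table for the helper above:
 *
 *   legacy mode (no scalable mode)         -> second level
 *   only first level supported             -> first level
 *   only second level supported            -> second level
 *   both supported, IOMMU_DOMAIN_UNMANAGED -> second level
 *   both supported, other domain types     -> first level
 */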
1664
1665static struct dmar_domain *alloc_domain(unsigned int type)
1666{
1667 struct dmar_domain *domain;
1668
1669 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1670 if (!domain)
1671 return NULL;
1672
1673 domain->nid = NUMA_NO_NODE;
1674 if (first_level_by_default(type))
1675 domain->use_first_level = true;
1676 domain->has_iotlb_device = false;
1677 INIT_LIST_HEAD(&domain->devices);
1678 INIT_LIST_HEAD(&domain->dev_pasids);
1679 spin_lock_init(&domain->lock);
1680 xa_init(&domain->iommu_array);
1681
1682 return domain;
1683}
1684
1685int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1686{
1687 struct iommu_domain_info *info, *curr;
1688 unsigned long ndomains;
1689 int num, ret = -ENOSPC;
1690
1691 info = kzalloc(sizeof(*info), GFP_KERNEL);
1692 if (!info)
1693 return -ENOMEM;
1694
1695 spin_lock(&iommu->lock);
1696 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1697 if (curr) {
1698 curr->refcnt++;
1699 spin_unlock(&iommu->lock);
1700 kfree(info);
1701 return 0;
1702 }
1703
1704 ndomains = cap_ndoms(iommu->cap);
1705 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1706 if (num >= ndomains) {
1707 pr_err("%s: No free domain ids\n", iommu->name);
1708 goto err_unlock;
1709 }
1710
1711 set_bit(num, iommu->domain_ids);
1712 info->refcnt = 1;
1713 info->did = num;
1714 info->iommu = iommu;
1715 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1716 NULL, info, GFP_ATOMIC);
1717 if (curr) {
1718 ret = xa_err(curr) ? : -EBUSY;
1719 goto err_clear;
1720 }
1721 domain_update_iommu_cap(domain);
1722
1723 spin_unlock(&iommu->lock);
1724 return 0;
1725
1726err_clear:
1727 clear_bit(info->did, iommu->domain_ids);
1728err_unlock:
1729 spin_unlock(&iommu->lock);
1730 kfree(info);
1731 return ret;
1732}
1733
1734void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1735{
1736 struct iommu_domain_info *info;
1737
1738 spin_lock(&iommu->lock);
1739 info = xa_load(&domain->iommu_array, iommu->seq_id);
1740 if (--info->refcnt == 0) {
1741 clear_bit(info->did, iommu->domain_ids);
1742 xa_erase(&domain->iommu_array, iommu->seq_id);
1743 domain->nid = NUMA_NO_NODE;
1744 domain_update_iommu_cap(domain);
1745 kfree(info);
1746 }
1747 spin_unlock(&iommu->lock);
1748}
1749
1750static int guestwidth_to_adjustwidth(int gaw)
1751{
1752 int agaw;
1753 int r = (gaw - 12) % 9;
1754
1755 if (r == 0)
1756 agaw = gaw;
1757 else
1758 agaw = gaw + 9 - r;
1759 if (agaw > 64)
1760 agaw = 64;
1761 return agaw;
1762}
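/*
 * Worked examples for the rounding above: the adjusted width is the
 * guest width rounded up to the next 12 + 9 * n value and capped at
 * 64, e.g. gaw 48 -> r == 0 -> agaw 48; gaw 52 -> r == 4 ->
 * agaw 52 + 9 - 4 == 57; gaw 62 -> agaw 64 after capping.
 */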
1763
1764static void domain_exit(struct dmar_domain *domain)
1765{
1766 if (domain->pgd) {
1767 LIST_HEAD(freelist);
1768
1769 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1770 put_pages_list(&freelist);
1771 }
1772
1773 if (WARN_ON(!list_empty(&domain->devices)))
1774 return;
1775
1776 kfree(domain);
1777}
1778
1779/*
 1780 * Get the PASID directory size for a scalable mode context entry.
 1781 * A value of X in the PDTS field of a scalable mode context entry
 1782 * indicates a PASID directory with 2^(X + 7) entries.
1783 */
1784static unsigned long context_get_sm_pds(struct pasid_table *table)
1785{
1786 unsigned long pds, max_pde;
1787
1788 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1789 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1790 if (pds < 7)
1791 return 0;
1792
1793 return pds - 7;
1794}
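/*
 * Worked example for the computation above, assuming the usual
 * PASID_PDE_SHIFT of 6 (64 PASIDs per directory entry): with
 * table->max_pasid == 1 << 20, max_pde is 1 << 14, find_first_bit()
 * returns 14 and the function returns 14 - 7 == 7, which the context
 * entry encodes as a PASID directory of 2^(7 + 7) == 16384 entries.
 */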
1795
1796static int domain_context_mapping_one(struct dmar_domain *domain,
1797 struct intel_iommu *iommu,
1798 struct pasid_table *table,
1799 u8 bus, u8 devfn)
1800{
1801 struct device_domain_info *info =
1802 domain_lookup_dev_info(domain, iommu, bus, devfn);
1803 u16 did = domain_id_iommu(domain, iommu);
1804 int translation = CONTEXT_TT_MULTI_LEVEL;
1805 struct context_entry *context;
1806 int ret;
1807
1808 if (hw_pass_through && domain_type_is_si(domain))
1809 translation = CONTEXT_TT_PASS_THROUGH;
1810
1811 pr_debug("Set context mapping for %02x:%02x.%d\n",
1812 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1813
1814 spin_lock(&iommu->lock);
1815 ret = -ENOMEM;
1816 context = iommu_context_addr(iommu, bus, devfn, 1);
1817 if (!context)
1818 goto out_unlock;
1819
1820 ret = 0;
1821 if (context_present(context) && !context_copied(iommu, bus, devfn))
1822 goto out_unlock;
1823
1824 /*
 1825	 * For kdump cases, old valid entries may be cached due to
 1826	 * in-flight DMA and the copied pgtable, but there is no unmapping
 1827	 * behaviour for them, so we need an explicit cache flush for the
 1828	 * newly-mapped device. At this point the device is expected to
 1829	 * have finished its reset during driver probe, so no in-flight
 1830	 * DMA will exist, and we don't need to worry about it
 1831	 * hereafter.
1832 */
1833 if (context_copied(iommu, bus, devfn)) {
1834 u16 did_old = context_domain_id(context);
1835
1836 if (did_old < cap_ndoms(iommu->cap)) {
1837 iommu->flush.flush_context(iommu, did_old,
1838 (((u16)bus) << 8) | devfn,
1839 DMA_CCMD_MASK_NOBIT,
1840 DMA_CCMD_DEVICE_INVL);
1841 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1842 DMA_TLB_DSI_FLUSH);
1843 }
1844
1845 clear_context_copied(iommu, bus, devfn);
1846 }
1847
1848 context_clear_entry(context);
1849
1850 if (sm_supported(iommu)) {
1851 unsigned long pds;
1852
1853 /* Setup the PASID DIR pointer: */
1854 pds = context_get_sm_pds(table);
1855 context->lo = (u64)virt_to_phys(table->table) |
1856 context_pdts(pds);
1857
1858 /* Setup the RID_PASID field: */
1859 context_set_sm_rid2pasid(context, IOMMU_NO_PASID);
1860
1861 /*
1862 * Setup the Device-TLB enable bit and Page request
1863 * Enable bit:
1864 */
1865 if (info && info->ats_supported)
1866 context_set_sm_dte(context);
1867 if (info && info->pri_supported)
1868 context_set_sm_pre(context);
1869 if (info && info->pasid_supported)
1870 context_set_pasid(context);
1871 } else {
1872 struct dma_pte *pgd = domain->pgd;
1873 int agaw;
1874
1875 context_set_domain_id(context, did);
1876
1877 if (translation != CONTEXT_TT_PASS_THROUGH) {
1878 /*
 1879		 * Skip top levels of page tables for an iommu whose agaw
 1880		 * is smaller than the default. Unnecessary for PT mode.
1881 */
1882 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1883 ret = -ENOMEM;
1884 pgd = phys_to_virt(dma_pte_addr(pgd));
1885 if (!dma_pte_present(pgd))
1886 goto out_unlock;
1887 }
1888
1889 if (info && info->ats_supported)
1890 translation = CONTEXT_TT_DEV_IOTLB;
1891 else
1892 translation = CONTEXT_TT_MULTI_LEVEL;
1893
1894 context_set_address_root(context, virt_to_phys(pgd));
1895 context_set_address_width(context, agaw);
1896 } else {
1897 /*
1898 * In pass through mode, AW must be programmed to
1899 * indicate the largest AGAW value supported by
1900 * hardware. And ASR is ignored by hardware.
1901 */
1902 context_set_address_width(context, iommu->msagaw);
1903 }
1904
1905 context_set_translation_type(context, translation);
1906 }
1907
1908 context_set_fault_enable(context);
1909 context_set_present(context);
1910 if (!ecap_coherent(iommu->ecap))
1911 clflush_cache_range(context, sizeof(*context));
1912
1913 /*
1914 * It's a non-present to present mapping. If hardware doesn't cache
 1915	 * non-present entries we only need to flush the write-buffer. If it
 1916	 * _does_ cache non-present entries, then it does so in the special
1917 * domain #0, which we have to flush:
1918 */
1919 if (cap_caching_mode(iommu->cap)) {
1920 iommu->flush.flush_context(iommu, 0,
1921 (((u16)bus) << 8) | devfn,
1922 DMA_CCMD_MASK_NOBIT,
1923 DMA_CCMD_DEVICE_INVL);
1924 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1925 } else {
1926 iommu_flush_write_buffer(iommu);
1927 }
1928
1929 ret = 0;
1930
1931out_unlock:
1932 spin_unlock(&iommu->lock);
1933
1934 return ret;
1935}
1936
1937struct domain_context_mapping_data {
1938 struct dmar_domain *domain;
1939 struct intel_iommu *iommu;
1940 struct pasid_table *table;
1941};
1942
1943static int domain_context_mapping_cb(struct pci_dev *pdev,
1944 u16 alias, void *opaque)
1945{
1946 struct domain_context_mapping_data *data = opaque;
1947
1948 return domain_context_mapping_one(data->domain, data->iommu,
1949 data->table, PCI_BUS_NUM(alias),
1950 alias & 0xff);
1951}
1952
1953static int
1954domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1955{
1956 struct device_domain_info *info = dev_iommu_priv_get(dev);
1957 struct domain_context_mapping_data data;
1958 struct intel_iommu *iommu = info->iommu;
1959 u8 bus = info->bus, devfn = info->devfn;
1960 struct pasid_table *table;
1961
1962 table = intel_pasid_get_table(dev);
1963
1964 if (!dev_is_pci(dev))
1965 return domain_context_mapping_one(domain, iommu, table,
1966 bus, devfn);
1967
1968 data.domain = domain;
1969 data.iommu = iommu;
1970 data.table = table;
1971
1972 return pci_for_each_dma_alias(to_pci_dev(dev),
1973 &domain_context_mapping_cb, &data);
1974}
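/*
 * For a plain PCIe endpoint, pci_for_each_dma_alias() simply invokes the
 * callback once with the device's own bus/devfn, so the result is the same
 * as calling domain_context_mapping_one() directly. For a device behind a
 * PCIe-to-PCI bridge, or one with quirk-declared aliases, the additional
 * requester IDs are visited as well, so a context entry gets programmed for
 * every RID its DMA may arrive with.
 */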
1975
1976/* Returns a number of VTD pages, but aligned to MM page size */
1977static unsigned long aligned_nrpages(unsigned long host_addr, size_t size)
1978{
1979 host_addr &= ~PAGE_MASK;
1980 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1981}
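/*
 * Worked example (assuming 4KiB MM pages and 4KiB VT-d pages): for
 * host_addr = 0x1234 and size = 0x2000, the low bits of the address are
 * 0x234, PAGE_ALIGN(0x234 + 0x2000) = 0x3000, and 0x3000 >> VTD_PAGE_SHIFT
 * yields 3 pages -- one more than the size alone would suggest, because the
 * range straddles an extra page boundary.
 */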
1982
1983/* Return largest possible superpage level for a given mapping */
1984static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1985 unsigned long phy_pfn, unsigned long pages)
1986{
1987 int support, level = 1;
1988 unsigned long pfnmerge;
1989
1990 support = domain->iommu_superpage;
1991
1992 /* To use a large page, the virtual *and* physical addresses
1993 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1994 of them will mean we have to use smaller pages. So just
1995 merge them and check both at once. */
1996 pfnmerge = iov_pfn | phy_pfn;
1997
1998 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1999 pages >>= VTD_STRIDE_SHIFT;
2000 if (!pages)
2001 break;
2002 pfnmerge >>= VTD_STRIDE_SHIFT;
2003 level++;
2004 support--;
2005 }
2006 return level;
2007}
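/*
 * Example (assuming the usual VT-d stride of 9 bits, i.e. 512 entries per
 * table level): with iov_pfn and phy_pfn both 512-page (2MiB) aligned,
 * pages = 1024 and domain->iommu_superpage = 1, the loop runs once and the
 * function returns level 2, so the caller may use 2MiB superpages. Any
 * misalignment in either PFN, or fewer than 512 remaining pages, keeps the
 * level at 1 (4KiB pages).
 */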
2008
2009/*
2010 * Ensure that old small page tables are removed to make room for superpage(s).
2011 * We're going to add new large pages, so make sure we don't remove their parent
2012 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2013 */
2014static void switch_to_super_page(struct dmar_domain *domain,
2015 unsigned long start_pfn,
2016 unsigned long end_pfn, int level)
2017{
2018 unsigned long lvl_pages = lvl_to_nr_pages(level);
2019 struct iommu_domain_info *info;
2020 struct dma_pte *pte = NULL;
2021 unsigned long i;
2022
2023 while (start_pfn <= end_pfn) {
2024 if (!pte)
2025 pte = pfn_to_dma_pte(domain, start_pfn, &level,
2026 GFP_ATOMIC);
2027
2028 if (dma_pte_present(pte)) {
2029 dma_pte_free_pagetable(domain, start_pfn,
2030 start_pfn + lvl_pages - 1,
2031 level + 1);
2032
2033 xa_for_each(&domain->iommu_array, i, info)
2034 iommu_flush_iotlb_psi(info->iommu, domain,
2035 start_pfn, lvl_pages,
2036 0, 0);
2037 if (domain->nested_parent)
2038 parent_domain_flush(domain, start_pfn,
2039 lvl_pages, 0);
2040 }
2041
2042 pte++;
2043 start_pfn += lvl_pages;
2044 if (first_pte_in_page(pte))
2045 pte = NULL;
2046 }
2047}
2048
2049static int
2050__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2051 unsigned long phys_pfn, unsigned long nr_pages, int prot,
2052 gfp_t gfp)
2053{
2054 struct dma_pte *first_pte = NULL, *pte = NULL;
2055 unsigned int largepage_lvl = 0;
2056 unsigned long lvl_pages = 0;
2057 phys_addr_t pteval;
2058 u64 attr;
2059
2060 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
2061 return -EINVAL;
2062
2063 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2064 return -EINVAL;
2065
2066 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
2067 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
2068 return -EINVAL;
2069 }
2070
2071 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2072 attr |= DMA_FL_PTE_PRESENT;
2073 if (domain->use_first_level) {
2074 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2075 if (prot & DMA_PTE_WRITE)
2076 attr |= DMA_FL_PTE_DIRTY;
2077 }
2078
2079 domain->has_mappings = true;
2080
2081 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2082
2083 while (nr_pages > 0) {
2084 uint64_t tmp;
2085
2086 if (!pte) {
2087 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2088 phys_pfn, nr_pages);
2089
2090 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
2091 gfp);
2092 if (!pte)
2093 return -ENOMEM;
2094 first_pte = pte;
2095
2096 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2097
			/* It is a large page */
2099 if (largepage_lvl > 1) {
2100 unsigned long end_pfn;
2101 unsigned long pages_to_remove;
2102
2103 pteval |= DMA_PTE_LARGE_PAGE;
2104 pages_to_remove = min_t(unsigned long, nr_pages,
2105 nr_pte_to_next_page(pte) * lvl_pages);
2106 end_pfn = iov_pfn + pages_to_remove - 1;
2107 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2108 } else {
2109 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2110 }
2111
2112 }
		/* We don't need a lock here; nobody else
		 * touches this iova range.
2115 */
2116 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2117 if (tmp) {
2118 static int dumps = 5;
2119 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2120 iov_pfn, tmp, (unsigned long long)pteval);
2121 if (dumps) {
2122 dumps--;
2123 debug_dma_dump_mappings(NULL);
2124 }
2125 WARN_ON(1);
2126 }
2127
2128 nr_pages -= lvl_pages;
2129 iov_pfn += lvl_pages;
2130 phys_pfn += lvl_pages;
2131 pteval += lvl_pages * VTD_PAGE_SIZE;
2132
2133 /* If the next PTE would be the first in a new page, then we
2134 * need to flush the cache on the entries we've just written.
2135 * And then we'll need to recalculate 'pte', so clear it and
2136 * let it get set again in the if (!pte) block above.
2137 *
2138 * If we're done (!nr_pages) we need to flush the cache too.
2139 *
2140 * Also if we've been setting superpages, we may need to
2141 * recalculate 'pte' and switch back to smaller pages for the
2142 * end of the mapping, if the trailing size is not enough to
2143 * use another superpage (i.e. nr_pages < lvl_pages).
2144 */
2145 pte++;
2146 if (!nr_pages || first_pte_in_page(pte) ||
2147 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2148 domain_flush_cache(domain, first_pte,
2149 (void *)pte - (void *)first_pte);
2150 pte = NULL;
2151 }
2152 }
2153
2154 return 0;
2155}
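/*
 * For example, mapping a 4MiB range whose IOVA and physical PFNs are both
 * 2MiB aligned (with superpage support reported by the hardware) ends up as
 * two level-2 PTEs with DMA_PTE_LARGE_PAGE set, while an unaligned request
 * of the same size falls back to 1024 level-1 (4KiB) PTEs. A trailing
 * remainder smaller than the superpage size is likewise mapped with 4KiB
 * PTEs, which is why the flush/recalculate check above also triggers when
 * nr_pages < lvl_pages.
 */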
2156
2157static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2158{
2159 struct intel_iommu *iommu = info->iommu;
2160 struct context_entry *context;
2161 u16 did_old;
2162
2163 if (!iommu)
2164 return;
2165
2166 spin_lock(&iommu->lock);
2167 context = iommu_context_addr(iommu, bus, devfn, 0);
2168 if (!context) {
2169 spin_unlock(&iommu->lock);
2170 return;
2171 }
2172
2173 if (sm_supported(iommu)) {
2174 if (hw_pass_through && domain_type_is_si(info->domain))
2175 did_old = FLPT_DEFAULT_DID;
2176 else
2177 did_old = domain_id_iommu(info->domain, iommu);
2178 } else {
2179 did_old = context_domain_id(context);
2180 }
2181
2182 context_clear_entry(context);
2183 __iommu_flush_cache(iommu, context, sizeof(*context));
2184 spin_unlock(&iommu->lock);
2185 iommu->flush.flush_context(iommu,
2186 did_old,
2187 (((u16)bus) << 8) | devfn,
2188 DMA_CCMD_MASK_NOBIT,
2189 DMA_CCMD_DEVICE_INVL);
2190
2191 if (sm_supported(iommu))
2192 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2193
2194 iommu->flush.flush_iotlb(iommu,
2195 did_old,
2196 0,
2197 0,
2198 DMA_TLB_DSI_FLUSH);
2199
2200 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2201}
2202
2203static int domain_setup_first_level(struct intel_iommu *iommu,
2204 struct dmar_domain *domain,
2205 struct device *dev,
2206 u32 pasid)
2207{
2208 struct dma_pte *pgd = domain->pgd;
2209 int agaw, level;
2210 int flags = 0;
2211
2212 /*
	 * Skip top levels of page tables for an iommu which has
	 * less agaw than the default. Unnecessary for PT mode.
2215 */
2216 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2217 pgd = phys_to_virt(dma_pte_addr(pgd));
2218 if (!dma_pte_present(pgd))
2219 return -ENOMEM;
2220 }
2221
2222 level = agaw_to_level(agaw);
2223 if (level != 4 && level != 5)
2224 return -EINVAL;
2225
2226 if (level == 5)
2227 flags |= PASID_FLAG_FL5LP;
2228
2229 if (domain->force_snooping)
2230 flags |= PASID_FLAG_PAGE_SNOOP;
2231
2232 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2233 domain_id_iommu(domain, iommu),
2234 flags);
2235}
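/*
 * With the usual VT-d AGAW encoding (30 + 9 * agaw bits of address space),
 * agaw 2 corresponds to a 48-bit, 4-level first-level table and agaw 3 to a
 * 57-bit, 5-level table; only those two geometries are accepted here, and
 * the 5-level case additionally sets PASID_FLAG_FL5LP in the PASID entry.
 */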
2236
2237static bool dev_is_real_dma_subdevice(struct device *dev)
2238{
2239 return dev && dev_is_pci(dev) &&
2240 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2241}
2242
2243static int iommu_domain_identity_map(struct dmar_domain *domain,
2244 unsigned long first_vpfn,
2245 unsigned long last_vpfn)
2246{
2247 /*
	 * The RMRR range might overlap with the physical memory range,
	 * so clear it first.
2250 */
2251 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2252
2253 return __domain_mapping(domain, first_vpfn,
2254 first_vpfn, last_vpfn - first_vpfn + 1,
2255 DMA_PTE_READ|DMA_PTE_WRITE, GFP_KERNEL);
2256}
2257
2258static int md_domain_init(struct dmar_domain *domain, int guest_width);
2259
2260static int __init si_domain_init(int hw)
2261{
2262 struct dmar_rmrr_unit *rmrr;
2263 struct device *dev;
2264 int i, nid, ret;
2265
2266 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2267 if (!si_domain)
2268 return -EFAULT;
2269
2270 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2271 domain_exit(si_domain);
2272 si_domain = NULL;
2273 return -EFAULT;
2274 }
2275
2276 if (hw)
2277 return 0;
2278
2279 for_each_online_node(nid) {
2280 unsigned long start_pfn, end_pfn;
2281 int i;
2282
2283 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2284 ret = iommu_domain_identity_map(si_domain,
2285 mm_to_dma_pfn_start(start_pfn),
					mm_to_dma_pfn_end(end_pfn - 1));
2287 if (ret)
2288 return ret;
2289 }
2290 }
2291
2292 /*
2293 * Identity map the RMRRs so that devices with RMRRs could also use
2294 * the si_domain.
2295 */
2296 for_each_rmrr_units(rmrr) {
2297 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2298 i, dev) {
2299 unsigned long long start = rmrr->base_address;
2300 unsigned long long end = rmrr->end_address;
2301
2302 if (WARN_ON(end < start ||
2303 end >> agaw_to_width(si_domain->agaw)))
2304 continue;
2305
2306 ret = iommu_domain_identity_map(si_domain,
2307 mm_to_dma_pfn_start(start >> PAGE_SHIFT),
2308 mm_to_dma_pfn_end(end >> PAGE_SHIFT));
2309 if (ret)
2310 return ret;
2311 }
2312 }
2313
2314 return 0;
2315}
2316
2317static int dmar_domain_attach_device(struct dmar_domain *domain,
2318 struct device *dev)
2319{
2320 struct device_domain_info *info = dev_iommu_priv_get(dev);
2321 struct intel_iommu *iommu = info->iommu;
2322 unsigned long flags;
2323 int ret;
2324
2325 ret = domain_attach_iommu(domain, iommu);
2326 if (ret)
2327 return ret;
2328 info->domain = domain;
2329 spin_lock_irqsave(&domain->lock, flags);
2330 list_add(&info->link, &domain->devices);
2331 spin_unlock_irqrestore(&domain->lock, flags);
2332
2333 /* PASID table is mandatory for a PCI device in scalable mode. */
2334 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2335 /* Setup the PASID entry for requests without PASID: */
2336 if (hw_pass_through && domain_type_is_si(domain))
2337 ret = intel_pasid_setup_pass_through(iommu,
2338 dev, IOMMU_NO_PASID);
2339 else if (domain->use_first_level)
2340 ret = domain_setup_first_level(iommu, domain, dev,
2341 IOMMU_NO_PASID);
2342 else
2343 ret = intel_pasid_setup_second_level(iommu, domain,
2344 dev, IOMMU_NO_PASID);
2345 if (ret) {
2346 dev_err(dev, "Setup RID2PASID failed\n");
2347 device_block_translation(dev);
2348 return ret;
2349 }
2350 }
2351
2352 ret = domain_context_mapping(domain, dev);
2353 if (ret) {
2354 dev_err(dev, "Domain context map failed\n");
2355 device_block_translation(dev);
2356 return ret;
2357 }
2358
2359 if (sm_supported(info->iommu) || !domain_type_is_si(info->domain))
2360 iommu_enable_pci_caps(info);
2361
2362 return 0;
2363}
2364
2365/**
2366 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2368 * @dev: device handle
2369 *
2370 * We assume that PCI USB devices with RMRRs have them largely
2371 * for historical reasons and that the RMRR space is not actively used post
2372 * boot. This exclusion may change if vendors begin to abuse it.
2373 *
2374 * The same exception is made for graphics devices, with the requirement that
2375 * any use of the RMRR regions will be torn down before assigning the device
2376 * to a guest.
2377 *
2378 * Return: true if the RMRR is relaxable, false otherwise
2379 */
2380static bool device_rmrr_is_relaxable(struct device *dev)
2381{
2382 struct pci_dev *pdev;
2383
2384 if (!dev_is_pci(dev))
2385 return false;
2386
2387 pdev = to_pci_dev(dev);
2388 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2389 return true;
2390 else
2391 return false;
2392}
2393
2394/*
2395 * Return the required default domain type for a specific device.
2396 *
 * @dev: the device in question
2399 *
2400 * Returns:
2401 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2403 * - 0: both identity and dynamic domains work for this device
2404 */
2405static int device_def_domain_type(struct device *dev)
2406{
2407 if (dev_is_pci(dev)) {
2408 struct pci_dev *pdev = to_pci_dev(dev);
2409
2410 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2411 return IOMMU_DOMAIN_IDENTITY;
2412
2413 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2414 return IOMMU_DOMAIN_IDENTITY;
2415 }
2416
2417 return 0;
2418}
2419
2420static void intel_iommu_init_qi(struct intel_iommu *iommu)
2421{
2422 /*
	 * Start from a sane iommu hardware state.
	 * If queued invalidation has already been initialized by us
	 * (for example, while enabling interrupt remapping) then
	 * things are already rolling from a sane state.
2427 */
2428 if (!iommu->qi) {
2429 /*
2430 * Clear any previous faults.
2431 */
2432 dmar_fault(-1, iommu);
2433 /*
2434 * Disable queued invalidation if supported and already enabled
2435 * before OS handover.
2436 */
2437 dmar_disable_qi(iommu);
2438 }
2439
2440 if (dmar_enable_qi(iommu)) {
2441 /*
2442 * Queued Invalidate not enabled, use Register Based Invalidate
2443 */
2444 iommu->flush.flush_context = __iommu_flush_context;
2445 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2446 pr_info("%s: Using Register based invalidation\n",
2447 iommu->name);
2448 } else {
2449 iommu->flush.flush_context = qi_flush_context;
2450 iommu->flush.flush_iotlb = qi_flush_iotlb;
2451 pr_info("%s: Using Queued invalidation\n", iommu->name);
2452 }
2453}
2454
2455static int copy_context_table(struct intel_iommu *iommu,
2456 struct root_entry *old_re,
2457 struct context_entry **tbl,
2458 int bus, bool ext)
2459{
2460 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2461 struct context_entry *new_ce = NULL, ce;
2462 struct context_entry *old_ce = NULL;
2463 struct root_entry re;
2464 phys_addr_t old_ce_phys;
2465
2466 tbl_idx = ext ? bus * 2 : bus;
2467 memcpy(&re, old_re, sizeof(re));
2468
2469 for (devfn = 0; devfn < 256; devfn++) {
2470 /* First calculate the correct index */
2471 idx = (ext ? devfn * 2 : devfn) % 256;
2472
2473 if (idx == 0) {
2474 /* First save what we may have and clean up */
2475 if (new_ce) {
2476 tbl[tbl_idx] = new_ce;
2477 __iommu_flush_cache(iommu, new_ce,
2478 VTD_PAGE_SIZE);
2479 pos = 1;
2480 }
2481
2482 if (old_ce)
2483 memunmap(old_ce);
2484
2485 ret = 0;
2486 if (devfn < 0x80)
2487 old_ce_phys = root_entry_lctp(&re);
2488 else
2489 old_ce_phys = root_entry_uctp(&re);
2490
2491 if (!old_ce_phys) {
2492 if (ext && devfn == 0) {
2493 /* No LCTP, try UCTP */
2494 devfn = 0x7f;
2495 continue;
2496 } else {
2497 goto out;
2498 }
2499 }
2500
2501 ret = -ENOMEM;
2502 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2503 MEMREMAP_WB);
2504 if (!old_ce)
2505 goto out;
2506
2507 new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
2508 if (!new_ce)
2509 goto out_unmap;
2510
2511 ret = 0;
2512 }
2513
2514 /* Now copy the context entry */
2515 memcpy(&ce, old_ce + idx, sizeof(ce));
2516
2517 if (!context_present(&ce))
2518 continue;
2519
2520 did = context_domain_id(&ce);
2521 if (did >= 0 && did < cap_ndoms(iommu->cap))
2522 set_bit(did, iommu->domain_ids);
2523
2524 set_context_copied(iommu, bus, devfn);
2525 new_ce[idx] = ce;
2526 }
2527
2528 tbl[tbl_idx + pos] = new_ce;
2529
2530 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2531
2532out_unmap:
2533 memunmap(old_ce);
2534
2535out:
2536 return ret;
2537}
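/*
 * Layout example: in scalable (extended) mode each bus gets two context
 * tables, so for bus 3 the copied entries for devfn 0x00-0x7f land in
 * tbl[6] (taken from the lower context table pointer) and those for devfn
 * 0x80-0xff in tbl[7] (from the upper pointer). In legacy mode bus 3 uses
 * the single table at tbl[3].
 */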
2538
2539static int copy_translation_tables(struct intel_iommu *iommu)
2540{
2541 struct context_entry **ctxt_tbls;
2542 struct root_entry *old_rt;
2543 phys_addr_t old_rt_phys;
2544 int ctxt_table_entries;
2545 u64 rtaddr_reg;
2546 int bus, ret;
2547 bool new_ext, ext;
2548
2549 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2550 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2551 new_ext = !!sm_supported(iommu);
2552
2553 /*
2554 * The RTT bit can only be changed when translation is disabled,
	 * but disabling translation would open a window for data
2556 * corruption. So bail out and don't copy anything if we would
2557 * have to change the bit.
2558 */
2559 if (new_ext != ext)
2560 return -EINVAL;
2561
2562 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2563 if (!iommu->copied_tables)
2564 return -ENOMEM;
2565
2566 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2567 if (!old_rt_phys)
2568 return -EINVAL;
2569
2570 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2571 if (!old_rt)
2572 return -ENOMEM;
2573
2574 /* This is too big for the stack - allocate it from slab */
2575 ctxt_table_entries = ext ? 512 : 256;
2576 ret = -ENOMEM;
2577 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2578 if (!ctxt_tbls)
2579 goto out_unmap;
2580
2581 for (bus = 0; bus < 256; bus++) {
2582 ret = copy_context_table(iommu, &old_rt[bus],
2583 ctxt_tbls, bus, ext);
2584 if (ret) {
2585 pr_err("%s: Failed to copy context table for bus %d\n",
2586 iommu->name, bus);
2587 continue;
2588 }
2589 }
2590
2591 spin_lock(&iommu->lock);
2592
2593 /* Context tables are copied, now write them to the root_entry table */
2594 for (bus = 0; bus < 256; bus++) {
2595 int idx = ext ? bus * 2 : bus;
2596 u64 val;
2597
2598 if (ctxt_tbls[idx]) {
2599 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2600 iommu->root_entry[bus].lo = val;
2601 }
2602
2603 if (!ext || !ctxt_tbls[idx + 1])
2604 continue;
2605
2606 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2607 iommu->root_entry[bus].hi = val;
2608 }
2609
2610 spin_unlock(&iommu->lock);
2611
2612 kfree(ctxt_tbls);
2613
2614 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2615
2616 ret = 0;
2617
2618out_unmap:
2619 memunmap(old_rt);
2620
2621 return ret;
2622}
2623
2624static int __init init_dmars(void)
2625{
2626 struct dmar_drhd_unit *drhd;
2627 struct intel_iommu *iommu;
2628 int ret;
2629
2630 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2631 if (ret)
2632 goto free_iommu;
2633
2634 for_each_iommu(iommu, drhd) {
2635 if (drhd->ignored) {
2636 iommu_disable_translation(iommu);
2637 continue;
2638 }
2639
2640 /*
		 * Find the max pasid size of all IOMMUs in the system.
2642 * We need to ensure the system pasid table is no bigger
2643 * than the smallest supported.
2644 */
2645 if (pasid_supported(iommu)) {
2646 u32 temp = 2 << ecap_pss(iommu->ecap);
2647
2648 intel_pasid_max_id = min_t(u32, temp,
2649 intel_pasid_max_id);
2650 }
2651
2652 intel_iommu_init_qi(iommu);
2653
2654 ret = iommu_init_domains(iommu);
2655 if (ret)
2656 goto free_iommu;
2657
2658 init_translation_status(iommu);
2659
2660 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2661 iommu_disable_translation(iommu);
2662 clear_translation_pre_enabled(iommu);
2663 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2664 iommu->name);
2665 }
2666
2667 /*
2668 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMUs. Need to split this later.
2671 */
2672 ret = iommu_alloc_root_entry(iommu);
2673 if (ret)
2674 goto free_iommu;
2675
2676 if (translation_pre_enabled(iommu)) {
2677 pr_info("Translation already enabled - trying to copy translation structures\n");
2678
2679 ret = copy_translation_tables(iommu);
2680 if (ret) {
2681 /*
2682 * We found the IOMMU with translation
2683 * enabled - but failed to copy over the
2684 * old root-entry table. Try to proceed
2685 * by disabling translation now and
2686 * allocating a clean root-entry table.
2687 * This might cause DMAR faults, but
2688 * probably the dump will still succeed.
2689 */
2690 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2691 iommu->name);
2692 iommu_disable_translation(iommu);
2693 clear_translation_pre_enabled(iommu);
2694 } else {
2695 pr_info("Copied translation tables from previous kernel for %s\n",
2696 iommu->name);
2697 }
2698 }
2699
2700 if (!ecap_pass_through(iommu->ecap))
2701 hw_pass_through = 0;
2702 intel_svm_check(iommu);
2703 }
2704
2705 /*
2706 * Now that qi is enabled on all iommus, set the root entry and flush
2707 * caches. This is required on some Intel X58 chipsets, otherwise the
2708 * flush_context function will loop forever and the boot hangs.
2709 */
2710 for_each_active_iommu(iommu, drhd) {
2711 iommu_flush_write_buffer(iommu);
2712 iommu_set_root_entry(iommu);
2713 }
2714
2715#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2716 dmar_map_gfx = 0;
2717#endif
2718
2719 if (!dmar_map_gfx)
2720 iommu_identity_mapping |= IDENTMAP_GFX;
2721
2722 check_tylersburg_isoch();
2723
2724 ret = si_domain_init(hw_pass_through);
2725 if (ret)
2726 goto free_iommu;
2727
2728 /*
2729 * for each drhd
2730 * enable fault log
2731 * global invalidate context cache
2732 * global invalidate iotlb
2733 * enable translation
2734 */
2735 for_each_iommu(iommu, drhd) {
2736 if (drhd->ignored) {
2737 /*
2738 * we always have to disable PMRs or DMA may fail on
2739 * this device
2740 */
2741 if (force_on)
2742 iommu_disable_protect_mem_regions(iommu);
2743 continue;
2744 }
2745
2746 iommu_flush_write_buffer(iommu);
2747
2748#ifdef CONFIG_INTEL_IOMMU_SVM
2749 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
2750 /*
			 * Calling dmar_alloc_hwirq() with dmar_global_lock
			 * held could cause a lock race condition.
2753 */
2754 up_write(&dmar_global_lock);
2755 ret = intel_svm_enable_prq(iommu);
2756 down_write(&dmar_global_lock);
2757 if (ret)
2758 goto free_iommu;
2759 }
2760#endif
2761 ret = dmar_set_interrupt(iommu);
2762 if (ret)
2763 goto free_iommu;
2764 }
2765
2766 return 0;
2767
2768free_iommu:
2769 for_each_active_iommu(iommu, drhd) {
2770 disable_dmar_iommu(iommu);
2771 free_dmar_iommu(iommu);
2772 }
2773 if (si_domain) {
2774 domain_exit(si_domain);
2775 si_domain = NULL;
2776 }
2777
2778 return ret;
2779}
2780
2781static void __init init_no_remapping_devices(void)
2782{
2783 struct dmar_drhd_unit *drhd;
2784 struct device *dev;
2785 int i;
2786
2787 for_each_drhd_unit(drhd) {
2788 if (!drhd->include_all) {
2789 for_each_active_dev_scope(drhd->devices,
2790 drhd->devices_cnt, i, dev)
2791 break;
2792 /* ignore DMAR unit if no devices exist */
2793 if (i == drhd->devices_cnt)
2794 drhd->ignored = 1;
2795 }
2796 }
2797
2798 for_each_active_drhd_unit(drhd) {
2799 if (drhd->include_all)
2800 continue;
2801
2802 for_each_active_dev_scope(drhd->devices,
2803 drhd->devices_cnt, i, dev)
2804 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2805 break;
2806 if (i < drhd->devices_cnt)
2807 continue;
2808
		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_dedicated flag, as appropriate. */
2811 drhd->gfx_dedicated = 1;
2812 if (!dmar_map_gfx)
2813 drhd->ignored = 1;
2814 }
2815}
2816
2817#ifdef CONFIG_SUSPEND
2818static int init_iommu_hw(void)
2819{
2820 struct dmar_drhd_unit *drhd;
2821 struct intel_iommu *iommu = NULL;
2822 int ret;
2823
2824 for_each_active_iommu(iommu, drhd) {
2825 if (iommu->qi) {
2826 ret = dmar_reenable_qi(iommu);
2827 if (ret)
2828 return ret;
2829 }
2830 }
2831
2832 for_each_iommu(iommu, drhd) {
2833 if (drhd->ignored) {
2834 /*
2835 * we always have to disable PMRs or DMA may fail on
2836 * this device
2837 */
2838 if (force_on)
2839 iommu_disable_protect_mem_regions(iommu);
2840 continue;
2841 }
2842
2843 iommu_flush_write_buffer(iommu);
2844 iommu_set_root_entry(iommu);
2845 iommu_enable_translation(iommu);
2846 iommu_disable_protect_mem_regions(iommu);
2847 }
2848
2849 return 0;
2850}
2851
2852static void iommu_flush_all(void)
2853{
2854 struct dmar_drhd_unit *drhd;
2855 struct intel_iommu *iommu;
2856
2857 for_each_active_iommu(iommu, drhd) {
2858 iommu->flush.flush_context(iommu, 0, 0, 0,
2859 DMA_CCMD_GLOBAL_INVL);
2860 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2861 DMA_TLB_GLOBAL_FLUSH);
2862 }
2863}
2864
2865static int iommu_suspend(void)
2866{
2867 struct dmar_drhd_unit *drhd;
2868 struct intel_iommu *iommu = NULL;
2869 unsigned long flag;
2870
2871 iommu_flush_all();
2872
2873 for_each_active_iommu(iommu, drhd) {
2874 iommu_disable_translation(iommu);
2875
2876 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2877
2878 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2879 readl(iommu->reg + DMAR_FECTL_REG);
2880 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2881 readl(iommu->reg + DMAR_FEDATA_REG);
2882 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2883 readl(iommu->reg + DMAR_FEADDR_REG);
2884 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2885 readl(iommu->reg + DMAR_FEUADDR_REG);
2886
2887 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2888 }
2889 return 0;
2890}
2891
2892static void iommu_resume(void)
2893{
2894 struct dmar_drhd_unit *drhd;
2895 struct intel_iommu *iommu = NULL;
2896 unsigned long flag;
2897
2898 if (init_iommu_hw()) {
2899 if (force_on)
2900 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2901 else
2902 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2903 return;
2904 }
2905
2906 for_each_active_iommu(iommu, drhd) {
2907
2908 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2909
2910 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2911 iommu->reg + DMAR_FECTL_REG);
2912 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2913 iommu->reg + DMAR_FEDATA_REG);
2914 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2915 iommu->reg + DMAR_FEADDR_REG);
2916 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2917 iommu->reg + DMAR_FEUADDR_REG);
2918
2919 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2920 }
2921}
2922
2923static struct syscore_ops iommu_syscore_ops = {
2924 .resume = iommu_resume,
2925 .suspend = iommu_suspend,
2926};
2927
2928static void __init init_iommu_pm_ops(void)
2929{
2930 register_syscore_ops(&iommu_syscore_ops);
2931}
2932
2933#else
2934static inline void init_iommu_pm_ops(void) {}
#endif /* CONFIG_SUSPEND */
2936
2937static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2938{
2939 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2940 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2941 rmrr->end_address <= rmrr->base_address ||
2942 arch_rmrr_sanity_check(rmrr))
2943 return -EINVAL;
2944
2945 return 0;
2946}
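/*
 * For instance, an RMRR covering [0x7c000000, 0x7dffffff] passes (assuming
 * the arch check also accepts it): the base is page aligned, end_address + 1
 * (0x7e000000) is page aligned, and the end lies above the base. A unit
 * reporting end_address <= base_address or a non-page-aligned boundary is
 * rejected, and the caller then flags the firmware as buggy.
 */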
2947
2948int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2949{
2950 struct acpi_dmar_reserved_memory *rmrr;
2951 struct dmar_rmrr_unit *rmrru;
2952
2953 rmrr = (struct acpi_dmar_reserved_memory *)header;
2954 if (rmrr_sanity_check(rmrr)) {
2955 pr_warn(FW_BUG
2956 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2957 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2958 rmrr->base_address, rmrr->end_address,
2959 dmi_get_system_info(DMI_BIOS_VENDOR),
2960 dmi_get_system_info(DMI_BIOS_VERSION),
2961 dmi_get_system_info(DMI_PRODUCT_VERSION));
2962 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2963 }
2964
2965 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2966 if (!rmrru)
2967 goto out;
2968
2969 rmrru->hdr = header;
2970
2971 rmrru->base_address = rmrr->base_address;
2972 rmrru->end_address = rmrr->end_address;
2973
2974 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2975 ((void *)rmrr) + rmrr->header.length,
2976 &rmrru->devices_cnt);
2977 if (rmrru->devices_cnt && rmrru->devices == NULL)
2978 goto free_rmrru;
2979
2980 list_add(&rmrru->list, &dmar_rmrr_units);
2981
2982 return 0;
2983free_rmrru:
2984 kfree(rmrru);
2985out:
2986 return -ENOMEM;
2987}
2988
2989static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2990{
2991 struct dmar_atsr_unit *atsru;
2992 struct acpi_dmar_atsr *tmp;
2993
2994 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2995 dmar_rcu_check()) {
2996 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2997 if (atsr->segment != tmp->segment)
2998 continue;
2999 if (atsr->header.length != tmp->header.length)
3000 continue;
3001 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3002 return atsru;
3003 }
3004
3005 return NULL;
3006}
3007
3008int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3009{
3010 struct acpi_dmar_atsr *atsr;
3011 struct dmar_atsr_unit *atsru;
3012
3013 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3014 return 0;
3015
3016 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3017 atsru = dmar_find_atsr(atsr);
3018 if (atsru)
3019 return 0;
3020
3021 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3022 if (!atsru)
3023 return -ENOMEM;
3024
3025 /*
3026 * If memory is allocated from slab by ACPI _DSM method, we need to
3027 * copy the memory content because the memory buffer will be freed
3028 * on return.
3029 */
3030 atsru->hdr = (void *)(atsru + 1);
3031 memcpy(atsru->hdr, hdr, hdr->length);
3032 atsru->include_all = atsr->flags & 0x1;
3033 if (!atsru->include_all) {
3034 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3035 (void *)atsr + atsr->header.length,
3036 &atsru->devices_cnt);
3037 if (atsru->devices_cnt && atsru->devices == NULL) {
3038 kfree(atsru);
3039 return -ENOMEM;
3040 }
3041 }
3042
3043 list_add_rcu(&atsru->list, &dmar_atsr_units);
3044
3045 return 0;
3046}
3047
3048static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3049{
3050 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3051 kfree(atsru);
3052}
3053
3054int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3055{
3056 struct acpi_dmar_atsr *atsr;
3057 struct dmar_atsr_unit *atsru;
3058
3059 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3060 atsru = dmar_find_atsr(atsr);
3061 if (atsru) {
3062 list_del_rcu(&atsru->list);
3063 synchronize_rcu();
3064 intel_iommu_free_atsr(atsru);
3065 }
3066
3067 return 0;
3068}
3069
3070int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3071{
3072 int i;
3073 struct device *dev;
3074 struct acpi_dmar_atsr *atsr;
3075 struct dmar_atsr_unit *atsru;
3076
3077 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3078 atsru = dmar_find_atsr(atsr);
3079 if (!atsru)
3080 return 0;
3081
3082 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3083 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3084 i, dev)
3085 return -EBUSY;
3086 }
3087
3088 return 0;
3089}
3090
3091static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3092{
3093 struct dmar_satc_unit *satcu;
3094 struct acpi_dmar_satc *tmp;
3095
3096 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3097 dmar_rcu_check()) {
3098 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3099 if (satc->segment != tmp->segment)
3100 continue;
3101 if (satc->header.length != tmp->header.length)
3102 continue;
3103 if (memcmp(satc, tmp, satc->header.length) == 0)
3104 return satcu;
3105 }
3106
3107 return NULL;
3108}
3109
3110int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3111{
3112 struct acpi_dmar_satc *satc;
3113 struct dmar_satc_unit *satcu;
3114
3115 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3116 return 0;
3117
3118 satc = container_of(hdr, struct acpi_dmar_satc, header);
3119 satcu = dmar_find_satc(satc);
3120 if (satcu)
3121 return 0;
3122
3123 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3124 if (!satcu)
3125 return -ENOMEM;
3126
3127 satcu->hdr = (void *)(satcu + 1);
3128 memcpy(satcu->hdr, hdr, hdr->length);
3129 satcu->atc_required = satc->flags & 0x1;
3130 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3131 (void *)satc + satc->header.length,
3132 &satcu->devices_cnt);
3133 if (satcu->devices_cnt && !satcu->devices) {
3134 kfree(satcu);
3135 return -ENOMEM;
3136 }
3137 list_add_rcu(&satcu->list, &dmar_satc_units);
3138
3139 return 0;
3140}
3141
3142static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3143{
3144 int sp, ret;
3145 struct intel_iommu *iommu = dmaru->iommu;
3146
3147 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3148 if (ret)
3149 goto out;
3150
3151 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3152 pr_warn("%s: Doesn't support hardware pass through.\n",
3153 iommu->name);
3154 return -ENXIO;
3155 }
3156
3157 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3158 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3159 pr_warn("%s: Doesn't support large page.\n",
3160 iommu->name);
3161 return -ENXIO;
3162 }
3163
3164 /*
3165 * Disable translation if already enabled prior to OS handover.
3166 */
3167 if (iommu->gcmd & DMA_GCMD_TE)
3168 iommu_disable_translation(iommu);
3169
3170 ret = iommu_init_domains(iommu);
3171 if (ret == 0)
3172 ret = iommu_alloc_root_entry(iommu);
3173 if (ret)
3174 goto out;
3175
3176 intel_svm_check(iommu);
3177
3178 if (dmaru->ignored) {
3179 /*
3180 * we always have to disable PMRs or DMA may fail on this device
3181 */
3182 if (force_on)
3183 iommu_disable_protect_mem_regions(iommu);
3184 return 0;
3185 }
3186
3187 intel_iommu_init_qi(iommu);
3188 iommu_flush_write_buffer(iommu);
3189
3190#ifdef CONFIG_INTEL_IOMMU_SVM
3191 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3192 ret = intel_svm_enable_prq(iommu);
3193 if (ret)
3194 goto disable_iommu;
3195 }
3196#endif
3197 ret = dmar_set_interrupt(iommu);
3198 if (ret)
3199 goto disable_iommu;
3200
3201 iommu_set_root_entry(iommu);
3202 iommu_enable_translation(iommu);
3203
3204 iommu_disable_protect_mem_regions(iommu);
3205 return 0;
3206
3207disable_iommu:
3208 disable_dmar_iommu(iommu);
3209out:
3210 free_dmar_iommu(iommu);
3211 return ret;
3212}
3213
3214int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3215{
3216 int ret = 0;
3217 struct intel_iommu *iommu = dmaru->iommu;
3218
3219 if (!intel_iommu_enabled)
3220 return 0;
3221 if (iommu == NULL)
3222 return -EINVAL;
3223
3224 if (insert) {
3225 ret = intel_iommu_add(dmaru);
3226 } else {
3227 disable_dmar_iommu(iommu);
3228 free_dmar_iommu(iommu);
3229 }
3230
3231 return ret;
3232}
3233
3234static void intel_iommu_free_dmars(void)
3235{
3236 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3237 struct dmar_atsr_unit *atsru, *atsr_n;
3238 struct dmar_satc_unit *satcu, *satc_n;
3239
3240 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3241 list_del(&rmrru->list);
3242 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3243 kfree(rmrru);
3244 }
3245
3246 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3247 list_del(&atsru->list);
3248 intel_iommu_free_atsr(atsru);
3249 }
3250 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3251 list_del(&satcu->list);
3252 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3253 kfree(satcu);
3254 }
3255}
3256
3257static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3258{
3259 struct dmar_satc_unit *satcu;
3260 struct acpi_dmar_satc *satc;
3261 struct device *tmp;
3262 int i;
3263
3264 dev = pci_physfn(dev);
3265 rcu_read_lock();
3266
3267 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3268 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3269 if (satc->segment != pci_domain_nr(dev->bus))
3270 continue;
3271 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3272 if (to_pci_dev(tmp) == dev)
3273 goto out;
3274 }
3275 satcu = NULL;
3276out:
3277 rcu_read_unlock();
3278 return satcu;
3279}
3280
3281static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3282{
3283 int i, ret = 1;
3284 struct pci_bus *bus;
3285 struct pci_dev *bridge = NULL;
3286 struct device *tmp;
3287 struct acpi_dmar_atsr *atsr;
3288 struct dmar_atsr_unit *atsru;
3289 struct dmar_satc_unit *satcu;
3290
3291 dev = pci_physfn(dev);
3292 satcu = dmar_find_matched_satc_unit(dev);
3293 if (satcu)
3294 /*
		 * This device supports ATS because it is listed in the
		 * SATC table. When the IOMMU is in legacy mode, the
		 * hardware enables ATS automatically for a device that
		 * requires it, so the OS should not enable ATS for this
		 * device, to avoid duplicated TLB invalidations.
3300 */
3301 return !(satcu->atc_required && !sm_supported(iommu));
3302
3303 for (bus = dev->bus; bus; bus = bus->parent) {
3304 bridge = bus->self;
3305 /* If it's an integrated device, allow ATS */
3306 if (!bridge)
3307 return 1;
3308 /* Connected via non-PCIe: no ATS */
3309 if (!pci_is_pcie(bridge) ||
3310 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3311 return 0;
3312 /* If we found the root port, look it up in the ATSR */
3313 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3314 break;
3315 }
3316
3317 rcu_read_lock();
3318 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3319 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3320 if (atsr->segment != pci_domain_nr(dev->bus))
3321 continue;
3322
3323 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3324 if (tmp == &bridge->dev)
3325 goto out;
3326
3327 if (atsru->include_all)
3328 goto out;
3329 }
3330 ret = 0;
3331out:
3332 rcu_read_unlock();
3333
3334 return ret;
3335}
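/*
 * Example outcomes: a device listed in a SATC unit with the ATC-required
 * flag set returns 0 under a legacy-mode IOMMU (the hardware handles ATS
 * itself) but 1 under scalable mode; an integrated root-complex device with
 * no upstream bridge returns 1; a device whose root port is covered by an
 * ATSR device scope (or by an INCLUDE_ALL ATSR) returns 1, otherwise 0.
 */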
3336
3337int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3338{
3339 int ret;
3340 struct dmar_rmrr_unit *rmrru;
3341 struct dmar_atsr_unit *atsru;
3342 struct dmar_satc_unit *satcu;
3343 struct acpi_dmar_atsr *atsr;
3344 struct acpi_dmar_reserved_memory *rmrr;
3345 struct acpi_dmar_satc *satc;
3346
3347 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3348 return 0;
3349
3350 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3351 rmrr = container_of(rmrru->hdr,
3352 struct acpi_dmar_reserved_memory, header);
3353 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3354 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3355 ((void *)rmrr) + rmrr->header.length,
3356 rmrr->segment, rmrru->devices,
3357 rmrru->devices_cnt);
3358 if (ret < 0)
3359 return ret;
3360 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3361 dmar_remove_dev_scope(info, rmrr->segment,
3362 rmrru->devices, rmrru->devices_cnt);
3363 }
3364 }
3365
3366 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3367 if (atsru->include_all)
3368 continue;
3369
3370 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3371 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3372 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3373 (void *)atsr + atsr->header.length,
3374 atsr->segment, atsru->devices,
3375 atsru->devices_cnt);
3376 if (ret > 0)
3377 break;
3378 else if (ret < 0)
3379 return ret;
3380 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3381 if (dmar_remove_dev_scope(info, atsr->segment,
3382 atsru->devices, atsru->devices_cnt))
3383 break;
3384 }
3385 }
3386 list_for_each_entry(satcu, &dmar_satc_units, list) {
3387 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3388 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3389 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3390 (void *)satc + satc->header.length,
3391 satc->segment, satcu->devices,
3392 satcu->devices_cnt);
3393 if (ret > 0)
3394 break;
3395 else if (ret < 0)
3396 return ret;
3397 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3398 if (dmar_remove_dev_scope(info, satc->segment,
3399 satcu->devices, satcu->devices_cnt))
3400 break;
3401 }
3402 }
3403
3404 return 0;
3405}
3406
3407static int intel_iommu_memory_notifier(struct notifier_block *nb,
3408 unsigned long val, void *v)
3409{
3410 struct memory_notify *mhp = v;
3411 unsigned long start_vpfn = mm_to_dma_pfn_start(mhp->start_pfn);
3412 unsigned long last_vpfn = mm_to_dma_pfn_end(mhp->start_pfn +
3413 mhp->nr_pages - 1);
3414
3415 switch (val) {
3416 case MEM_GOING_ONLINE:
3417 if (iommu_domain_identity_map(si_domain,
3418 start_vpfn, last_vpfn)) {
3419 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3420 start_vpfn, last_vpfn);
3421 return NOTIFY_BAD;
3422 }
3423 break;
3424
3425 case MEM_OFFLINE:
3426 case MEM_CANCEL_ONLINE:
3427 {
3428 struct dmar_drhd_unit *drhd;
3429 struct intel_iommu *iommu;
3430 LIST_HEAD(freelist);
3431
3432 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3433
3434 rcu_read_lock();
3435 for_each_active_iommu(iommu, drhd)
3436 iommu_flush_iotlb_psi(iommu, si_domain,
3437 start_vpfn, mhp->nr_pages,
3438 list_empty(&freelist), 0);
3439 rcu_read_unlock();
3440 put_pages_list(&freelist);
3441 }
3442 break;
3443 }
3444
3445 return NOTIFY_OK;
3446}
3447
3448static struct notifier_block intel_iommu_memory_nb = {
3449 .notifier_call = intel_iommu_memory_notifier,
3450 .priority = 0
3451};
3452
3453static void intel_disable_iommus(void)
3454{
3455 struct intel_iommu *iommu = NULL;
3456 struct dmar_drhd_unit *drhd;
3457
3458 for_each_iommu(iommu, drhd)
3459 iommu_disable_translation(iommu);
3460}
3461
3462void intel_iommu_shutdown(void)
3463{
3464 struct dmar_drhd_unit *drhd;
3465 struct intel_iommu *iommu = NULL;
3466
3467 if (no_iommu || dmar_disabled)
3468 return;
3469
3470 down_write(&dmar_global_lock);
3471
3472 /* Disable PMRs explicitly here. */
3473 for_each_iommu(iommu, drhd)
3474 iommu_disable_protect_mem_regions(iommu);
3475
3476 /* Make sure the IOMMUs are switched off */
3477 intel_disable_iommus();
3478
3479 up_write(&dmar_global_lock);
3480}
3481
3482static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3483{
3484 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3485
3486 return container_of(iommu_dev, struct intel_iommu, iommu);
3487}
3488
3489static ssize_t version_show(struct device *dev,
3490 struct device_attribute *attr, char *buf)
3491{
3492 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3493 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3494 return sysfs_emit(buf, "%d:%d\n",
3495 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3496}
3497static DEVICE_ATTR_RO(version);
3498
3499static ssize_t address_show(struct device *dev,
3500 struct device_attribute *attr, char *buf)
3501{
3502 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3503 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
3504}
3505static DEVICE_ATTR_RO(address);
3506
3507static ssize_t cap_show(struct device *dev,
3508 struct device_attribute *attr, char *buf)
3509{
3510 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3511 return sysfs_emit(buf, "%llx\n", iommu->cap);
3512}
3513static DEVICE_ATTR_RO(cap);
3514
3515static ssize_t ecap_show(struct device *dev,
3516 struct device_attribute *attr, char *buf)
3517{
3518 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3519 return sysfs_emit(buf, "%llx\n", iommu->ecap);
3520}
3521static DEVICE_ATTR_RO(ecap);
3522
3523static ssize_t domains_supported_show(struct device *dev,
3524 struct device_attribute *attr, char *buf)
3525{
3526 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3527 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
3528}
3529static DEVICE_ATTR_RO(domains_supported);
3530
3531static ssize_t domains_used_show(struct device *dev,
3532 struct device_attribute *attr, char *buf)
3533{
3534 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3535 return sysfs_emit(buf, "%d\n",
3536 bitmap_weight(iommu->domain_ids,
3537 cap_ndoms(iommu->cap)));
3538}
3539static DEVICE_ATTR_RO(domains_used);
3540
3541static struct attribute *intel_iommu_attrs[] = {
3542 &dev_attr_version.attr,
3543 &dev_attr_address.attr,
3544 &dev_attr_cap.attr,
3545 &dev_attr_ecap.attr,
3546 &dev_attr_domains_supported.attr,
3547 &dev_attr_domains_used.attr,
3548 NULL,
3549};
3550
3551static struct attribute_group intel_iommu_group = {
3552 .name = "intel-iommu",
3553 .attrs = intel_iommu_attrs,
3554};
3555
3556const struct attribute_group *intel_iommu_groups[] = {
3557 &intel_iommu_group,
3558 NULL,
3559};
3560
3561static bool has_external_pci(void)
3562{
3563 struct pci_dev *pdev = NULL;
3564
3565 for_each_pci_dev(pdev)
3566 if (pdev->external_facing) {
3567 pci_dev_put(pdev);
3568 return true;
3569 }
3570
3571 return false;
3572}
3573
3574static int __init platform_optin_force_iommu(void)
3575{
3576 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3577 return 0;
3578
3579 if (no_iommu || dmar_disabled)
3580 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3581
3582 /*
3583 * If Intel-IOMMU is disabled by default, we will apply identity
3584 * map for all devices except those marked as being untrusted.
3585 */
3586 if (dmar_disabled)
3587 iommu_set_default_passthrough(false);
3588
3589 dmar_disabled = 0;
3590 no_iommu = 0;
3591
3592 return 1;
3593}
3594
3595static int __init probe_acpi_namespace_devices(void)
3596{
3597 struct dmar_drhd_unit *drhd;
3598 /* To avoid a -Wunused-but-set-variable warning. */
3599 struct intel_iommu *iommu __maybe_unused;
3600 struct device *dev;
3601 int i, ret = 0;
3602
3603 for_each_active_iommu(iommu, drhd) {
3604 for_each_active_dev_scope(drhd->devices,
3605 drhd->devices_cnt, i, dev) {
3606 struct acpi_device_physical_node *pn;
3607 struct acpi_device *adev;
3608
3609 if (dev->bus != &acpi_bus_type)
3610 continue;
3611
3612 adev = to_acpi_device(dev);
3613 mutex_lock(&adev->physical_node_lock);
3614 list_for_each_entry(pn,
3615 &adev->physical_node_list, node) {
3616 ret = iommu_probe_device(pn->dev);
3617 if (ret)
3618 break;
3619 }
3620 mutex_unlock(&adev->physical_node_lock);
3621
3622 if (ret)
3623 return ret;
3624 }
3625 }
3626
3627 return 0;
3628}
3629
3630static __init int tboot_force_iommu(void)
3631{
3632 if (!tboot_enabled())
3633 return 0;
3634
3635 if (no_iommu || dmar_disabled)
3636 pr_warn("Forcing Intel-IOMMU to enabled\n");
3637
3638 dmar_disabled = 0;
3639 no_iommu = 0;
3640
3641 return 1;
3642}
3643
3644int __init intel_iommu_init(void)
3645{
3646 int ret = -ENODEV;
3647 struct dmar_drhd_unit *drhd;
3648 struct intel_iommu *iommu;
3649
3650 /*
3651 * Intel IOMMU is required for a TXT/tboot launch or platform
3652 * opt in, so enforce that.
3653 */
3654 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3655 platform_optin_force_iommu();
3656
3657 down_write(&dmar_global_lock);
3658 if (dmar_table_init()) {
3659 if (force_on)
3660 panic("tboot: Failed to initialize DMAR table\n");
3661 goto out_free_dmar;
3662 }
3663
3664 if (dmar_dev_scope_init() < 0) {
3665 if (force_on)
3666 panic("tboot: Failed to initialize DMAR device scope\n");
3667 goto out_free_dmar;
3668 }
3669
3670 up_write(&dmar_global_lock);
3671
3672 /*
3673 * The bus notifier takes the dmar_global_lock, so lockdep will
3674 * complain later when we register it under the lock.
3675 */
3676 dmar_register_bus_notifier();
3677
3678 down_write(&dmar_global_lock);
3679
3680 if (!no_iommu)
3681 intel_iommu_debugfs_init();
3682
3683 if (no_iommu || dmar_disabled) {
3684 /*
3685 * We exit the function here to ensure IOMMU's remapping and
3686 * mempool aren't setup, which means that the IOMMU's PMRs
3687 * won't be disabled via the call to init_dmars(). So disable
3688 * it explicitly here. The PMRs were setup by tboot prior to
3689 * calling SENTER, but the kernel is expected to reset/tear
3690 * down the PMRs.
3691 */
3692 if (intel_iommu_tboot_noforce) {
3693 for_each_iommu(iommu, drhd)
3694 iommu_disable_protect_mem_regions(iommu);
3695 }
3696
3697 /*
3698 * Make sure the IOMMUs are switched off, even when we
3699 * boot into a kexec kernel and the previous kernel left
3700 * them enabled
3701 */
3702 intel_disable_iommus();
3703 goto out_free_dmar;
3704 }
3705
3706 if (list_empty(&dmar_rmrr_units))
3707 pr_info("No RMRR found\n");
3708
3709 if (list_empty(&dmar_atsr_units))
3710 pr_info("No ATSR found\n");
3711
3712 if (list_empty(&dmar_satc_units))
3713 pr_info("No SATC found\n");
3714
3715 init_no_remapping_devices();
3716
3717 ret = init_dmars();
3718 if (ret) {
3719 if (force_on)
3720 panic("tboot: Failed to initialize DMARs\n");
3721 pr_err("Initialization failed\n");
3722 goto out_free_dmar;
3723 }
3724 up_write(&dmar_global_lock);
3725
3726 init_iommu_pm_ops();
3727
3728 down_read(&dmar_global_lock);
3729 for_each_active_iommu(iommu, drhd) {
3730 /*
3731 * The flush queue implementation does not perform
3732 * page-selective invalidations that are required for efficient
3733 * TLB flushes in virtual environments. The benefit of batching
3734 * is likely to be much lower than the overhead of synchronizing
3735 * the virtual and physical IOMMU page-tables.
3736 */
3737 if (cap_caching_mode(iommu->cap) &&
3738 !first_level_by_default(IOMMU_DOMAIN_DMA)) {
3739 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3740 iommu_set_dma_strict();
3741 }
3742 iommu_device_sysfs_add(&iommu->iommu, NULL,
3743 intel_iommu_groups,
3744 "%s", iommu->name);
3745 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3746
3747 iommu_pmu_register(iommu);
3748 }
3749 up_read(&dmar_global_lock);
3750
3751 if (si_domain && !hw_pass_through)
3752 register_memory_notifier(&intel_iommu_memory_nb);
3753
3754 down_read(&dmar_global_lock);
3755 if (probe_acpi_namespace_devices())
3756 pr_warn("ACPI name space devices didn't probe correctly\n");
3757
3758 /* Finally, we enable the DMA remapping hardware. */
3759 for_each_iommu(iommu, drhd) {
3760 if (!drhd->ignored && !translation_pre_enabled(iommu))
3761 iommu_enable_translation(iommu);
3762
3763 iommu_disable_protect_mem_regions(iommu);
3764 }
3765 up_read(&dmar_global_lock);
3766
3767 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3768
3769 intel_iommu_enabled = 1;
3770
3771 return 0;
3772
3773out_free_dmar:
3774 intel_iommu_free_dmars();
3775 up_write(&dmar_global_lock);
3776 return ret;
3777}
3778
3779static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3780{
3781 struct device_domain_info *info = opaque;
3782
3783 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3784 return 0;
3785}
3786
3787/*
3788 * NB - intel-iommu lacks any sort of reference counting for the users of
3789 * dependent devices. If multiple endpoints have intersecting dependent
3790 * devices, unbinding the driver from any one of them will possibly leave
3791 * the others unable to operate.
3792 */
3793static void domain_context_clear(struct device_domain_info *info)
3794{
	if (!dev_is_pci(info->dev)) {
		domain_context_clear_one(info, info->bus, info->devfn);
		return;
	}

	pci_for_each_dma_alias(to_pci_dev(info->dev),
			       &domain_context_clear_one_cb, info);
3800}
3801
3802static void dmar_remove_one_dev_info(struct device *dev)
3803{
3804 struct device_domain_info *info = dev_iommu_priv_get(dev);
3805 struct dmar_domain *domain = info->domain;
3806 struct intel_iommu *iommu = info->iommu;
3807 unsigned long flags;
3808
3809 if (!dev_is_real_dma_subdevice(info->dev)) {
3810 if (dev_is_pci(info->dev) && sm_supported(iommu))
3811 intel_pasid_tear_down_entry(iommu, info->dev,
3812 IOMMU_NO_PASID, false);
3813
3814 iommu_disable_pci_caps(info);
3815 domain_context_clear(info);
3816 }
3817
3818 spin_lock_irqsave(&domain->lock, flags);
3819 list_del(&info->link);
3820 spin_unlock_irqrestore(&domain->lock, flags);
3821
3822 domain_detach_iommu(domain, iommu);
3823 info->domain = NULL;
3824}
3825
3826/*
3827 * Clear the page table pointer in context or pasid table entries so that
3828 * all DMA requests without PASID from the device are blocked. If the page
3829 * table has been set, clean up the data structures.
3830 */
3831void device_block_translation(struct device *dev)
3832{
3833 struct device_domain_info *info = dev_iommu_priv_get(dev);
3834 struct intel_iommu *iommu = info->iommu;
3835 unsigned long flags;
3836
3837 iommu_disable_pci_caps(info);
3838 if (!dev_is_real_dma_subdevice(dev)) {
3839 if (sm_supported(iommu))
3840 intel_pasid_tear_down_entry(iommu, dev,
3841 IOMMU_NO_PASID, false);
3842 else
3843 domain_context_clear(info);
3844 }
3845
3846 if (!info->domain)
3847 return;
3848
3849 spin_lock_irqsave(&info->domain->lock, flags);
3850 list_del(&info->link);
3851 spin_unlock_irqrestore(&info->domain->lock, flags);
3852
3853 domain_detach_iommu(info->domain, iommu);
3854 info->domain = NULL;
3855}
3856
3857static int md_domain_init(struct dmar_domain *domain, int guest_width)
3858{
3859 int adjust_width;
3860
3861 /* calculate AGAW */
3862 domain->gaw = guest_width;
3863 adjust_width = guestwidth_to_adjustwidth(guest_width);
3864 domain->agaw = width_to_agaw(adjust_width);
3865
3866 domain->iommu_coherency = false;
3867 domain->iommu_superpage = 0;
3868 domain->max_addr = 0;
3869
3870 /* always allocate the top pgd */
3871 domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
3872 if (!domain->pgd)
3873 return -ENOMEM;
3874 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3875 return 0;
3876}
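/*
 * With the default guest width of 57 bits this works out (assuming the
 * usual width/AGAW conversion helpers) to adjust_width = 57 and agaw = 3,
 * i.e. a 5-level second-level table; a 48-bit guest width would give
 * agaw = 2 and a 4-level table. The top page directory is always allocated
 * up front here, regardless of how much of it is ever populated.
 */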
3877
3878static int blocking_domain_attach_dev(struct iommu_domain *domain,
3879 struct device *dev)
3880{
3881 device_block_translation(dev);
3882 return 0;
3883}
3884
3885static struct iommu_domain blocking_domain = {
3886 .type = IOMMU_DOMAIN_BLOCKED,
3887 .ops = &(const struct iommu_domain_ops) {
3888 .attach_dev = blocking_domain_attach_dev,
3889 }
3890};
3891
3892static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
3893{
3894 struct dmar_domain *dmar_domain;
3895 struct iommu_domain *domain;
3896
3897 switch (type) {
3898 case IOMMU_DOMAIN_DMA:
3899 case IOMMU_DOMAIN_UNMANAGED:
3900 dmar_domain = alloc_domain(type);
3901 if (!dmar_domain) {
3902 pr_err("Can't allocate dmar_domain\n");
3903 return NULL;
3904 }
3905 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3906 pr_err("Domain initialization failed\n");
3907 domain_exit(dmar_domain);
3908 return NULL;
3909 }
3910
3911 domain = &dmar_domain->domain;
3912 domain->geometry.aperture_start = 0;
3913 domain->geometry.aperture_end =
3914 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
3915 domain->geometry.force_aperture = true;
3916
3917 return domain;
3918 case IOMMU_DOMAIN_IDENTITY:
3919 return &si_domain->domain;
3920 case IOMMU_DOMAIN_SVA:
3921 return intel_svm_domain_alloc();
3922 default:
3923 return NULL;
3924 }
3925
3926 return NULL;
3927}
3928
3929static struct iommu_domain *
3930intel_iommu_domain_alloc_user(struct device *dev, u32 flags,
3931 struct iommu_domain *parent,
3932 const struct iommu_user_data *user_data)
3933{
3934 struct device_domain_info *info = dev_iommu_priv_get(dev);
3935 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3936 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3937 struct intel_iommu *iommu = info->iommu;
3938 struct dmar_domain *dmar_domain;
3939 struct iommu_domain *domain;
3940
3941 /* Must be NESTING domain */
3942 if (parent) {
3943 if (!nested_supported(iommu) || flags)
3944 return ERR_PTR(-EOPNOTSUPP);
3945 return intel_nested_domain_alloc(parent, user_data);
3946 }
3947
3948 if (flags &
3949 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING)))
3950 return ERR_PTR(-EOPNOTSUPP);
3951 if (nested_parent && !nested_supported(iommu))
3952 return ERR_PTR(-EOPNOTSUPP);
3953 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3954 return ERR_PTR(-EOPNOTSUPP);
3955
3956 /*
	 * The domain_alloc_user op needs to fully initialize a domain
	 * before returning, so use iommu_domain_alloc() here for simplicity.
3959 */
3960 domain = iommu_domain_alloc(dev->bus);
3961 if (!domain)
3962 return ERR_PTR(-ENOMEM);
3963
3964 dmar_domain = to_dmar_domain(domain);
3965
3966 if (nested_parent) {
3967 dmar_domain->nested_parent = true;
3968 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3969 spin_lock_init(&dmar_domain->s1_lock);
3970 }
3971
3972 if (dirty_tracking) {
3973 if (dmar_domain->use_first_level) {
3974 iommu_domain_free(domain);
3975 return ERR_PTR(-EOPNOTSUPP);
3976 }
3977 domain->dirty_ops = &intel_dirty_ops;
3978 }
3979
3980 return domain;
3981}
3982
3983static void intel_iommu_domain_free(struct iommu_domain *domain)
3984{
3985 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3986
3987 WARN_ON(dmar_domain->nested_parent &&
3988 !list_empty(&dmar_domain->s1_domains));
3989 if (domain != &si_domain->domain)
3990 domain_exit(dmar_domain);
3991}
3992
3993int prepare_domain_attach_device(struct iommu_domain *domain,
3994 struct device *dev)
3995{
3996 struct device_domain_info *info = dev_iommu_priv_get(dev);
3997 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3998 struct intel_iommu *iommu = info->iommu;
3999 int addr_width;
4000
4001 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4002 return -EINVAL;
4003
4004 if (domain->dirty_ops && !ssads_supported(iommu))
4005 return -EINVAL;
4006
4007 /* check if this iommu agaw is sufficient for max mapped address */
4008 addr_width = agaw_to_width(iommu->agaw);
4009 if (addr_width > cap_mgaw(iommu->cap))
4010 addr_width = cap_mgaw(iommu->cap);
4011
4012 if (dmar_domain->max_addr > (1LL << addr_width))
4013 return -EINVAL;
4014 dmar_domain->gaw = addr_width;
4015
4016 /*
4017 * Knock out extra levels of page tables if necessary
4018 */
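	/*
	 * For example (hypothetical configuration): a domain built with a
	 * 4-level table (agaw 2, 48-bit GAW) being attached through an IOMMU
	 * that only supports 3 levels (agaw 1) drops its top level here; the
	 * table referenced by entry 0 of the old top pgd becomes the new pgd.
	 */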
4019 while (iommu->agaw < dmar_domain->agaw) {
4020 struct dma_pte *pte;
4021
4022 pte = dmar_domain->pgd;
4023 if (dma_pte_present(pte)) {
4024 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4025 free_pgtable_page(pte);
4026 }
4027 dmar_domain->agaw--;
4028 }
4029
4030 return 0;
4031}
4032
4033static int intel_iommu_attach_device(struct iommu_domain *domain,
4034 struct device *dev)
4035{
4036 struct device_domain_info *info = dev_iommu_priv_get(dev);
4037 int ret;
4038
4039 if (info->domain)
4040 device_block_translation(dev);
4041
4042 ret = prepare_domain_attach_device(domain, dev);
4043 if (ret)
4044 return ret;
4045
4046 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4047}
4048
4049static int intel_iommu_map(struct iommu_domain *domain,
4050 unsigned long iova, phys_addr_t hpa,
4051 size_t size, int iommu_prot, gfp_t gfp)
4052{
4053 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4054 u64 max_addr;
4055 int prot = 0;
4056
4057 if (iommu_prot & IOMMU_READ)
4058 prot |= DMA_PTE_READ;
4059 if (iommu_prot & IOMMU_WRITE)
4060 prot |= DMA_PTE_WRITE;
4061 if (dmar_domain->set_pte_snp)
4062 prot |= DMA_PTE_SNP;
4063
4064 max_addr = iova + size;
4065 if (dmar_domain->max_addr < max_addr) {
4066 u64 end;
4067
4068 /* check if minimum agaw is sufficient for mapped address */
4069 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4070 if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
4074 return -EFAULT;
4075 }
4076 dmar_domain->max_addr = max_addr;
4077 }
4078 /* Round up size to next multiple of PAGE_SIZE, if it and
4079 the low bits of hpa would take us onto the next page */
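	/* For example (hypothetical values): hpa = 0x12ffc with size = 0x8
	   crosses into the next 4KiB page, so aligned_nrpages() returns 2
	   and two VT-d pages get mapped. */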
4080 size = aligned_nrpages(hpa, size);
4081 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4082 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
4083}
4084
4085static int intel_iommu_map_pages(struct iommu_domain *domain,
4086 unsigned long iova, phys_addr_t paddr,
4087 size_t pgsize, size_t pgcount,
4088 int prot, gfp_t gfp, size_t *mapped)
4089{
4090 unsigned long pgshift = __ffs(pgsize);
4091 size_t size = pgcount << pgshift;
4092 int ret;
4093
4094 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4095 return -EINVAL;
4096
4097 if (!IS_ALIGNED(iova | paddr, pgsize))
4098 return -EINVAL;
4099
4100 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4101 if (!ret && mapped)
4102 *mapped = size;
4103
4104 return ret;
4105}
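
/*
 * The iommu core splits a map request into (pgsize, pgcount) chunks drawn
 * from the domain's pgsize_bitmap. As an illustrative sketch (hypothetical
 * values, assuming the domain advertises SZ_2M support), a call such as
 *
 *	iommu_map(domain, iova, paddr, SZ_4M, IOMMU_READ | IOMMU_WRITE,
 *		  GFP_KERNEL);
 *
 * with 2MiB-aligned iova and paddr reaches intel_iommu_map_pages() as a
 * single call with pgsize == SZ_2M and pgcount == 2.
 */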
4106
4107static size_t intel_iommu_unmap(struct iommu_domain *domain,
4108 unsigned long iova, size_t size,
4109 struct iommu_iotlb_gather *gather)
4110{
4111 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4112 unsigned long start_pfn, last_pfn;
4113 int level = 0;
4114
4115 /* Cope with horrid API which requires us to unmap more than the
4116 size argument if it happens to be a large-page mapping. */
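	/* For example (hypothetical): an unmap request for 4KiB that lands
	   inside a 2MiB superpage PTE is widened to the full 2MiB here, and
	   the enlarged size is returned to the caller. */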
4117 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
4118 &level, GFP_ATOMIC)))
4119 return 0;
4120
4121 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4122 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4123
4124 start_pfn = iova >> VTD_PAGE_SHIFT;
4125 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4126
4127 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4128
4129 if (dmar_domain->max_addr == iova + size)
4130 dmar_domain->max_addr = iova;
4131
4132 /*
	 * We do not use page-selective IOTLB invalidation in the flush
	 * queue, so there is no need to track pages and sync the iotlb.
4135 */
4136 if (!iommu_iotlb_gather_queued(gather))
4137 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4138
4139 return size;
4140}
4141
4142static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4143 unsigned long iova,
4144 size_t pgsize, size_t pgcount,
4145 struct iommu_iotlb_gather *gather)
4146{
4147 unsigned long pgshift = __ffs(pgsize);
4148 size_t size = pgcount << pgshift;
4149
4150 return intel_iommu_unmap(domain, iova, size, gather);
4151}
4152
4153static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4154 struct iommu_iotlb_gather *gather)
4155{
4156 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4157 unsigned long iova_pfn = IOVA_PFN(gather->start);
4158 size_t size = gather->end - gather->start;
4159 struct iommu_domain_info *info;
4160 unsigned long start_pfn;
4161 unsigned long nrpages;
4162 unsigned long i;
4163
4164 nrpages = aligned_nrpages(gather->start, size);
4165 start_pfn = mm_to_dma_pfn_start(iova_pfn);
4166
4167 xa_for_each(&dmar_domain->iommu_array, i, info)
4168 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4169 start_pfn, nrpages,
4170 list_empty(&gather->freelist), 0);
4171
4172 if (dmar_domain->nested_parent)
4173 parent_domain_flush(dmar_domain, start_pfn, nrpages,
4174 list_empty(&gather->freelist));
4175 put_pages_list(&gather->freelist);
4176}
4177
4178static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4179 dma_addr_t iova)
4180{
4181 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4182 struct dma_pte *pte;
4183 int level = 0;
4184 u64 phys = 0;
4185
4186 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
4187 GFP_ATOMIC);
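	/*
	 * The IOVA bits below the leaf level are kept as the page offset.
	 * For example (hypothetical mapping): a 2MiB superpage leaf has
	 * level == 2, so level_to_offset_bits(2) == 9 and the low
	 * 9 + 12 = 21 bits of the IOVA are added to dma_pte_addr(pte).
	 */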
4188 if (pte && dma_pte_present(pte))
4189 phys = dma_pte_addr(pte) +
4190 (iova & (BIT_MASK(level_to_offset_bits(level) +
4191 VTD_PAGE_SHIFT) - 1));
4192
4193 return phys;
4194}
4195
4196static bool domain_support_force_snooping(struct dmar_domain *domain)
4197{
4198 struct device_domain_info *info;
4199 bool support = true;
4200
4201 assert_spin_locked(&domain->lock);
4202 list_for_each_entry(info, &domain->devices, link) {
4203 if (!ecap_sc_support(info->iommu->ecap)) {
4204 support = false;
4205 break;
4206 }
4207 }
4208
4209 return support;
4210}
4211
4212static void domain_set_force_snooping(struct dmar_domain *domain)
4213{
4214 struct device_domain_info *info;
4215
4216 assert_spin_locked(&domain->lock);
4217 /*
4218 * Second level page table supports per-PTE snoop control. The
4219 * iommu_map() interface will handle this by setting SNP bit.
4220 */
4221 if (!domain->use_first_level) {
4222 domain->set_pte_snp = true;
4223 return;
4224 }
4225
4226 list_for_each_entry(info, &domain->devices, link)
4227 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4228 IOMMU_NO_PASID);
4229}
4230
4231static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4232{
4233 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4234 unsigned long flags;
4235
4236 if (dmar_domain->force_snooping)
4237 return true;
4238
4239 spin_lock_irqsave(&dmar_domain->lock, flags);
4240 if (!domain_support_force_snooping(dmar_domain) ||
4241 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
4242 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4243 return false;
4244 }
4245
4246 domain_set_force_snooping(dmar_domain);
4247 dmar_domain->force_snooping = true;
4248 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4249
4250 return true;
4251}
4252
4253static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4254{
4255 struct device_domain_info *info = dev_iommu_priv_get(dev);
4256
4257 switch (cap) {
4258 case IOMMU_CAP_CACHE_COHERENCY:
4259 case IOMMU_CAP_DEFERRED_FLUSH:
4260 return true;
4261 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4262 return dmar_platform_optin();
4263 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4264 return ecap_sc_support(info->iommu->ecap);
4265 case IOMMU_CAP_DIRTY_TRACKING:
4266 return ssads_supported(info->iommu);
4267 default:
4268 return false;
4269 }
4270}
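
/*
 * Callers typically reach this through device_iommu_capable(). A minimal
 * sketch (hypothetical caller and flag variable):
 *
 *	if (device_iommu_capable(dev, IOMMU_CAP_CACHE_COHERENCY))
 *		coherent_dma = true;
 */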
4271
4272static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4273{
4274 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4275 struct device_domain_info *info;
4276 struct intel_iommu *iommu;
4277 u8 bus, devfn;
4278 int ret;
4279
4280 iommu = device_lookup_iommu(dev, &bus, &devfn);
4281 if (!iommu || !iommu->iommu.ops)
4282 return ERR_PTR(-ENODEV);
4283
4284 info = kzalloc(sizeof(*info), GFP_KERNEL);
4285 if (!info)
4286 return ERR_PTR(-ENOMEM);
4287
4288 if (dev_is_real_dma_subdevice(dev)) {
4289 info->bus = pdev->bus->number;
4290 info->devfn = pdev->devfn;
4291 info->segment = pci_domain_nr(pdev->bus);
4292 } else {
4293 info->bus = bus;
4294 info->devfn = devfn;
4295 info->segment = iommu->segment;
4296 }
4297
4298 info->dev = dev;
4299 info->iommu = iommu;
4300 if (dev_is_pci(dev)) {
4301 if (ecap_dev_iotlb_support(iommu->ecap) &&
4302 pci_ats_supported(pdev) &&
4303 dmar_ats_supported(pdev, iommu)) {
4304 info->ats_supported = 1;
4305 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4306
4307 /*
			 * For an IOMMU that supports device IOTLB throttling
			 * (DIT), we assign the PFSID to the invalidation
			 * descriptors of a VF so that the IOMMU HW can gauge
			 * queue depth at the PF level. If DIT is not
			 * supported, the PFSID field is treated as reserved
			 * and should be set to 0.
4313 */
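			/*
			 * For example (hypothetical BDF): for VF 0000:3b:10.2
			 * whose PF is 0000:3b:00.0, pci_dev_id(pci_physfn(pdev))
			 * yields the PF requester ID 0x3b00, which is used as
			 * the PFSID.
			 */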
4314 if (ecap_dit(iommu->ecap))
4315 info->pfsid = pci_dev_id(pci_physfn(pdev));
4316 info->ats_qdep = pci_ats_queue_depth(pdev);
4317 }
4318 if (sm_supported(iommu)) {
4319 if (pasid_supported(iommu)) {
4320 int features = pci_pasid_features(pdev);
4321
4322 if (features >= 0)
4323 info->pasid_supported = features | 1;
4324 }
4325
4326 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4327 pci_pri_supported(pdev))
4328 info->pri_supported = 1;
4329 }
4330 }
4331
4332 dev_iommu_priv_set(dev, info);
4333
4334 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4335 ret = intel_pasid_alloc_table(dev);
4336 if (ret) {
4337 dev_err(dev, "PASID table allocation failed\n");
4338 kfree(info);
4339 return ERR_PTR(ret);
4340 }
4341 }
4342
4343 intel_iommu_debugfs_create_dev(info);
4344
4345 return &iommu->iommu;
4346}
4347
4348static void intel_iommu_release_device(struct device *dev)
4349{
4350 struct device_domain_info *info = dev_iommu_priv_get(dev);
4351
4352 dmar_remove_one_dev_info(dev);
4353 intel_pasid_free_table(dev);
4354 intel_iommu_debugfs_remove_dev(info);
4355 kfree(info);
4356 set_dma_ops(dev, NULL);
4357}
4358
4359static void intel_iommu_probe_finalize(struct device *dev)
4360{
4361 set_dma_ops(dev, NULL);
4362 iommu_setup_dma_ops(dev, 0, U64_MAX);
4363}
4364
4365static void intel_iommu_get_resv_regions(struct device *device,
4366 struct list_head *head)
4367{
4368 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4369 struct iommu_resv_region *reg;
4370 struct dmar_rmrr_unit *rmrr;
4371 struct device *i_dev;
4372 int i;
4373
4374 rcu_read_lock();
4375 for_each_rmrr_units(rmrr) {
4376 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4377 i, i_dev) {
4378 struct iommu_resv_region *resv;
4379 enum iommu_resv_type type;
4380 size_t length;
4381
4382 if (i_dev != device &&
4383 !is_downstream_to_pci_bridge(device, i_dev))
4384 continue;
4385
4386 length = rmrr->end_address - rmrr->base_address + 1;
4387
4388 type = device_rmrr_is_relaxable(device) ?
4389 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4390
4391 resv = iommu_alloc_resv_region(rmrr->base_address,
4392 length, prot, type,
4393 GFP_ATOMIC);
4394 if (!resv)
4395 break;
4396
4397 list_add_tail(&resv->list, head);
4398 }
4399 }
4400 rcu_read_unlock();
4401
4402#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4403 if (dev_is_pci(device)) {
4404 struct pci_dev *pdev = to_pci_dev(device);
4405
4406 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4407 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4408 IOMMU_RESV_DIRECT_RELAXABLE,
4409 GFP_KERNEL);
4410 if (reg)
				list_add_tail(&reg->list, head);
4412 }
4413 }
4414#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4415
4416 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4417 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4418 0, IOMMU_RESV_MSI, GFP_KERNEL);
4419 if (!reg)
4420 return;
	list_add_tail(&reg->list, head);
4422}
4423
4424static struct iommu_group *intel_iommu_device_group(struct device *dev)
4425{
4426 if (dev_is_pci(dev))
4427 return pci_device_group(dev);
4428 return generic_device_group(dev);
4429}
4430
4431static int intel_iommu_enable_sva(struct device *dev)
4432{
4433 struct device_domain_info *info = dev_iommu_priv_get(dev);
4434 struct intel_iommu *iommu;
4435
4436 if (!info || dmar_disabled)
4437 return -EINVAL;
4438
4439 iommu = info->iommu;
4440 if (!iommu)
4441 return -EINVAL;
4442
4443 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4444 return -ENODEV;
4445
4446 if (!info->pasid_enabled || !info->ats_enabled)
4447 return -EINVAL;
4448
	/*
	 * Devices with device-specific I/O fault handling should not
	 * support PCI/PRI. The IOMMU has no way to check for such
	 * device-specific IOPF capability, so it assumes that a driver
	 * enabling SVA on a non-PRI device will handle I/O page faults
	 * in its own way.
	 */
4456 if (!info->pri_supported)
4457 return 0;
4458
4459 /* Devices supporting PRI should have it enabled. */
4460 if (!info->pri_enabled)
4461 return -EINVAL;
4462
4463 return 0;
4464}
4465
4466static int intel_iommu_enable_iopf(struct device *dev)
4467{
4468 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4469 struct device_domain_info *info = dev_iommu_priv_get(dev);
4470 struct intel_iommu *iommu;
4471 int ret;
4472
4473 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
4474 return -ENODEV;
4475
4476 if (info->pri_enabled)
4477 return -EBUSY;
4478
4479 iommu = info->iommu;
4480 if (!iommu)
4481 return -EINVAL;
4482
4483 /* PASID is required in PRG Response Message. */
4484 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
4485 return -EINVAL;
4486
4487 ret = pci_reset_pri(pdev);
4488 if (ret)
4489 return ret;
4490
4491 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4492 if (ret)
4493 return ret;
4494
4495 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4496 if (ret)
4497 goto iopf_remove_device;
4498
4499 ret = pci_enable_pri(pdev, PRQ_DEPTH);
4500 if (ret)
4501 goto iopf_unregister_handler;
4502 info->pri_enabled = 1;
4503
4504 return 0;
4505
4506iopf_unregister_handler:
4507 iommu_unregister_device_fault_handler(dev);
4508iopf_remove_device:
4509 iopf_queue_remove_device(iommu->iopf_queue, dev);
4510
4511 return ret;
4512}
4513
4514static int intel_iommu_disable_iopf(struct device *dev)
4515{
4516 struct device_domain_info *info = dev_iommu_priv_get(dev);
4517 struct intel_iommu *iommu = info->iommu;
4518
4519 if (!info->pri_enabled)
4520 return -EINVAL;
4521
	/*
	 * The PCIe spec states that clearing the PRI enable bit stops the
	 * Page Request Interface from issuing new page requests, but it may
	 * still have outstanding page requests that have been transmitted
	 * or are queued for transmission. This is supposed to be called
	 * after the device driver has stopped DMA, all PASIDs have been
	 * unbound and the outstanding PRQs have been drained.
	 */
4529 */
4530 pci_disable_pri(to_pci_dev(dev));
4531 info->pri_enabled = 0;
4532
4533 /*
4534 * With PRI disabled and outstanding PRQs drained, unregistering
4535 * fault handler and removing device from iopf queue should never
4536 * fail.
4537 */
4538 WARN_ON(iommu_unregister_device_fault_handler(dev));
4539 WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev));
4540
4541 return 0;
4542}
4543
4544static int
4545intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4546{
4547 switch (feat) {
4548 case IOMMU_DEV_FEAT_IOPF:
4549 return intel_iommu_enable_iopf(dev);
4550
4551 case IOMMU_DEV_FEAT_SVA:
4552 return intel_iommu_enable_sva(dev);
4553
4554 default:
4555 return -ENODEV;
4556 }
4557}
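
/*
 * A minimal usage sketch (hypothetical driver code): drivers enable IOPF
 * before SVA and tear the features down in the reverse order:
 *
 *	iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *	iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	...
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 */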
4558
4559static int
4560intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4561{
4562 switch (feat) {
4563 case IOMMU_DEV_FEAT_IOPF:
4564 return intel_iommu_disable_iopf(dev);
4565
4566 case IOMMU_DEV_FEAT_SVA:
4567 return 0;
4568
4569 default:
4570 return -ENODEV;
4571 }
4572}
4573
4574static bool intel_iommu_is_attach_deferred(struct device *dev)
4575{
4576 struct device_domain_info *info = dev_iommu_priv_get(dev);
4577
4578 return translation_pre_enabled(info->iommu) && !info->domain;
4579}
4580
4581/*
4582 * Check that the device does not live on an external facing PCI port that is
4583 * marked as untrusted. Such devices should not be able to apply quirks and
4584 * thus not be able to bypass the IOMMU restrictions.
4585 */
4586static bool risky_device(struct pci_dev *pdev)
4587{
4588 if (pdev->untrusted) {
4589 pci_info(pdev,
4590 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4591 pdev->vendor, pdev->device);
4592 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4593 return true;
4594 }
4595 return false;
4596}
4597
4598static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4599 unsigned long iova, size_t size)
4600{
4601 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4602 unsigned long pages = aligned_nrpages(iova, size);
4603 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4604 struct iommu_domain_info *info;
4605 unsigned long i;
4606
4607 xa_for_each(&dmar_domain->iommu_array, i, info)
4608 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4609 return 0;
4610}
4611
4612static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4613{
4614 struct device_domain_info *info = dev_iommu_priv_get(dev);
4615 struct dev_pasid_info *curr, *dev_pasid = NULL;
4616 struct intel_iommu *iommu = info->iommu;
4617 struct dmar_domain *dmar_domain;
4618 struct iommu_domain *domain;
4619 unsigned long flags;
4620
4621 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4622 if (WARN_ON_ONCE(!domain))
4623 goto out_tear_down;
4624
	/*
	 * The SVA implementation needs to handle its own details such as mm
	 * notifications. Until that code is consolidated into the iommu
	 * core, let the intel sva code handle it.
	 */
4629 */
4630 if (domain->type == IOMMU_DOMAIN_SVA) {
4631 intel_svm_remove_dev_pasid(dev, pasid);
4632 goto out_tear_down;
4633 }
4634
4635 dmar_domain = to_dmar_domain(domain);
4636 spin_lock_irqsave(&dmar_domain->lock, flags);
4637 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4638 if (curr->dev == dev && curr->pasid == pasid) {
4639 list_del(&curr->link_domain);
4640 dev_pasid = curr;
4641 break;
4642 }
4643 }
4644 WARN_ON_ONCE(!dev_pasid);
4645 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4646
4647 domain_detach_iommu(dmar_domain, iommu);
4648 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4649 kfree(dev_pasid);
4650out_tear_down:
4651 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4652 intel_drain_pasid_prq(dev, pasid);
4653}
4654
4655static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4656 struct device *dev, ioasid_t pasid)
4657{
4658 struct device_domain_info *info = dev_iommu_priv_get(dev);
4659 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4660 struct intel_iommu *iommu = info->iommu;
4661 struct dev_pasid_info *dev_pasid;
4662 unsigned long flags;
4663 int ret;
4664
4665 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4666 return -EOPNOTSUPP;
4667
4668 if (domain->dirty_ops)
4669 return -EINVAL;
4670
4671 if (context_copied(iommu, info->bus, info->devfn))
4672 return -EBUSY;
4673
4674 ret = prepare_domain_attach_device(domain, dev);
4675 if (ret)
4676 return ret;
4677
4678 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4679 if (!dev_pasid)
4680 return -ENOMEM;
4681
4682 ret = domain_attach_iommu(dmar_domain, iommu);
4683 if (ret)
4684 goto out_free;
4685
4686 if (domain_type_is_si(dmar_domain))
4687 ret = intel_pasid_setup_pass_through(iommu, dev, pasid);
4688 else if (dmar_domain->use_first_level)
4689 ret = domain_setup_first_level(iommu, dmar_domain,
4690 dev, pasid);
4691 else
4692 ret = intel_pasid_setup_second_level(iommu, dmar_domain,
4693 dev, pasid);
4694 if (ret)
4695 goto out_detach_iommu;
4696
4697 dev_pasid->dev = dev;
4698 dev_pasid->pasid = pasid;
4699 spin_lock_irqsave(&dmar_domain->lock, flags);
4700 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4701 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4702
4703 if (domain->type & __IOMMU_DOMAIN_PAGING)
4704 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4705
4706 return 0;
4707out_detach_iommu:
4708 domain_detach_iommu(dmar_domain, iommu);
4709out_free:
4710 kfree(dev_pasid);
4711 return ret;
4712}
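
/*
 * This op is reached via iommu_attach_device_pasid(); the matching teardown
 * goes through iommu_detach_device_pasid(), which ends up in
 * intel_iommu_remove_dev_pasid() above. A minimal sketch (hypothetical
 * caller, PASID value made up):
 *
 *	ret = iommu_attach_device_pasid(domain, dev, 10);
 *	...
 *	iommu_detach_device_pasid(domain, dev, 10);
 */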
4713
4714static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4715{
4716 struct device_domain_info *info = dev_iommu_priv_get(dev);
4717 struct intel_iommu *iommu = info->iommu;
4718 struct iommu_hw_info_vtd *vtd;
4719
4720 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4721 if (!vtd)
4722 return ERR_PTR(-ENOMEM);
4723
4724 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4725 vtd->cap_reg = iommu->cap;
4726 vtd->ecap_reg = iommu->ecap;
4727 *length = sizeof(*vtd);
4728 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4729 return vtd;
4730}
4731
4732/*
4733 * Set dirty tracking for the device list of a domain. The caller must
4734 * hold the domain->lock when calling it.
4735 */
4736static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4737{
4738 struct device_domain_info *info;
4739 int ret = 0;
4740
4741 list_for_each_entry(info, devices, link) {
4742 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4743 IOMMU_NO_PASID, enable);
4744 if (ret)
4745 break;
4746 }
4747
4748 return ret;
4749}
4750
4751static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4752 bool enable)
4753{
4754 struct dmar_domain *s1_domain;
4755 unsigned long flags;
4756 int ret;
4757
4758 spin_lock(&domain->s1_lock);
4759 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4760 spin_lock_irqsave(&s1_domain->lock, flags);
4761 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4762 spin_unlock_irqrestore(&s1_domain->lock, flags);
4763 if (ret)
4764 goto err_unwind;
4765 }
4766 spin_unlock(&domain->s1_lock);
4767 return 0;
4768
4769err_unwind:
4770 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4771 spin_lock_irqsave(&s1_domain->lock, flags);
4772 device_set_dirty_tracking(&s1_domain->devices,
4773 domain->dirty_tracking);
4774 spin_unlock_irqrestore(&s1_domain->lock, flags);
4775 }
4776 spin_unlock(&domain->s1_lock);
4777 return ret;
4778}
4779
4780static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4781 bool enable)
4782{
4783 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4784 int ret;
4785
4786 spin_lock(&dmar_domain->lock);
4787 if (dmar_domain->dirty_tracking == enable)
4788 goto out_unlock;
4789
4790 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4791 if (ret)
4792 goto err_unwind;
4793
4794 if (dmar_domain->nested_parent) {
4795 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4796 if (ret)
4797 goto err_unwind;
4798 }
4799
4800 dmar_domain->dirty_tracking = enable;
4801out_unlock:
4802 spin_unlock(&dmar_domain->lock);
4803
4804 return 0;
4805
4806err_unwind:
4807 device_set_dirty_tracking(&dmar_domain->devices,
4808 dmar_domain->dirty_tracking);
4809 spin_unlock(&dmar_domain->lock);
4810 return ret;
4811}
4812
4813static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4814 unsigned long iova, size_t size,
4815 unsigned long flags,
4816 struct iommu_dirty_bitmap *dirty)
4817{
4818 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4819 unsigned long end = iova + size - 1;
4820 unsigned long pgsize;
4821
4822 /*
	 * The IOMMUFD core calls into a dirty-tracking-disabled domain
	 * without an IOVA bitmap set in order to clear any dirty bits that
	 * might have been left in the PTEs when we stopped dirty tracking.
	 * This ensures that we never inherit dirtied bits from a previous
	 * cycle.
4827 */
4828 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4829 return -EINVAL;
4830
4831 do {
4832 struct dma_pte *pte;
4833 int lvl = 0;
4834
4835 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4836 GFP_ATOMIC);
4837 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4838 if (!pte || !dma_pte_present(pte)) {
4839 iova += pgsize;
4840 continue;
4841 }
4842
4843 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4844 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4845 iova += pgsize;
4846 } while (iova < end);
4847
4848 return 0;
4849}
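
/*
 * For example (hypothetical layout): a dirty 2MiB second-level leaf covering
 * IOVA 0x40000000 is reported as iommu_dirty_bitmap_record(dirty, 0x40000000,
 * SZ_2M), and its dirty bit is cleared unless the caller passed
 * IOMMU_DIRTY_NO_CLEAR in flags.
 */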
4850
4851static const struct iommu_dirty_ops intel_dirty_ops = {
4852 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4853 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4854};
4855
4856const struct iommu_ops intel_iommu_ops = {
4857 .blocked_domain = &blocking_domain,
4858 .capable = intel_iommu_capable,
4859 .hw_info = intel_iommu_hw_info,
4860 .domain_alloc = intel_iommu_domain_alloc,
4861 .domain_alloc_user = intel_iommu_domain_alloc_user,
4862 .probe_device = intel_iommu_probe_device,
4863 .probe_finalize = intel_iommu_probe_finalize,
4864 .release_device = intel_iommu_release_device,
4865 .get_resv_regions = intel_iommu_get_resv_regions,
4866 .device_group = intel_iommu_device_group,
4867 .dev_enable_feat = intel_iommu_dev_enable_feat,
4868 .dev_disable_feat = intel_iommu_dev_disable_feat,
4869 .is_attach_deferred = intel_iommu_is_attach_deferred,
4870 .def_domain_type = device_def_domain_type,
4871 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4872 .pgsize_bitmap = SZ_4K,
4873#ifdef CONFIG_INTEL_IOMMU_SVM
4874 .page_response = intel_svm_page_response,
4875#endif
4876 .default_domain_ops = &(const struct iommu_domain_ops) {
4877 .attach_dev = intel_iommu_attach_device,
4878 .set_dev_pasid = intel_iommu_set_dev_pasid,
4879 .map_pages = intel_iommu_map_pages,
4880 .unmap_pages = intel_iommu_unmap_pages,
4881 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4882 .flush_iotlb_all = intel_flush_iotlb_all,
4883 .iotlb_sync = intel_iommu_tlb_sync,
4884 .iova_to_phys = intel_iommu_iova_to_phys,
4885 .free = intel_iommu_domain_free,
4886 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4887 }
4888};
4889
4890static void quirk_iommu_igfx(struct pci_dev *dev)
4891{
4892 if (risky_device(dev))
4893 return;
4894
4895 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4896 dmar_map_gfx = 0;
4897}
4898
4899/* G4x/GM45 integrated gfx dmar support is totally busted. */
4900DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4901DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4902DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4903DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4904DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4905DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4906DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4907
4908/* Broadwell igfx malfunctions with dmar */
4909DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4910DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4912DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4913DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4914DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4915DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4916DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4917DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4918DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4919DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4920DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4921DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4922DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4923DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4924DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4925DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4926DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4927DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4928DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4929DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4930DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4931DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4932DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4933
4934static void quirk_iommu_rwbf(struct pci_dev *dev)
4935{
4936 if (risky_device(dev))
4937 return;
4938
4939 /*
4940 * Mobile 4 Series Chipset neglects to set RWBF capability,
4941 * but needs it. Same seems to hold for the desktop versions.
4942 */
4943 pci_info(dev, "Forcing write-buffer flush capability\n");
4944 rwbf_quirk = 1;
4945}
4946
4947DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4948DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4949DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4950DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4951DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4952DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4953DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4954
4955#define GGC 0x52
4956#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4957#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4958#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4959#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4960#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4961#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4962#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4963#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4964
4965static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4966{
4967 unsigned short ggc;
4968
4969 if (risky_device(dev))
4970 return;
4971
4972 if (pci_read_config_word(dev, GGC, &ggc))
4973 return;
4974
4975 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4976 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4977 dmar_map_gfx = 0;
4978 } else if (dmar_map_gfx) {
4979 /* we have to ensure the gfx device is idle before we flush */
4980 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4981 iommu_set_dma_strict();
4982 }
4983}
4984DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4985DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4986DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4987DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4988
4989static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4990{
4991 unsigned short ver;
4992
4993 if (!IS_GFX_DEVICE(dev))
4994 return;
4995
4996 ver = (dev->device >> 8) & 0xff;
4997 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4998 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4999 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
5000 return;
5001
5002 if (risky_device(dev))
5003 return;
5004
5005 pci_info(dev, "Skip IOMMU disabling for graphics\n");
5006 iommu_skip_te_disable = 1;
5007}
5008DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
5009
5010/* On Tylersburg chipsets, some BIOSes have been known to enable the
5011 ISOCH DMAR unit for the Azalia sound device, but not give it any
5012 TLB entries, which causes it to deadlock. Check for that. We do
5013 this in a function called from init_dmars(), instead of in a PCI
5014 quirk, because we don't want to print the obnoxious "BIOS broken"
5015 message if VT-d is actually disabled.
5016*/
5017static void __init check_tylersburg_isoch(void)
5018{
5019 struct pci_dev *pdev;
5020 uint32_t vtisochctrl;
5021
5022 /* If there's no Azalia in the system anyway, forget it. */
5023 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5024 if (!pdev)
5025 return;
5026
5027 if (risky_device(pdev)) {
5028 pci_dev_put(pdev);
5029 return;
5030 }
5031
5032 pci_dev_put(pdev);
5033
5034 /* System Management Registers. Might be hidden, in which case
5035 we can't do the sanity check. But that's OK, because the
5036 known-broken BIOSes _don't_ actually hide it, so far. */
5037 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5038 if (!pdev)
5039 return;
5040
5041 if (risky_device(pdev)) {
5042 pci_dev_put(pdev);
5043 return;
5044 }
5045
5046 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5047 pci_dev_put(pdev);
5048 return;
5049 }
5050
5051 pci_dev_put(pdev);
5052
5053 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5054 if (vtisochctrl & 1)
5055 return;
5056
5057 /* Drop all bits other than the number of TLB entries */
5058 vtisochctrl &= 0x1c;
5059
5060 /* If we have the recommended number of TLB entries (16), fine. */
5061 if (vtisochctrl == 0x10)
5062 return;
5063
5064 /* Zero TLB entries? You get to ride the short bus to school. */
5065 if (!vtisochctrl) {
5066 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5067 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5068 dmi_get_system_info(DMI_BIOS_VENDOR),
5069 dmi_get_system_info(DMI_BIOS_VERSION),
5070 dmi_get_system_info(DMI_PRODUCT_VERSION));
5071 iommu_identity_mapping |= IDENTMAP_AZALIA;
5072 return;
5073 }
5074
5075 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5076 vtisochctrl);
5077}
5078
5079/*
 * Here we deal with a device TLB defect where the device may inadvertently
 * issue an ATS invalidation completion before posted writes initiated with a
 * translated address that used translations matching the invalidation address
 * range, violating the invalidation completion ordering.
 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
 * is vulnerable to this defect. In other words, any dTLB invalidation that is
 * not initiated under the control of the trusted/privileged host device
 * driver must use this quirk.
 * Device TLBs are invalidated under the following six conditions:
 * 1. The device driver unmaps an IOVA via the DMA API
 * 2. The device driver unbinds a PASID from a process, sva_unbind_device()
 * 3. A PASID is torn down after the PASID cache is flushed, e.g. process
 *    exit_mmap() due to a crash
 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() when the
 *    VM has to free pages that were unmapped
 * 5. A userspace driver unmaps a DMA buffer
 * 6. Cache invalidation in vSVA usage (upcoming)
5097 *
5098 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5099 * before unmap/unbind. For #3, iommu driver gets mmu_notifier to
5100 * invalidate TLB the same way as normal user unmap which will use this quirk.
5101 * The dTLB invalidation after PASID cache flush does not need this quirk.
5102 *
5103 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5104 */
5105void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5106 unsigned long address, unsigned long mask,
5107 u32 pasid, u16 qdep)
5108{
5109 u16 sid;
5110
5111 if (likely(!info->dtlb_extra_inval))
5112 return;
5113
5114 sid = PCI_DEVID(info->bus, info->devfn);
5115 if (pasid == IOMMU_NO_PASID) {
5116 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5117 qdep, address, mask);
5118 } else {
5119 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5120 pasid, qdep, address, mask);
5121 }
5122}
5123
5124#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
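
/*
 * For example (hypothetical response value): res == 0x06 has the IP bit
 * (bit 0) clear and ecmd_get_status_code(res) == 3, i.e. bits 7:1 of the
 * low byte carry the status code.
 */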
5125
5126/*
5127 * Function to submit a command to the enhanced command interface. The
5128 * valid enhanced command descriptions are defined in Table 47 of the
5129 * VT-d spec. The VT-d hardware implementation may support some but not
5130 * all commands, which can be determined by checking the Enhanced
5131 * Command Capability Register.
5132 *
5133 * Return values:
5134 * - 0: Command successful without any error;
5135 * - Negative: software error value;
5136 * - Nonzero positive: failure status code defined in Table 48.
 * - Nonzero positive: failure status code defined in Table 48.
 */
5138int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
5139{
5140 unsigned long flags;
5141 u64 res;
5142 int ret;
5143
5144 if (!cap_ecmds(iommu->cap))
5145 return -ENODEV;
5146
5147 raw_spin_lock_irqsave(&iommu->register_lock, flags);
5148
5149 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
5150 if (res & DMA_ECMD_ECRSP_IP) {
5151 ret = -EBUSY;
5152 goto err;
5153 }
5154
5155 /*
	 * Unconditionally write operand B, because:
	 * - There is no side effect if an ecmd doesn't require operand B,
	 *   but the register is set to some value anyway.
	 * - This is not invoked in any critical path, so the extra MMIO
	 *   write doesn't raise any performance concerns.
5161 */
5162 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
5163 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
5164
5165 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
5166 !(res & DMA_ECMD_ECRSP_IP), res);
5167
5168 if (res & DMA_ECMD_ECRSP_IP) {
5169 ret = -ETIMEDOUT;
5170 goto err;
5171 }
5172
5173 ret = ecmd_get_status_code(res);
5174err:
5175 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
5176
5177 return ret;
5178}
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/intel-svm.h>
20#include <linux/memory.h>
21#include <linux/pci.h>
22#include <linux/pci-ats.h>
23#include <linux/spinlock.h>
24#include <linux/syscore_ops.h>
25#include <linux/tboot.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-sva.h"
31#include "pasid.h"
32#include "cap_audit.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define MAX_AGAW_WIDTH 64
49#define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50
51#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53
54/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
57 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
59
60/* IO virtual address start page frame number */
61#define IOVA_START_PFN (1)
62
63#define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
64
65/* page table handling */
66#define LEVEL_STRIDE (9)
67#define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
68
69static inline int agaw_to_level(int agaw)
70{
71 return agaw + 2;
72}
73
74static inline int agaw_to_width(int agaw)
75{
76 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
77}
78
79static inline int width_to_agaw(int width)
80{
81 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
82}
83
84static inline unsigned int level_to_offset_bits(int level)
85{
86 return (level - 1) * LEVEL_STRIDE;
87}
88
89static inline int pfn_level_offset(u64 pfn, int level)
90{
91 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
92}
93
94static inline u64 level_mask(int level)
95{
96 return -1ULL << level_to_offset_bits(level);
97}
98
99static inline u64 level_size(int level)
100{
101 return 1ULL << level_to_offset_bits(level);
102}
103
104static inline u64 align_to_level(u64 pfn, int level)
105{
106 return (pfn + level_size(level) - 1) & level_mask(level);
107}
108
109static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
110{
111 return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
112}
113
114/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115 are never going to work. */
116static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
117{
118 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
119}
120static inline unsigned long page_to_dma_pfn(struct page *pg)
121{
122 return mm_to_dma_pfn(page_to_pfn(pg));
123}
124static inline unsigned long virt_to_dma_pfn(void *p)
125{
126 return page_to_dma_pfn(virt_to_page(p));
127}
128
129static void __init check_tylersburg_isoch(void);
130static int rwbf_quirk;
131
132/*
133 * set to 1 to panic kernel if can't successfully enable VT-d
134 * (used when kernel is launched w/ TXT)
135 */
136static int force_on = 0;
137static int intel_iommu_tboot_noforce;
138static int no_platform_optin;
139
140#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141
142/*
143 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144 * if marked present.
145 */
146static phys_addr_t root_entry_lctp(struct root_entry *re)
147{
148 if (!(re->lo & 1))
149 return 0;
150
151 return re->lo & VTD_PAGE_MASK;
152}
153
154/*
155 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156 * if marked present.
157 */
158static phys_addr_t root_entry_uctp(struct root_entry *re)
159{
160 if (!(re->hi & 1))
161 return 0;
162
163 return re->hi & VTD_PAGE_MASK;
164}
165
166static inline void context_set_present(struct context_entry *context)
167{
168 context->lo |= 1;
169}
170
171static inline void context_set_fault_enable(struct context_entry *context)
172{
173 context->lo &= (((u64)-1) << 2) | 1;
174}
175
176static inline void context_set_translation_type(struct context_entry *context,
177 unsigned long value)
178{
179 context->lo &= (((u64)-1) << 4) | 3;
180 context->lo |= (value & 3) << 2;
181}
182
183static inline void context_set_address_root(struct context_entry *context,
184 unsigned long value)
185{
186 context->lo &= ~VTD_PAGE_MASK;
187 context->lo |= value & VTD_PAGE_MASK;
188}
189
190static inline void context_set_address_width(struct context_entry *context,
191 unsigned long value)
192{
193 context->hi |= value & 7;
194}
195
196static inline void context_set_domain_id(struct context_entry *context,
197 unsigned long value)
198{
199 context->hi |= (value & ((1 << 16) - 1)) << 8;
200}
201
202static inline void context_set_pasid(struct context_entry *context)
203{
204 context->lo |= CONTEXT_PASIDE;
205}
206
207static inline int context_domain_id(struct context_entry *c)
208{
209 return((c->hi >> 8) & 0xffff);
210}
211
212static inline void context_clear_entry(struct context_entry *context)
213{
214 context->lo = 0;
215 context->hi = 0;
216}
217
218static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
219{
220 if (!iommu->copied_tables)
221 return false;
222
223 return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
224}
225
226static inline void
227set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
228{
229 set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
230}
231
232static inline void
233clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
234{
235 clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
236}
237
238/*
239 * This domain is a statically identity mapping domain.
240 * 1. This domain creats a static 1:1 mapping to all usable memory.
241 * 2. It maps to each iommu if successful.
242 * 3. Each iommu mapps to this domain if successful.
243 */
244static struct dmar_domain *si_domain;
245static int hw_pass_through = 1;
246
247struct dmar_rmrr_unit {
248 struct list_head list; /* list of rmrr units */
249 struct acpi_dmar_header *hdr; /* ACPI header */
250 u64 base_address; /* reserved base address*/
251 u64 end_address; /* reserved end address */
252 struct dmar_dev_scope *devices; /* target devices */
253 int devices_cnt; /* target device count */
254};
255
256struct dmar_atsr_unit {
257 struct list_head list; /* list of ATSR units */
258 struct acpi_dmar_header *hdr; /* ACPI header */
259 struct dmar_dev_scope *devices; /* target devices */
260 int devices_cnt; /* target device count */
261 u8 include_all:1; /* include all ports */
262};
263
264struct dmar_satc_unit {
265 struct list_head list; /* list of SATC units */
266 struct acpi_dmar_header *hdr; /* ACPI header */
267 struct dmar_dev_scope *devices; /* target devices */
268 struct intel_iommu *iommu; /* the corresponding iommu */
269 int devices_cnt; /* target device count */
270 u8 atc_required:1; /* ATS is required */
271};
272
273static LIST_HEAD(dmar_atsr_units);
274static LIST_HEAD(dmar_rmrr_units);
275static LIST_HEAD(dmar_satc_units);
276
277#define for_each_rmrr_units(rmrr) \
278 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279
280static void device_block_translation(struct device *dev);
281static void intel_iommu_domain_free(struct iommu_domain *domain);
282
283int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
284int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
285
286int intel_iommu_enabled = 0;
287EXPORT_SYMBOL_GPL(intel_iommu_enabled);
288
289static int dmar_map_gfx = 1;
290static int intel_iommu_superpage = 1;
291static int iommu_identity_mapping;
292static int iommu_skip_te_disable;
293
294#define IDENTMAP_GFX 2
295#define IDENTMAP_AZALIA 4
296
297const struct iommu_ops intel_iommu_ops;
298
299static bool translation_pre_enabled(struct intel_iommu *iommu)
300{
301 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
302}
303
304static void clear_translation_pre_enabled(struct intel_iommu *iommu)
305{
306 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
307}
308
309static void init_translation_status(struct intel_iommu *iommu)
310{
311 u32 gsts;
312
313 gsts = readl(iommu->reg + DMAR_GSTS_REG);
314 if (gsts & DMA_GSTS_TES)
315 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
316}
317
318static int __init intel_iommu_setup(char *str)
319{
320 if (!str)
321 return -EINVAL;
322
323 while (*str) {
324 if (!strncmp(str, "on", 2)) {
325 dmar_disabled = 0;
326 pr_info("IOMMU enabled\n");
327 } else if (!strncmp(str, "off", 3)) {
328 dmar_disabled = 1;
329 no_platform_optin = 1;
330 pr_info("IOMMU disabled\n");
331 } else if (!strncmp(str, "igfx_off", 8)) {
332 dmar_map_gfx = 0;
333 pr_info("Disable GFX device mapping\n");
334 } else if (!strncmp(str, "forcedac", 8)) {
335 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
336 iommu_dma_forcedac = true;
337 } else if (!strncmp(str, "strict", 6)) {
338 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
339 iommu_set_dma_strict();
340 } else if (!strncmp(str, "sp_off", 6)) {
341 pr_info("Disable supported super page\n");
342 intel_iommu_superpage = 0;
343 } else if (!strncmp(str, "sm_on", 5)) {
344 pr_info("Enable scalable mode if hardware supports\n");
345 intel_iommu_sm = 1;
346 } else if (!strncmp(str, "sm_off", 6)) {
347 pr_info("Scalable mode is disallowed\n");
348 intel_iommu_sm = 0;
349 } else if (!strncmp(str, "tboot_noforce", 13)) {
350 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
351 intel_iommu_tboot_noforce = 1;
352 } else {
353 pr_notice("Unknown option - '%s'\n", str);
354 }
355
356 str += strcspn(str, ",");
357 while (*str == ',')
358 str++;
359 }
360
361 return 1;
362}
363__setup("intel_iommu=", intel_iommu_setup);
364
365void *alloc_pgtable_page(int node)
366{
367 struct page *page;
368 void *vaddr = NULL;
369
370 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
371 if (page)
372 vaddr = page_address(page);
373 return vaddr;
374}
375
376void free_pgtable_page(void *vaddr)
377{
378 free_page((unsigned long)vaddr);
379}
380
381static inline int domain_type_is_si(struct dmar_domain *domain)
382{
383 return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
384}
385
386static inline int domain_pfn_supported(struct dmar_domain *domain,
387 unsigned long pfn)
388{
389 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
390
391 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
392}
393
394/*
395 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
396 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
397 * the returned SAGAW.
398 */
399static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
400{
401 unsigned long fl_sagaw, sl_sagaw;
402
403 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
404 sl_sagaw = cap_sagaw(iommu->cap);
405
406 /* Second level only. */
407 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
408 return sl_sagaw;
409
410 /* First level only. */
411 if (!ecap_slts(iommu->ecap))
412 return fl_sagaw;
413
414 return fl_sagaw & sl_sagaw;
415}
416
417static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
418{
419 unsigned long sagaw;
420 int agaw;
421
422 sagaw = __iommu_calculate_sagaw(iommu);
423 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
424 if (test_bit(agaw, &sagaw))
425 break;
426 }
427
428 return agaw;
429}
430
431/*
432 * Calculate max SAGAW for each iommu.
433 */
434int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
435{
436 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
437}
438
439/*
440 * calculate agaw for each iommu.
441 * "SAGAW" may be different across iommus, use a default agaw, and
442 * get a supported less agaw for iommus that don't support the default agaw.
443 */
444int iommu_calculate_agaw(struct intel_iommu *iommu)
445{
446 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
447}
448
449static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
450{
451 return sm_supported(iommu) ?
452 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
453}
454
455static void domain_update_iommu_coherency(struct dmar_domain *domain)
456{
457 struct iommu_domain_info *info;
458 struct dmar_drhd_unit *drhd;
459 struct intel_iommu *iommu;
460 bool found = false;
461 unsigned long i;
462
463 domain->iommu_coherency = true;
464 xa_for_each(&domain->iommu_array, i, info) {
465 found = true;
466 if (!iommu_paging_structure_coherency(info->iommu)) {
467 domain->iommu_coherency = false;
468 break;
469 }
470 }
471 if (found)
472 return;
473
474 /* No hardware attached; use lowest common denominator */
475 rcu_read_lock();
476 for_each_active_iommu(iommu, drhd) {
477 if (!iommu_paging_structure_coherency(iommu)) {
478 domain->iommu_coherency = false;
479 break;
480 }
481 }
482 rcu_read_unlock();
483}
484
485static int domain_update_iommu_superpage(struct dmar_domain *domain,
486 struct intel_iommu *skip)
487{
488 struct dmar_drhd_unit *drhd;
489 struct intel_iommu *iommu;
490 int mask = 0x3;
491
492 if (!intel_iommu_superpage)
493 return 0;
494
495 /* set iommu_superpage to the smallest common denominator */
496 rcu_read_lock();
497 for_each_active_iommu(iommu, drhd) {
498 if (iommu != skip) {
499 if (domain && domain->use_first_level) {
500 if (!cap_fl1gp_support(iommu->cap))
501 mask = 0x1;
502 } else {
503 mask &= cap_super_page_val(iommu->cap);
504 }
505
506 if (!mask)
507 break;
508 }
509 }
510 rcu_read_unlock();
511
512 return fls(mask);
513}
514
515static int domain_update_device_node(struct dmar_domain *domain)
516{
517 struct device_domain_info *info;
518 int nid = NUMA_NO_NODE;
519 unsigned long flags;
520
521 spin_lock_irqsave(&domain->lock, flags);
522 list_for_each_entry(info, &domain->devices, link) {
523 /*
524 * There could possibly be multiple device numa nodes as devices
525 * within the same domain may sit behind different IOMMUs. There
526 * isn't perfect answer in such situation, so we select first
527 * come first served policy.
528 */
529 nid = dev_to_node(info->dev);
530 if (nid != NUMA_NO_NODE)
531 break;
532 }
533 spin_unlock_irqrestore(&domain->lock, flags);
534
535 return nid;
536}
537
538static void domain_update_iotlb(struct dmar_domain *domain);
539
540/* Return the super pagesize bitmap if supported. */
541static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
542{
543 unsigned long bitmap = 0;
544
545 /*
546 * 1-level super page supports page size of 2MiB, 2-level super page
547 * supports page size of both 2MiB and 1GiB.
548 */
549 if (domain->iommu_superpage == 1)
550 bitmap |= SZ_2M;
551 else if (domain->iommu_superpage == 2)
552 bitmap |= SZ_2M | SZ_1G;
553
554 return bitmap;
555}
556
557/* Some capabilities may be different across iommus */
558static void domain_update_iommu_cap(struct dmar_domain *domain)
559{
560 domain_update_iommu_coherency(domain);
561 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
562
563 /*
564 * If RHSA is missing, we should default to the device numa domain
565 * as fall back.
566 */
567 if (domain->nid == NUMA_NO_NODE)
568 domain->nid = domain_update_device_node(domain);
569
570 /*
571 * First-level translation restricts the input-address to a
572 * canonical address (i.e., address bits 63:N have the same
573 * value as address bit [N-1], where N is 48-bits with 4-level
574 * paging and 57-bits with 5-level paging). Hence, skip bit
575 * [N-1].
576 */
577 if (domain->use_first_level)
578 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
579 else
580 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
581
582 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
583 domain_update_iotlb(domain);
584}
585
586struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
587 u8 devfn, int alloc)
588{
589 struct root_entry *root = &iommu->root_entry[bus];
590 struct context_entry *context;
591 u64 *entry;
592
593 /*
594 * Except that the caller requested to allocate a new entry,
595 * returning a copied context entry makes no sense.
596 */
597 if (!alloc && context_copied(iommu, bus, devfn))
598 return NULL;
599
600 entry = &root->lo;
601 if (sm_supported(iommu)) {
602 if (devfn >= 0x80) {
603 devfn -= 0x80;
604 entry = &root->hi;
605 }
606 devfn *= 2;
607 }
608 if (*entry & 1)
609 context = phys_to_virt(*entry & VTD_PAGE_MASK);
610 else {
611 unsigned long phy_addr;
612 if (!alloc)
613 return NULL;
614
615 context = alloc_pgtable_page(iommu->node);
616 if (!context)
617 return NULL;
618
619 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
620 phy_addr = virt_to_phys((void *)context);
621 *entry = phy_addr | 1;
622 __iommu_flush_cache(iommu, entry, sizeof(*entry));
623 }
624 return &context[devfn];
625}
626
627/**
628 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
629 * sub-hierarchy of a candidate PCI-PCI bridge
630 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
631 * @bridge: the candidate PCI-PCI bridge
632 *
633 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
634 */
635static bool
636is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
637{
638 struct pci_dev *pdev, *pbridge;
639
640 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
641 return false;
642
643 pdev = to_pci_dev(dev);
644 pbridge = to_pci_dev(bridge);
645
646 if (pbridge->subordinate &&
647 pbridge->subordinate->number <= pdev->bus->number &&
648 pbridge->subordinate->busn_res.end >= pdev->bus->number)
649 return true;
650
651 return false;
652}
653
654static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
655{
656 struct dmar_drhd_unit *drhd;
657 u32 vtbar;
658 int rc;
659
660 /* We know that this device on this chipset has its own IOMMU.
661 * If we find it under a different IOMMU, then the BIOS is lying
662 * to us. Hope that the IOMMU for this device is actually
663 * disabled, and it needs no translation...
664 */
665 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
666 if (rc) {
667 /* "can't" happen */
668 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
669 return false;
670 }
671 vtbar &= 0xffff0000;
672
	/* We know that this IOMMU should be at offset 0xa000 from vtbar. */
674 drhd = dmar_find_matched_drhd_unit(pdev);
675 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
676 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
677 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
678 return true;
679 }
680
681 return false;
682}
683
684static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
685{
686 if (!iommu || iommu->drhd->ignored)
687 return true;
688
689 if (dev_is_pci(dev)) {
690 struct pci_dev *pdev = to_pci_dev(dev);
691
692 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
693 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
694 quirk_ioat_snb_local_iommu(pdev))
695 return true;
696 }
697
698 return false;
699}
700
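/*
 * Look up the IOMMU that translates @dev by walking the DRHD device
 * scopes under RCU.  For PCI VFs the lookup is done through the PF.
 * On success the bus/devfn to use for this device is returned through
 * @bus/@devfn.  Returns NULL if the device is not translated by any
 * (non-ignored) IOMMU.
 */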
701struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
702{
703 struct dmar_drhd_unit *drhd = NULL;
704 struct pci_dev *pdev = NULL;
705 struct intel_iommu *iommu;
706 struct device *tmp;
707 u16 segment = 0;
708 int i;
709
710 if (!dev)
711 return NULL;
712
713 if (dev_is_pci(dev)) {
714 struct pci_dev *pf_pdev;
715
716 pdev = pci_real_dma_dev(to_pci_dev(dev));
717
718 /* VFs aren't listed in scope tables; we need to look up
719 * the PF instead to find the IOMMU. */
720 pf_pdev = pci_physfn(pdev);
721 dev = &pf_pdev->dev;
722 segment = pci_domain_nr(pdev->bus);
723 } else if (has_acpi_companion(dev))
724 dev = &ACPI_COMPANION(dev)->dev;
725
726 rcu_read_lock();
727 for_each_iommu(iommu, drhd) {
728 if (pdev && segment != drhd->segment)
729 continue;
730
731 for_each_active_dev_scope(drhd->devices,
732 drhd->devices_cnt, i, tmp) {
733 if (tmp == dev) {
734 /* For a VF use its original BDF# not that of the PF
735 * which we used for the IOMMU lookup. Strictly speaking
736 * we could do this for all PCI devices; we only need to
737 * get the BDF# from the scope table for ACPI matches. */
738 if (pdev && pdev->is_virtfn)
739 goto got_pdev;
740
741 if (bus && devfn) {
742 *bus = drhd->devices[i].bus;
743 *devfn = drhd->devices[i].devfn;
744 }
745 goto out;
746 }
747
748 if (is_downstream_to_pci_bridge(dev, tmp))
749 goto got_pdev;
750 }
751
752 if (pdev && drhd->include_all) {
753got_pdev:
754 if (bus && devfn) {
755 *bus = pdev->bus->number;
756 *devfn = pdev->devfn;
757 }
758 goto out;
759 }
760 }
761 iommu = NULL;
762out:
763 if (iommu_is_dummy(iommu, dev))
764 iommu = NULL;
765
766 rcu_read_unlock();
767
768 return iommu;
769}
770
771static void domain_flush_cache(struct dmar_domain *domain,
772 void *addr, int size)
773{
774 if (!domain->iommu_coherency)
775 clflush_cache_range(addr, size);
776}
777
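/*
 * Free every context table referenced by the root table (both halves
 * when scalable mode is supported), then the root table itself.
 */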
778static void free_context_table(struct intel_iommu *iommu)
779{
780 struct context_entry *context;
781 int i;
782
783 if (!iommu->root_entry)
784 return;
785
786 for (i = 0; i < ROOT_ENTRY_NR; i++) {
787 context = iommu_context_addr(iommu, i, 0, 0);
788 if (context)
789 free_pgtable_page(context);
790
791 if (!sm_supported(iommu))
792 continue;
793
794 context = iommu_context_addr(iommu, i, 0x80, 0);
795 if (context)
796 free_pgtable_page(context);
797 }
798
799 free_pgtable_page(iommu->root_entry);
800 iommu->root_entry = NULL;
801}
802
803#ifdef CONFIG_DMAR_DEBUG
804static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
805 u8 bus, u8 devfn, struct dma_pte *parent, int level)
806{
807 struct dma_pte *pte;
808 int offset;
809
810 while (1) {
811 offset = pfn_level_offset(pfn, level);
812 pte = &parent[offset];
813 if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
814 pr_info("PTE not present at level %d\n", level);
815 break;
816 }
817
818 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
819
820 if (level == 1)
821 break;
822
823 parent = phys_to_virt(dma_pte_addr(pte));
824 level--;
825 }
826}
827
828void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
829 unsigned long long addr, u32 pasid)
830{
831 struct pasid_dir_entry *dir, *pde;
832 struct pasid_entry *entries, *pte;
833 struct context_entry *ctx_entry;
834 struct root_entry *rt_entry;
835 int i, dir_index, index, level;
836 u8 devfn = source_id & 0xff;
837 u8 bus = source_id >> 8;
838 struct dma_pte *pgtable;
839
840 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
841
842 /* root entry dump */
843 rt_entry = &iommu->root_entry[bus];
844 if (!rt_entry) {
845 pr_info("root table entry is not present\n");
846 return;
847 }
848
849 if (sm_supported(iommu))
850 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
851 rt_entry->hi, rt_entry->lo);
852 else
		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
854
855 /* context entry dump */
856 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
857 if (!ctx_entry) {
858 pr_info("context table entry is not present\n");
859 return;
860 }
861
862 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
863 ctx_entry->hi, ctx_entry->lo);
864
865 /* legacy mode does not require PASID entries */
866 if (!sm_supported(iommu)) {
867 level = agaw_to_level(ctx_entry->hi & 7);
868 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
869 goto pgtable_walk;
870 }
871
872 /* get the pointer to pasid directory entry */
873 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
874 if (!dir) {
875 pr_info("pasid directory entry is not present\n");
876 return;
877 }
878 /* For request-without-pasid, get the pasid from context entry */
879 if (intel_iommu_sm && pasid == INVALID_IOASID)
880 pasid = PASID_RID2PASID;
881
882 dir_index = pasid >> PASID_PDE_SHIFT;
883 pde = &dir[dir_index];
884 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
885
886 /* get the pointer to the pasid table entry */
887 entries = get_pasid_table_from_pde(pde);
888 if (!entries) {
889 pr_info("pasid table entry is not present\n");
890 return;
891 }
892 index = pasid & PASID_PTE_MASK;
893 pte = &entries[index];
894 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
895 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
896
897 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
898 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
899 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
900 } else {
901 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
902 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
903 }
904
905pgtable_walk:
906 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
907}
908#endif
909
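/*
 * Walk the domain's page table to @pfn.  With *target_level == 0 the
 * walk stops at the first superpage or non-present PTE; otherwise any
 * missing intermediate tables are allocated down to *target_level.
 * Returns the PTE (with the level reached in *target_level), or NULL
 * if the pfn is beyond the domain's reach or an allocation fails.
 */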
910static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
912{
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
915 int offset;
916
917 BUG_ON(!domain->pgd);
918
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
921 return NULL;
922
923 parent = domain->pgd;
924
925 while (1) {
926 void *tmp_page;
927
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
931 break;
932 if (level == *target_level)
933 break;
934
935 if (!dma_pte_present(pte)) {
936 uint64_t pteval;
937
938 tmp_page = alloc_pgtable_page(domain->nid);
939
940 if (!tmp_page)
941 return NULL;
942
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain->use_first_level)
946 pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
947
948 if (cmpxchg64(&pte->val, 0ULL, pteval))
949 /* Someone else set it while we were thinking; use theirs. */
950 free_pgtable_page(tmp_page);
951 else
952 domain_flush_cache(domain, pte, sizeof(*pte));
953 }
954 if (level == 1)
955 break;
956
957 parent = phys_to_virt(dma_pte_addr(pte));
958 level--;
959 }
960
961 if (!*target_level)
962 *target_level = level;
963
964 return pte;
965}
966
/* Return the address's PTE at a specific level. */
968static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 unsigned long pfn,
970 int level, int *large_page)
971{
972 struct dma_pte *parent, *pte;
973 int total = agaw_to_level(domain->agaw);
974 int offset;
975
976 parent = domain->pgd;
977 while (level <= total) {
978 offset = pfn_level_offset(pfn, total);
979 pte = &parent[offset];
980 if (level == total)
981 return pte;
982
983 if (!dma_pte_present(pte)) {
984 *large_page = total;
985 break;
986 }
987
988 if (dma_pte_superpage(pte)) {
989 *large_page = total;
990 return pte;
991 }
992
993 parent = phys_to_virt(dma_pte_addr(pte));
994 total--;
995 }
996 return NULL;
997}
998
/* Clear last-level (leaf) PTEs; an IOTLB flush must follow. */
1000static void dma_pte_clear_range(struct dmar_domain *domain,
1001 unsigned long start_pfn,
1002 unsigned long last_pfn)
1003{
1004 unsigned int large_page;
1005 struct dma_pte *first_pte, *pte;
1006
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1010
1011 /* we don't need lock here; nobody else touches the iova range */
1012 do {
1013 large_page = 1;
1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 if (!pte) {
1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017 continue;
1018 }
1019 do {
1020 dma_clear_pte(pte);
1021 start_pfn += lvl_to_nr_pages(large_page);
1022 pte++;
1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025 domain_flush_cache(domain, first_pte,
1026 (void *)pte - (void *)first_pte);
1027
1028 } while (start_pfn && start_pfn <= last_pfn);
1029}
1030
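/*
 * Recursively clear and free page-table pages below @retain_level that
 * are entirely covered by the [start_pfn, last_pfn] range.
 */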
1031static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 int retain_level, struct dma_pte *pte,
1033 unsigned long pfn, unsigned long start_pfn,
1034 unsigned long last_pfn)
1035{
1036 pfn = max(start_pfn, pfn);
1037 pte = &pte[pfn_level_offset(pfn, level)];
1038
1039 do {
1040 unsigned long level_pfn;
1041 struct dma_pte *level_pte;
1042
1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044 goto next;
1045
1046 level_pfn = pfn & level_mask(level);
1047 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049 if (level > 2) {
1050 dma_pte_free_level(domain, level - 1, retain_level,
1051 level_pte, level_pfn, start_pfn,
1052 last_pfn);
1053 }
1054
1055 /*
1056 * Free the page table if we're below the level we want to
1057 * retain and the range covers the entire table.
1058 */
1059 if (level < retain_level && !(start_pfn > level_pfn ||
1060 last_pfn < level_pfn + level_size(level) - 1)) {
1061 dma_clear_pte(pte);
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 free_pgtable_page(level_pte);
1064 }
1065next:
1066 pfn += level_size(level);
1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068}
1069
1070/*
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1073 */
1074static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn,
1077 int retain_level)
1078{
1079 dma_pte_clear_range(domain, start_pfn, last_pfn);
1080
1081 /* We don't need lock here; nobody else touches the iova range */
1082 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1083 domain->pgd, 0, start_pfn, last_pfn);
1084
1085 /* free pgd */
1086 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1087 free_pgtable_page(domain->pgd);
1088 domain->pgd = NULL;
1089 }
1090}
1091
1092/* When a page at a given level is being unlinked from its parent, we don't
1093 need to *modify* it at all. All we need to do is make a list of all the
1094 pages which can be freed just as soon as we've flushed the IOTLB and we
1095 know the hardware page-walk will no longer touch them.
1096 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1097 be freed. */
1098static void dma_pte_list_pagetables(struct dmar_domain *domain,
1099 int level, struct dma_pte *pte,
1100 struct list_head *freelist)
1101{
1102 struct page *pg;
1103
1104 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1105 list_add_tail(&pg->lru, freelist);
1106
1107 if (level == 1)
1108 return;
1109
1110 pte = page_address(pg);
1111 do {
1112 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1113 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 pte++;
1115 } while (!first_pte_in_page(pte));
1116}
1117
1118static void dma_pte_clear_level(struct dmar_domain *domain, int level,
1119 struct dma_pte *pte, unsigned long pfn,
1120 unsigned long start_pfn, unsigned long last_pfn,
1121 struct list_head *freelist)
1122{
1123 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1124
1125 pfn = max(start_pfn, pfn);
1126 pte = &pte[pfn_level_offset(pfn, level)];
1127
1128 do {
1129 unsigned long level_pfn = pfn & level_mask(level);
1130
1131 if (!dma_pte_present(pte))
1132 goto next;
1133
1134 /* If range covers entire pagetable, free it */
1135 if (start_pfn <= level_pfn &&
1136 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away
			   entirely. Don't bother to clear them; we're
			   just going to *free* them. */
1139 if (level > 1 && !dma_pte_superpage(pte))
1140 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1141
1142 dma_clear_pte(pte);
1143 if (!first_pte)
1144 first_pte = pte;
1145 last_pte = pte;
1146 } else if (level > 1) {
1147 /* Recurse down into a level that isn't *entirely* obsolete */
1148 dma_pte_clear_level(domain, level - 1,
1149 phys_to_virt(dma_pte_addr(pte)),
1150 level_pfn, start_pfn, last_pfn,
1151 freelist);
1152 }
1153next:
1154 pfn = level_pfn + level_size(level);
1155 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1156
1157 if (first_pte)
1158 domain_flush_cache(domain, first_pte,
1159 (void *)++last_pte - (void *)first_pte);
1160}
1161
1162/* We can't just free the pages because the IOMMU may still be walking
1163 the page tables, and may have cached the intermediate levels. The
1164 pages can only be freed after the IOTLB flush has been done. */
1165static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
1166 unsigned long last_pfn, struct list_head *freelist)
1167{
1168 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1169 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1170 BUG_ON(start_pfn > last_pfn);
1171
1172 /* we don't need lock here; nobody else touches the iova range */
1173 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1174 domain->pgd, 0, start_pfn, last_pfn, freelist);
1175
1176 /* free pgd */
1177 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1178 struct page *pgd_page = virt_to_page(domain->pgd);
1179 list_add_tail(&pgd_page->lru, freelist);
1180 domain->pgd = NULL;
1181 }
1182}
1183
1184/* iommu handling */
1185static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1186{
1187 struct root_entry *root;
1188
1189 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1190 if (!root) {
1191 pr_err("Allocating root entry for %s failed\n",
1192 iommu->name);
1193 return -ENOMEM;
1194 }
1195
1196 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1197 iommu->root_entry = root;
1198
1199 return 0;
1200}
1201
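/*
 * Program the root table address into DMAR_RTADDR_REG and issue the Set
 * Root Table Pointer operation.  Unless the hardware invalidates its
 * caches as part of SRTP (ESRTPS), follow up with global context-cache,
 * PASID-cache and IOTLB invalidations.
 */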
1202static void iommu_set_root_entry(struct intel_iommu *iommu)
1203{
1204 u64 addr;
1205 u32 sts;
1206 unsigned long flag;
1207
1208 addr = virt_to_phys(iommu->root_entry);
1209 if (sm_supported(iommu))
1210 addr |= DMA_RTADDR_SMT;
1211
1212 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1213 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1214
1215 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1216
	/* Make sure hardware completes it */
1218 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1219 readl, (sts & DMA_GSTS_RTPS), sts);
1220
1221 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222
1223 /*
1224 * Hardware invalidates all DMA remapping hardware translation
1225 * caches as part of SRTP flow.
1226 */
1227 if (cap_esrtps(iommu->cap))
1228 return;
1229
1230 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1231 if (sm_supported(iommu))
1232 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1233 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1234}
1235
1236void iommu_flush_write_buffer(struct intel_iommu *iommu)
1237{
1238 u32 val;
1239 unsigned long flag;
1240
1241 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1242 return;
1243
1244 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1245 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1246
	/* Make sure hardware completes it */
1248 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1249 readl, (!(val & DMA_GSTS_WBFS)), val);
1250
1251 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1252}
1253
/* Invalidate context-cache entries through the context command register. */
1255static void __iommu_flush_context(struct intel_iommu *iommu,
1256 u16 did, u16 source_id, u8 function_mask,
1257 u64 type)
1258{
1259 u64 val = 0;
1260 unsigned long flag;
1261
1262 switch (type) {
1263 case DMA_CCMD_GLOBAL_INVL:
1264 val = DMA_CCMD_GLOBAL_INVL;
1265 break;
1266 case DMA_CCMD_DOMAIN_INVL:
1267 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1268 break;
1269 case DMA_CCMD_DEVICE_INVL:
1270 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1271 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1272 break;
1273 default:
1274 BUG();
1275 }
1276 val |= DMA_CCMD_ICC;
1277
1278 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1279 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1280
	/* Make sure hardware completes it */
1282 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1283 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1284
1285 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1286}
1287
/* Invalidate IOTLB entries through the IOTLB invalidation register. */
1289static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1290 u64 addr, unsigned int size_order, u64 type)
1291{
1292 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1293 u64 val = 0, val_iva = 0;
1294 unsigned long flag;
1295
1296 switch (type) {
1297 case DMA_TLB_GLOBAL_FLUSH:
		/* A global flush doesn't need to set IVA_REG */
1299 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1300 break;
1301 case DMA_TLB_DSI_FLUSH:
1302 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303 break;
1304 case DMA_TLB_PSI_FLUSH:
1305 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1306 /* IH bit is passed in as part of address */
1307 val_iva = size_order | addr;
1308 break;
1309 default:
1310 BUG();
1311 }
1312 /* Note: set drain read/write */
#if 0
	/*
	 * Read drain is probably only needed to be extra safe; it looks
	 * like we can skip it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
1321 if (cap_write_drain(iommu->cap))
1322 val |= DMA_TLB_WRITE_DRAIN;
1323
1324 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1325 /* Note: Only uses first TLB reg currently */
1326 if (val_iva)
1327 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1328 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1329
	/* Make sure hardware completes it */
1331 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1332 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1333
1334 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1335
1336 /* check IOTLB invalidation granularity */
1337 if (DMA_TLB_IAIG(val) == 0)
1338 pr_err("Flush IOTLB failed\n");
1339 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1340 pr_debug("TLB flush request %Lx, actual %Lx\n",
1341 (unsigned long long)DMA_TLB_IIRG(type),
1342 (unsigned long long)DMA_TLB_IAIG(val));
1343}
1344
1345static struct device_domain_info *
1346domain_lookup_dev_info(struct dmar_domain *domain,
1347 struct intel_iommu *iommu, u8 bus, u8 devfn)
1348{
1349 struct device_domain_info *info;
1350 unsigned long flags;
1351
1352 spin_lock_irqsave(&domain->lock, flags);
1353 list_for_each_entry(info, &domain->devices, link) {
1354 if (info->iommu == iommu && info->bus == bus &&
1355 info->devfn == devfn) {
1356 spin_unlock_irqrestore(&domain->lock, flags);
1357 return info;
1358 }
1359 }
1360 spin_unlock_irqrestore(&domain->lock, flags);
1361
1362 return NULL;
1363}
1364
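/*
 * Recompute domain->has_iotlb_device: true when at least one device
 * attached to the domain has ATS (device IOTLB) enabled.
 */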
1365static void domain_update_iotlb(struct dmar_domain *domain)
1366{
1367 struct device_domain_info *info;
1368 bool has_iotlb_device = false;
1369 unsigned long flags;
1370
1371 spin_lock_irqsave(&domain->lock, flags);
1372 list_for_each_entry(info, &domain->devices, link) {
1373 if (info->ats_enabled) {
1374 has_iotlb_device = true;
1375 break;
1376 }
1377 }
1378 domain->has_iotlb_device = has_iotlb_device;
1379 spin_unlock_irqrestore(&domain->lock, flags);
1380}
1381
1382/*
1383 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1384 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1385 * check because it applies only to the built-in QAT devices and it doesn't
1386 * grant additional privileges.
1387 */
1388#define BUGGY_QAT_DEVID_MASK 0x4940
1389static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1390{
1391 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1392 return false;
1393
1394 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1395 return false;
1396
1397 return true;
1398}
1399
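/*
 * Enable the PASID, PRI and ATS capabilities on @info->dev where both
 * the device and its IOMMU support them, and record the PFSID used for
 * device IOTLB invalidation.
 */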
1400static void iommu_enable_pci_caps(struct device_domain_info *info)
1401{
1402 struct pci_dev *pdev;
1403
1404 if (!dev_is_pci(info->dev))
1405 return;
1406
1407 pdev = to_pci_dev(info->dev);
	/* For IOMMUs that support device IOTLB throttling (DIT), we assign
	 * the PFSID to the invalidation descriptor of a VF so that the IOMMU
	 * HW can gauge queue depth at the PF level. If DIT is not set, PFSID
	 * is treated as reserved and should be set to 0.
	 */
1413 if (!ecap_dit(info->iommu->ecap))
1414 info->pfsid = 0;
1415 else {
1416 struct pci_dev *pf_pdev;
1417
1418 /* pdev will be returned if device is not a vf */
1419 pf_pdev = pci_physfn(pdev);
1420 info->pfsid = pci_dev_id(pf_pdev);
1421 }
1422
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device is undefined if you enable PASID support after ATS
	   support. So always enable PASID support on devices which have
	   it, even if we can't yet know whether we're ever going to
	   use it. */
1428 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1429 info->pasid_enabled = 1;
1430
1431 if (info->pri_supported &&
1432 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1433 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1434 info->pri_enabled = 1;
1435
1436 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1437 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1438 info->ats_enabled = 1;
1439 domain_update_iotlb(info->domain);
1440 info->ats_qdep = pci_ats_queue_depth(pdev);
1441 }
1442}
1443
1444static void iommu_disable_pci_caps(struct device_domain_info *info)
1445{
1446 struct pci_dev *pdev;
1447
1448 if (!dev_is_pci(info->dev))
1449 return;
1450
1451 pdev = to_pci_dev(info->dev);
1452
1453 if (info->ats_enabled) {
1454 pci_disable_ats(pdev);
1455 info->ats_enabled = 0;
1456 domain_update_iotlb(info->domain);
1457 }
1458
1459 if (info->pri_enabled) {
1460 pci_disable_pri(pdev);
1461 info->pri_enabled = 0;
1462 }
1463
1464 if (info->pasid_enabled) {
1465 pci_disable_pasid(pdev);
1466 info->pasid_enabled = 0;
1467 }
1468}
1469
1470static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
1471 u64 addr, unsigned int mask)
1472{
1473 u16 sid, qdep;
1474
1475 if (!info || !info->ats_enabled)
1476 return;
1477
1478 sid = info->bus << 8 | info->devfn;
1479 qdep = info->ats_qdep;
1480 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481 qdep, addr, mask);
1482 quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1483}
1484
1485static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1486 u64 addr, unsigned mask)
1487{
1488 struct device_domain_info *info;
1489 unsigned long flags;
1490
1491 if (!domain->has_iotlb_device)
1492 return;
1493
1494 spin_lock_irqsave(&domain->lock, flags);
1495 list_for_each_entry(info, &domain->devices, link)
1496 __iommu_flush_dev_iotlb(info, addr, mask);
1497 spin_unlock_irqrestore(&domain->lock, flags);
1498}
1499
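/*
 * Page-selective IOTLB invalidation for @pages pages starting at @pfn.
 * First-level domains use PASID-based invalidation; otherwise the range
 * is rounded up to a power-of-two mask, falling back to a domain-selective
 * flush when PSI is unsupported or the mask is too large.  Device IOTLBs
 * are flushed as well, except when @map is set in caching mode.
 */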
1500static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1501 struct dmar_domain *domain,
1502 unsigned long pfn, unsigned int pages,
1503 int ih, int map)
1504{
1505 unsigned int aligned_pages = __roundup_pow_of_two(pages);
1506 unsigned int mask = ilog2(aligned_pages);
1507 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1508 u16 did = domain_id_iommu(domain, iommu);
1509
1510 BUG_ON(pages == 0);
1511
1512 if (ih)
1513 ih = 1 << 6;
1514
1515 if (domain->use_first_level) {
1516 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1517 } else {
1518 unsigned long bitmask = aligned_pages - 1;
1519
1520 /*
1521 * PSI masks the low order bits of the base address. If the
1522 * address isn't aligned to the mask, then compute a mask value
1523 * needed to ensure the target range is flushed.
1524 */
1525 if (unlikely(bitmask & pfn)) {
1526 unsigned long end_pfn = pfn + pages - 1, shared_bits;
1527
1528 /*
1529 * Since end_pfn <= pfn + bitmask, the only way bits
1530 * higher than bitmask can differ in pfn and end_pfn is
1531 * by carrying. This means after masking out bitmask,
1532 * high bits starting with the first set bit in
1533 * shared_bits are all equal in both pfn and end_pfn.
1534 */
1535 shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1536 mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1537 }
1538
1539 /*
1540 * Fallback to domain selective flush if no PSI support or
1541 * the size is too big.
1542 */
1543 if (!cap_pgsel_inv(iommu->cap) ||
1544 mask > cap_max_amask_val(iommu->cap))
1545 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1546 DMA_TLB_DSI_FLUSH);
1547 else
1548 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1549 DMA_TLB_PSI_FLUSH);
1550 }
1551
	/*
	 * In caching mode, changing pages from non-present to present
	 * requires a flush. However, the device IOTLB doesn't need to be
	 * flushed in this case.
	 */
1556 if (!cap_caching_mode(iommu->cap) || !map)
1557 iommu_flush_dev_iotlb(domain, addr, mask);
1558}
1559
1560/* Notification for newly created mappings */
1561static inline void __mapping_notify_one(struct intel_iommu *iommu,
1562 struct dmar_domain *domain,
1563 unsigned long pfn, unsigned int pages)
1564{
1565 /*
1566 * It's a non-present to present mapping. Only flush if caching mode
1567 * and second level.
1568 */
1569 if (cap_caching_mode(iommu->cap) && !domain->use_first_level)
1570 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1571 else
1572 iommu_flush_write_buffer(iommu);
1573}
1574
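/*
 * Flush the full IOTLB of every IOMMU the domain is attached to, plus
 * the device IOTLBs when not running in caching mode.
 */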
1575static void intel_flush_iotlb_all(struct iommu_domain *domain)
1576{
1577 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1578 struct iommu_domain_info *info;
1579 unsigned long idx;
1580
1581 xa_for_each(&dmar_domain->iommu_array, idx, info) {
1582 struct intel_iommu *iommu = info->iommu;
1583 u16 did = domain_id_iommu(dmar_domain, iommu);
1584
1585 if (dmar_domain->use_first_level)
1586 qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1587 else
1588 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1589 DMA_TLB_DSI_FLUSH);
1590
1591 if (!cap_caching_mode(iommu->cap))
1592 iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1593 }
1594}
1595
1596static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1597{
1598 u32 pmen;
1599 unsigned long flags;
1600
1601 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1602 return;
1603
1604 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1605 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1606 pmen &= ~DMA_PMEN_EPM;
1607 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1608
1609 /* wait for the protected region status bit to clear */
1610 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1611 readl, !(pmen & DMA_PMEN_PRS), pmen);
1612
1613 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1614}
1615
1616static void iommu_enable_translation(struct intel_iommu *iommu)
1617{
1618 u32 sts;
1619 unsigned long flags;
1620
1621 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 iommu->gcmd |= DMA_GCMD_TE;
1623 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1624
	/* Make sure hardware completes it */
1626 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1627 readl, (sts & DMA_GSTS_TES), sts);
1628
1629 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1630}
1631
1632static void iommu_disable_translation(struct intel_iommu *iommu)
1633{
1634 u32 sts;
1635 unsigned long flag;
1636
1637 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1638 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1639 return;
1640
1641 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1642 iommu->gcmd &= ~DMA_GCMD_TE;
1643 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1644
	/* Make sure hardware completes it */
1646 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1647 readl, (!(sts & DMA_GSTS_TES)), sts);
1648
1649 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1650}
1651
1652static int iommu_init_domains(struct intel_iommu *iommu)
1653{
1654 u32 ndomains;
1655
1656 ndomains = cap_ndoms(iommu->cap);
1657 pr_debug("%s: Number of Domains supported <%d>\n",
1658 iommu->name, ndomains);
1659
1660 spin_lock_init(&iommu->lock);
1661
1662 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1663 if (!iommu->domain_ids)
1664 return -ENOMEM;
1665
1666 /*
1667 * If Caching mode is set, then invalid translations are tagged
1668 * with domain-id 0, hence we need to pre-allocate it. We also
1669 * use domain-id 0 as a marker for non-allocated domain-id, so
1670 * make sure it is not used for a real domain.
1671 */
1672 set_bit(0, iommu->domain_ids);
1673
1674 /*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1676 * entry for first-level or pass-through translation modes should
1677 * be programmed with a domain id different from those used for
1678 * second-level or nested translation. We reserve a domain id for
1679 * this purpose.
1680 */
1681 if (sm_supported(iommu))
1682 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1683
1684 return 0;
1685}
1686
1687static void disable_dmar_iommu(struct intel_iommu *iommu)
1688{
1689 if (!iommu->domain_ids)
1690 return;
1691
1692 /*
1693 * All iommu domains must have been detached from the devices,
1694 * hence there should be no domain IDs in use.
1695 */
1696 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1697 > NUM_RESERVED_DID))
1698 return;
1699
1700 if (iommu->gcmd & DMA_GCMD_TE)
1701 iommu_disable_translation(iommu);
1702}
1703
1704static void free_dmar_iommu(struct intel_iommu *iommu)
1705{
1706 if (iommu->domain_ids) {
1707 bitmap_free(iommu->domain_ids);
1708 iommu->domain_ids = NULL;
1709 }
1710
1711 if (iommu->copied_tables) {
1712 bitmap_free(iommu->copied_tables);
1713 iommu->copied_tables = NULL;
1714 }
1715
1716 /* free context mapping */
1717 free_context_table(iommu);
1718
1719#ifdef CONFIG_INTEL_IOMMU_SVM
1720 if (pasid_supported(iommu)) {
1721 if (ecap_prs(iommu->ecap))
1722 intel_svm_finish_prq(iommu);
1723 }
1724 if (vccap_pasid(iommu->vccap))
1725 ioasid_unregister_allocator(&iommu->pasid_allocator);
1726
1727#endif
1728}
1729
1730/*
1731 * Check and return whether first level is used by default for
1732 * DMA translation.
1733 */
1734static bool first_level_by_default(unsigned int type)
1735{
1736 /* Only SL is available in legacy mode */
1737 if (!scalable_mode_support())
1738 return false;
1739
	/* Only one level (either FL or SL) is available; just use it */
1741 if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1742 return intel_cap_flts_sanity();
1743
1744 /* Both levels are available, decide it based on domain type */
1745 return type != IOMMU_DOMAIN_UNMANAGED;
1746}
1747
1748static struct dmar_domain *alloc_domain(unsigned int type)
1749{
1750 struct dmar_domain *domain;
1751
1752 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1753 if (!domain)
1754 return NULL;
1755
1756 domain->nid = NUMA_NO_NODE;
1757 if (first_level_by_default(type))
1758 domain->use_first_level = true;
1759 domain->has_iotlb_device = false;
1760 INIT_LIST_HEAD(&domain->devices);
1761 spin_lock_init(&domain->lock);
1762 xa_init(&domain->iommu_array);
1763
1764 return domain;
1765}
1766
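/*
 * Bind @domain to @iommu: if the domain is already attached just take a
 * reference, otherwise allocate a domain id on this IOMMU, record the
 * binding in domain->iommu_array and refresh the domain capabilities.
 */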
1767static int domain_attach_iommu(struct dmar_domain *domain,
1768 struct intel_iommu *iommu)
1769{
1770 struct iommu_domain_info *info, *curr;
1771 unsigned long ndomains;
1772 int num, ret = -ENOSPC;
1773
1774 info = kzalloc(sizeof(*info), GFP_KERNEL);
1775 if (!info)
1776 return -ENOMEM;
1777
1778 spin_lock(&iommu->lock);
1779 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1780 if (curr) {
1781 curr->refcnt++;
1782 spin_unlock(&iommu->lock);
1783 kfree(info);
1784 return 0;
1785 }
1786
1787 ndomains = cap_ndoms(iommu->cap);
1788 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1789 if (num >= ndomains) {
1790 pr_err("%s: No free domain ids\n", iommu->name);
1791 goto err_unlock;
1792 }
1793
1794 set_bit(num, iommu->domain_ids);
1795 info->refcnt = 1;
1796 info->did = num;
1797 info->iommu = iommu;
1798 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1799 NULL, info, GFP_ATOMIC);
1800 if (curr) {
1801 ret = xa_err(curr) ? : -EBUSY;
1802 goto err_clear;
1803 }
1804 domain_update_iommu_cap(domain);
1805
1806 spin_unlock(&iommu->lock);
1807 return 0;
1808
1809err_clear:
1810 clear_bit(info->did, iommu->domain_ids);
1811err_unlock:
1812 spin_unlock(&iommu->lock);
1813 kfree(info);
1814 return ret;
1815}
1816
1817static void domain_detach_iommu(struct dmar_domain *domain,
1818 struct intel_iommu *iommu)
1819{
1820 struct iommu_domain_info *info;
1821
1822 spin_lock(&iommu->lock);
1823 info = xa_load(&domain->iommu_array, iommu->seq_id);
1824 if (--info->refcnt == 0) {
1825 clear_bit(info->did, iommu->domain_ids);
1826 xa_erase(&domain->iommu_array, iommu->seq_id);
1827 domain->nid = NUMA_NO_NODE;
1828 domain_update_iommu_cap(domain);
1829 kfree(info);
1830 }
1831 spin_unlock(&iommu->lock);
1832}
1833
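/*
 * Round a guest address width up to the nearest adjusted guest address
 * width supported by the page-table layout (12 + 9 * n bits, e.g. 39,
 * 48 or 57), capped at 64.  For example, gaw = 50 yields 57, while
 * gaw = 48 is returned unchanged.
 */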
1834static inline int guestwidth_to_adjustwidth(int gaw)
1835{
1836 int agaw;
1837 int r = (gaw - 12) % 9;
1838
1839 if (r == 0)
1840 agaw = gaw;
1841 else
1842 agaw = gaw + 9 - r;
1843 if (agaw > 64)
1844 agaw = 64;
1845 return agaw;
1846}
1847
1848static void domain_exit(struct dmar_domain *domain)
1849{
1850 if (domain->pgd) {
1851 LIST_HEAD(freelist);
1852
1853 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1854 put_pages_list(&freelist);
1855 }
1856
1857 if (WARN_ON(!list_empty(&domain->devices)))
1858 return;
1859
1860 kfree(domain);
1861}
1862
1863/*
1864 * Get the PASID directory size for scalable mode context entry.
1865 * Value of X in the PDTS field of a scalable mode context entry
1866 * indicates PASID directory with 2^(X + 7) entries.
1867 */
1868static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1869{
1870 unsigned long pds, max_pde;
1871
1872 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1873 pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1874 if (pds < 7)
1875 return 0;
1876
1877 return pds - 7;
1878}
1879
1880/*
1881 * Set the RID_PASID field of a scalable mode context entry. The
1882 * IOMMU hardware will use the PASID value set in this field for
1883 * DMA translations of DMA requests without PASID.
1884 */
1885static inline void
1886context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1887{
1888 context->hi |= pasid & ((1 << 20) - 1);
1889}
1890
1891/*
1892 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1893 * entry.
1894 */
1895static inline void context_set_sm_dte(struct context_entry *context)
1896{
1897 context->lo |= (1 << 2);
1898}
1899
1900/*
1901 * Set the PRE(Page Request Enable) field of a scalable mode context
1902 * entry.
1903 */
1904static inline void context_set_sm_pre(struct context_entry *context)
1905{
1906 context->lo |= (1 << 4);
1907}
1908
1909/* Convert value to context PASID directory size field coding. */
1910#define context_pdts(pds) (((pds) & 0x7) << 9)
1911
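/*
 * Install the context entry for (@bus, @devfn) on @iommu.  In scalable
 * mode the entry points to the PASID directory; in legacy mode it points
 * to the second-level page table (or is set up for pass-through).  The
 * context-cache/IOTLB flushes required by caching mode are issued here.
 */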
1912static int domain_context_mapping_one(struct dmar_domain *domain,
1913 struct intel_iommu *iommu,
1914 struct pasid_table *table,
1915 u8 bus, u8 devfn)
1916{
1917 struct device_domain_info *info =
1918 domain_lookup_dev_info(domain, iommu, bus, devfn);
1919 u16 did = domain_id_iommu(domain, iommu);
1920 int translation = CONTEXT_TT_MULTI_LEVEL;
1921 struct context_entry *context;
1922 int ret;
1923
1924 WARN_ON(did == 0);
1925
1926 if (hw_pass_through && domain_type_is_si(domain))
1927 translation = CONTEXT_TT_PASS_THROUGH;
1928
1929 pr_debug("Set context mapping for %02x:%02x.%d\n",
1930 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1931
1932 BUG_ON(!domain->pgd);
1933
1934 spin_lock(&iommu->lock);
1935 ret = -ENOMEM;
1936 context = iommu_context_addr(iommu, bus, devfn, 1);
1937 if (!context)
1938 goto out_unlock;
1939
1940 ret = 0;
1941 if (context_present(context) && !context_copied(iommu, bus, devfn))
1942 goto out_unlock;
1943
	/*
	 * For kdump cases, old valid entries may be cached due to in-flight
	 * DMA and the copied page table, but there is no unmapping behaviour
	 * for them; thus we need an explicit cache flush for the newly-mapped
	 * device. At this point the device is supposed to have finished reset
	 * at its driver probe stage, so no in-flight DMA will exist and we
	 * don't need to worry about it hereafter.
	 */
1953 if (context_copied(iommu, bus, devfn)) {
1954 u16 did_old = context_domain_id(context);
1955
1956 if (did_old < cap_ndoms(iommu->cap)) {
1957 iommu->flush.flush_context(iommu, did_old,
1958 (((u16)bus) << 8) | devfn,
1959 DMA_CCMD_MASK_NOBIT,
1960 DMA_CCMD_DEVICE_INVL);
1961 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1962 DMA_TLB_DSI_FLUSH);
1963 }
1964
1965 clear_context_copied(iommu, bus, devfn);
1966 }
1967
1968 context_clear_entry(context);
1969
1970 if (sm_supported(iommu)) {
1971 unsigned long pds;
1972
1973 WARN_ON(!table);
1974
1975 /* Setup the PASID DIR pointer: */
1976 pds = context_get_sm_pds(table);
1977 context->lo = (u64)virt_to_phys(table->table) |
1978 context_pdts(pds);
1979
1980 /* Setup the RID_PASID field: */
1981 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1982
1983 /*
1984 * Setup the Device-TLB enable bit and Page request
1985 * Enable bit:
1986 */
1987 if (info && info->ats_supported)
1988 context_set_sm_dte(context);
1989 if (info && info->pri_supported)
1990 context_set_sm_pre(context);
1991 if (info && info->pasid_supported)
1992 context_set_pasid(context);
1993 } else {
1994 struct dma_pte *pgd = domain->pgd;
1995 int agaw;
1996
1997 context_set_domain_id(context, did);
1998
1999 if (translation != CONTEXT_TT_PASS_THROUGH) {
2000 /*
2001 * Skip top levels of page tables for iommu which has
2002 * less agaw than default. Unnecessary for PT mode.
2003 */
2004 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2005 ret = -ENOMEM;
2006 pgd = phys_to_virt(dma_pte_addr(pgd));
2007 if (!dma_pte_present(pgd))
2008 goto out_unlock;
2009 }
2010
2011 if (info && info->ats_supported)
2012 translation = CONTEXT_TT_DEV_IOTLB;
2013 else
2014 translation = CONTEXT_TT_MULTI_LEVEL;
2015
2016 context_set_address_root(context, virt_to_phys(pgd));
2017 context_set_address_width(context, agaw);
2018 } else {
2019 /*
2020 * In pass through mode, AW must be programmed to
2021 * indicate the largest AGAW value supported by
2022 * hardware. And ASR is ignored by hardware.
2023 */
2024 context_set_address_width(context, iommu->msagaw);
2025 }
2026
2027 context_set_translation_type(context, translation);
2028 }
2029
2030 context_set_fault_enable(context);
2031 context_set_present(context);
2032 if (!ecap_coherent(iommu->ecap))
2033 clflush_cache_range(context, sizeof(*context));
2034
	/*
	 * It's a non-present to present mapping. If the hardware doesn't
	 * cache non-present entries we only need to flush the write buffer.
	 * If it _does_ cache non-present entries, then it does so in the
	 * special domain #0, which we have to flush:
	 */
2041 if (cap_caching_mode(iommu->cap)) {
2042 iommu->flush.flush_context(iommu, 0,
2043 (((u16)bus) << 8) | devfn,
2044 DMA_CCMD_MASK_NOBIT,
2045 DMA_CCMD_DEVICE_INVL);
2046 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2047 } else {
2048 iommu_flush_write_buffer(iommu);
2049 }
2050
2051 ret = 0;
2052
2053out_unlock:
2054 spin_unlock(&iommu->lock);
2055
2056 return ret;
2057}
2058
2059struct domain_context_mapping_data {
2060 struct dmar_domain *domain;
2061 struct intel_iommu *iommu;
2062 struct pasid_table *table;
2063};
2064
2065static int domain_context_mapping_cb(struct pci_dev *pdev,
2066 u16 alias, void *opaque)
2067{
2068 struct domain_context_mapping_data *data = opaque;
2069
2070 return domain_context_mapping_one(data->domain, data->iommu,
2071 data->table, PCI_BUS_NUM(alias),
2072 alias & 0xff);
2073}
2074
2075static int
2076domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077{
2078 struct domain_context_mapping_data data;
2079 struct pasid_table *table;
2080 struct intel_iommu *iommu;
2081 u8 bus, devfn;
2082
2083 iommu = device_to_iommu(dev, &bus, &devfn);
2084 if (!iommu)
2085 return -ENODEV;
2086
2087 table = intel_pasid_get_table(dev);
2088
2089 if (!dev_is_pci(dev))
2090 return domain_context_mapping_one(domain, iommu, table,
2091 bus, devfn);
2092
2093 data.domain = domain;
2094 data.iommu = iommu;
2095 data.table = table;
2096
2097 return pci_for_each_dma_alias(to_pci_dev(dev),
2098 &domain_context_mapping_cb, &data);
2099}
2100
2101/* Returns a number of VTD pages, but aligned to MM page size */
2102static inline unsigned long aligned_nrpages(unsigned long host_addr,
2103 size_t size)
2104{
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2107}
2108
2109/* Return largest possible superpage level for a given mapping */
2110static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2114{
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2117
2118 support = domain->iommu_superpage;
2119
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2125
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2128 if (!pages)
2129 break;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
2131 level++;
2132 support--;
2133 }
2134 return level;
2135}
2136
2137/*
2138 * Ensure that old small page tables are removed to make room for superpage(s).
2139 * We're going to add new large pages, so make sure we don't remove their parent
2140 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2141 */
2142static void switch_to_super_page(struct dmar_domain *domain,
2143 unsigned long start_pfn,
2144 unsigned long end_pfn, int level)
2145{
2146 unsigned long lvl_pages = lvl_to_nr_pages(level);
2147 struct iommu_domain_info *info;
2148 struct dma_pte *pte = NULL;
2149 unsigned long i;
2150
2151 while (start_pfn <= end_pfn) {
2152 if (!pte)
2153 pte = pfn_to_dma_pte(domain, start_pfn, &level);
2154
2155 if (dma_pte_present(pte)) {
2156 dma_pte_free_pagetable(domain, start_pfn,
2157 start_pfn + lvl_pages - 1,
2158 level + 1);
2159
2160 xa_for_each(&domain->iommu_array, i, info)
2161 iommu_flush_iotlb_psi(info->iommu, domain,
2162 start_pfn, lvl_pages,
2163 0, 0);
2164 }
2165
2166 pte++;
2167 start_pfn += lvl_pages;
2168 if (first_pte_in_page(pte))
2169 pte = NULL;
2170 }
2171}
2172
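/*
 * Map @nr_pages pages at IOVA pfn @iov_pfn to physical pfn @phys_pfn
 * with @prot, using the largest superpages that both the hardware and
 * the alignment of the two ranges allow.  The PTEs being written must
 * not already be present.
 */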
2173static int
2174__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2175 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2176{
2177 struct dma_pte *first_pte = NULL, *pte = NULL;
2178 unsigned int largepage_lvl = 0;
2179 unsigned long lvl_pages = 0;
2180 phys_addr_t pteval;
2181 u64 attr;
2182
2183 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2184
2185 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2186 return -EINVAL;
2187
2188 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2189 attr |= DMA_FL_PTE_PRESENT;
2190 if (domain->use_first_level) {
2191 attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2192 if (prot & DMA_PTE_WRITE)
2193 attr |= DMA_FL_PTE_DIRTY;
2194 }
2195
2196 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2197
2198 while (nr_pages > 0) {
2199 uint64_t tmp;
2200
2201 if (!pte) {
2202 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2203 phys_pfn, nr_pages);
2204
2205 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2206 if (!pte)
2207 return -ENOMEM;
2208 first_pte = pte;
2209
2210 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2211
			/* It is a large page */
2213 if (largepage_lvl > 1) {
2214 unsigned long end_pfn;
2215 unsigned long pages_to_remove;
2216
2217 pteval |= DMA_PTE_LARGE_PAGE;
2218 pages_to_remove = min_t(unsigned long, nr_pages,
2219 nr_pte_to_next_page(pte) * lvl_pages);
2220 end_pfn = iov_pfn + pages_to_remove - 1;
2221 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2222 } else {
2223 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2224 }
2225
2226 }
2227 /* We don't need lock here, nobody else
2228 * touches the iova range
2229 */
2230 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2231 if (tmp) {
2232 static int dumps = 5;
2233 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2234 iov_pfn, tmp, (unsigned long long)pteval);
2235 if (dumps) {
2236 dumps--;
2237 debug_dma_dump_mappings(NULL);
2238 }
2239 WARN_ON(1);
2240 }
2241
2242 nr_pages -= lvl_pages;
2243 iov_pfn += lvl_pages;
2244 phys_pfn += lvl_pages;
2245 pteval += lvl_pages * VTD_PAGE_SIZE;
2246
2247 /* If the next PTE would be the first in a new page, then we
2248 * need to flush the cache on the entries we've just written.
2249 * And then we'll need to recalculate 'pte', so clear it and
2250 * let it get set again in the if (!pte) block above.
2251 *
2252 * If we're done (!nr_pages) we need to flush the cache too.
2253 *
2254 * Also if we've been setting superpages, we may need to
2255 * recalculate 'pte' and switch back to smaller pages for the
2256 * end of the mapping, if the trailing size is not enough to
2257 * use another superpage (i.e. nr_pages < lvl_pages).
2258 */
2259 pte++;
2260 if (!nr_pages || first_pte_in_page(pte) ||
2261 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2262 domain_flush_cache(domain, first_pte,
2263 (void *)pte - (void *)first_pte);
2264 pte = NULL;
2265 }
2266 }
2267
2268 return 0;
2269}
2270
2271static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2272{
2273 struct intel_iommu *iommu = info->iommu;
2274 struct context_entry *context;
2275 u16 did_old;
2276
2277 if (!iommu)
2278 return;
2279
2280 spin_lock(&iommu->lock);
2281 context = iommu_context_addr(iommu, bus, devfn, 0);
2282 if (!context) {
2283 spin_unlock(&iommu->lock);
2284 return;
2285 }
2286
2287 if (sm_supported(iommu)) {
2288 if (hw_pass_through && domain_type_is_si(info->domain))
2289 did_old = FLPT_DEFAULT_DID;
2290 else
2291 did_old = domain_id_iommu(info->domain, iommu);
2292 } else {
2293 did_old = context_domain_id(context);
2294 }
2295
2296 context_clear_entry(context);
2297 __iommu_flush_cache(iommu, context, sizeof(*context));
2298 spin_unlock(&iommu->lock);
2299 iommu->flush.flush_context(iommu,
2300 did_old,
2301 (((u16)bus) << 8) | devfn,
2302 DMA_CCMD_MASK_NOBIT,
2303 DMA_CCMD_DEVICE_INVL);
2304
2305 if (sm_supported(iommu))
2306 qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2307
2308 iommu->flush.flush_iotlb(iommu,
2309 did_old,
2310 0,
2311 0,
2312 DMA_TLB_DSI_FLUSH);
2313
2314 __iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2315}
2316
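/*
 * Program a first-level (scalable mode) PASID entry for @dev, skipping
 * top page-table levels when the IOMMU supports a smaller AGAW than the
 * domain.  Only 4- and 5-level page tables are accepted.
 */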
2317static int domain_setup_first_level(struct intel_iommu *iommu,
2318 struct dmar_domain *domain,
2319 struct device *dev,
2320 u32 pasid)
2321{
2322 struct dma_pte *pgd = domain->pgd;
2323 int agaw, level;
2324 int flags = 0;
2325
2326 /*
2327 * Skip top levels of page tables for iommu which has
2328 * less agaw than default. Unnecessary for PT mode.
2329 */
2330 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2331 pgd = phys_to_virt(dma_pte_addr(pgd));
2332 if (!dma_pte_present(pgd))
2333 return -ENOMEM;
2334 }
2335
2336 level = agaw_to_level(agaw);
2337 if (level != 4 && level != 5)
2338 return -EINVAL;
2339
2340 if (pasid != PASID_RID2PASID)
2341 flags |= PASID_FLAG_SUPERVISOR_MODE;
2342 if (level == 5)
2343 flags |= PASID_FLAG_FL5LP;
2344
2345 if (domain->force_snooping)
2346 flags |= PASID_FLAG_PAGE_SNOOP;
2347
2348 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2349 domain_id_iommu(domain, iommu),
2350 flags);
2351}
2352
2353static bool dev_is_real_dma_subdevice(struct device *dev)
2354{
2355 return dev && dev_is_pci(dev) &&
2356 pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2357}
2358
2359static int iommu_domain_identity_map(struct dmar_domain *domain,
2360 unsigned long first_vpfn,
2361 unsigned long last_vpfn)
2362{
	/*
	 * The RMRR range might overlap with the physical memory range;
	 * clear it first.
	 */
2367 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2368
2369 return __domain_mapping(domain, first_vpfn,
2370 first_vpfn, last_vpfn - first_vpfn + 1,
2371 DMA_PTE_READ|DMA_PTE_WRITE);
2372}
2373
2374static int md_domain_init(struct dmar_domain *domain, int guest_width);
2375
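/*
 * Build the static identity (si) domain: unless hardware pass-through
 * is in use, identity-map every usable memory range of each online
 * node as well as all RMRR regions.
 */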
2376static int __init si_domain_init(int hw)
2377{
2378 struct dmar_rmrr_unit *rmrr;
2379 struct device *dev;
2380 int i, nid, ret;
2381
2382 si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2383 if (!si_domain)
2384 return -EFAULT;
2385
2386 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2387 domain_exit(si_domain);
2388 si_domain = NULL;
2389 return -EFAULT;
2390 }
2391
2392 if (hw)
2393 return 0;
2394
2395 for_each_online_node(nid) {
2396 unsigned long start_pfn, end_pfn;
2397 int i;
2398
2399 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2400 ret = iommu_domain_identity_map(si_domain,
2401 mm_to_dma_pfn(start_pfn),
2402 mm_to_dma_pfn(end_pfn));
2403 if (ret)
2404 return ret;
2405 }
2406 }
2407
	/*
	 * Identity map the RMRRs so that devices with RMRRs can also use
	 * the si_domain.
	 */
2412 for_each_rmrr_units(rmrr) {
2413 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2414 i, dev) {
2415 unsigned long long start = rmrr->base_address;
2416 unsigned long long end = rmrr->end_address;
2417
2418 if (WARN_ON(end < start ||
2419 end >> agaw_to_width(si_domain->agaw)))
2420 continue;
2421
2422 ret = iommu_domain_identity_map(si_domain,
2423 mm_to_dma_pfn(start >> PAGE_SHIFT),
2424 mm_to_dma_pfn(end >> PAGE_SHIFT));
2425 if (ret)
2426 return ret;
2427 }
2428 }
2429
2430 return 0;
2431}
2432
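/*
 * Attach @dev to @domain: bind the domain to the device's IOMMU, set up
 * the RID2PASID entry in scalable mode, install the context mapping and
 * enable the PCI ATS/PRI/PASID capabilities where available.
 */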
2433static int dmar_domain_attach_device(struct dmar_domain *domain,
2434 struct device *dev)
2435{
2436 struct device_domain_info *info = dev_iommu_priv_get(dev);
2437 struct intel_iommu *iommu;
2438 unsigned long flags;
2439 u8 bus, devfn;
2440 int ret;
2441
2442 iommu = device_to_iommu(dev, &bus, &devfn);
2443 if (!iommu)
2444 return -ENODEV;
2445
2446 ret = domain_attach_iommu(domain, iommu);
2447 if (ret)
2448 return ret;
2449 info->domain = domain;
2450 spin_lock_irqsave(&domain->lock, flags);
2451 list_add(&info->link, &domain->devices);
2452 spin_unlock_irqrestore(&domain->lock, flags);
2453
2454 /* PASID table is mandatory for a PCI device in scalable mode. */
2455 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2456 /* Setup the PASID entry for requests without PASID: */
2457 if (hw_pass_through && domain_type_is_si(domain))
2458 ret = intel_pasid_setup_pass_through(iommu, domain,
2459 dev, PASID_RID2PASID);
2460 else if (domain->use_first_level)
2461 ret = domain_setup_first_level(iommu, domain, dev,
2462 PASID_RID2PASID);
2463 else
2464 ret = intel_pasid_setup_second_level(iommu, domain,
2465 dev, PASID_RID2PASID);
2466 if (ret) {
2467 dev_err(dev, "Setup RID2PASID failed\n");
2468 device_block_translation(dev);
2469 return ret;
2470 }
2471 }
2472
2473 ret = domain_context_mapping(domain, dev);
2474 if (ret) {
2475 dev_err(dev, "Domain context map failed\n");
2476 device_block_translation(dev);
2477 return ret;
2478 }
2479
2480 iommu_enable_pci_caps(info);
2481
2482 return 0;
2483}
2484
2485static bool device_has_rmrr(struct device *dev)
2486{
2487 struct dmar_rmrr_unit *rmrr;
2488 struct device *tmp;
2489 int i;
2490
2491 rcu_read_lock();
2492 for_each_rmrr_units(rmrr) {
2493 /*
2494 * Return TRUE if this RMRR contains the device that
2495 * is passed in.
2496 */
2497 for_each_active_dev_scope(rmrr->devices,
2498 rmrr->devices_cnt, i, tmp)
2499 if (tmp == dev ||
2500 is_downstream_to_pci_bridge(dev, tmp)) {
2501 rcu_read_unlock();
2502 return true;
2503 }
2504 }
2505 rcu_read_unlock();
2506 return false;
2507}
2508
2509/**
2510 * device_rmrr_is_relaxable - Test whether the RMRR of this device
 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2512 * @dev: device handle
2513 *
2514 * We assume that PCI USB devices with RMRRs have them largely
2515 * for historical reasons and that the RMRR space is not actively used post
2516 * boot. This exclusion may change if vendors begin to abuse it.
2517 *
2518 * The same exception is made for graphics devices, with the requirement that
2519 * any use of the RMRR regions will be torn down before assigning the device
2520 * to a guest.
2521 *
2522 * Return: true if the RMRR is relaxable, false otherwise
2523 */
2524static bool device_rmrr_is_relaxable(struct device *dev)
2525{
2526 struct pci_dev *pdev;
2527
2528 if (!dev_is_pci(dev))
2529 return false;
2530
2531 pdev = to_pci_dev(dev);
2532 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2533 return true;
2534 else
2535 return false;
2536}
2537
2538/*
2539 * There are a couple cases where we need to restrict the functionality of
2540 * devices associated with RMRRs. The first is when evaluating a device for
2541 * identity mapping because problems exist when devices are moved in and out
2542 * of domains and their respective RMRR information is lost. This means that
2543 * a device with associated RMRRs will never be in a "passthrough" domain.
2544 * The second is use of the device through the IOMMU API. This interface
2545 * expects to have full control of the IOVA space for the device. We cannot
2546 * satisfy both the requirement that RMRR access is maintained and have an
2547 * unencumbered IOVA space. We also have no ability to quiesce the device's
2548 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2549 * We therefore prevent devices associated with an RMRR from participating in
2550 * the IOMMU API, which eliminates them from device assignment.
2551 *
2552 * In both cases, devices which have relaxable RMRRs are not concerned by this
2553 * restriction. See device_rmrr_is_relaxable comment.
2554 */
2555static bool device_is_rmrr_locked(struct device *dev)
2556{
2557 if (!device_has_rmrr(dev))
2558 return false;
2559
2560 if (device_rmrr_is_relaxable(dev))
2561 return false;
2562
2563 return true;
2564}
2565
/*
 * Return the required default domain type for a specific device.
 *
 * @dev: the device to query
 *
 * Returns:
 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
 * - 0: both identity and dynamic domains work for this device
 */
2577static int device_def_domain_type(struct device *dev)
2578{
2579 if (dev_is_pci(dev)) {
2580 struct pci_dev *pdev = to_pci_dev(dev);
2581
2582 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2583 return IOMMU_DOMAIN_IDENTITY;
2584
2585 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2586 return IOMMU_DOMAIN_IDENTITY;
2587 }
2588
2589 return 0;
2590}
2591
2592static void intel_iommu_init_qi(struct intel_iommu *iommu)
2593{
2594 /*
2595	 * Start from a sane IOMMU hardware state.
2596	 * If queued invalidation was already initialized by us (for
2597	 * example, while enabling interrupt remapping), then things
2598	 * are already rolling from a sane state.
2599 */
2600 if (!iommu->qi) {
2601 /*
2602 * Clear any previous faults.
2603 */
2604 dmar_fault(-1, iommu);
2605 /*
2606 * Disable queued invalidation if supported and already enabled
2607 * before OS handover.
2608 */
2609 dmar_disable_qi(iommu);
2610 }
2611
2612 if (dmar_enable_qi(iommu)) {
2613 /*
2614		 * Queued invalidation is not enabled; use register-based invalidation.
2615 */
2616 iommu->flush.flush_context = __iommu_flush_context;
2617 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2618 pr_info("%s: Using Register based invalidation\n",
2619 iommu->name);
2620 } else {
2621 iommu->flush.flush_context = qi_flush_context;
2622 iommu->flush.flush_iotlb = qi_flush_iotlb;
2623 pr_info("%s: Using Queued invalidation\n", iommu->name);
2624 }
2625}
2626
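/*
 * Copy the context table for one bus from the old kernel's root entry (e.g.
 * on a kdump boot) into a freshly allocated table. In the extended/scalable
 * root-table format a bus uses two context tables: entries for devfn
 * 0x00-0x7f come from the lower context-table pointer, entries for devfn
 * 0x80-0xff from the upper one. Domain IDs of present entries are reserved
 * in iommu->domain_ids and each copied entry is flagged via
 * set_context_copied().
 */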
2627static int copy_context_table(struct intel_iommu *iommu,
2628 struct root_entry *old_re,
2629 struct context_entry **tbl,
2630 int bus, bool ext)
2631{
2632 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2633 struct context_entry *new_ce = NULL, ce;
2634 struct context_entry *old_ce = NULL;
2635 struct root_entry re;
2636 phys_addr_t old_ce_phys;
2637
2638 tbl_idx = ext ? bus * 2 : bus;
2639 memcpy(&re, old_re, sizeof(re));
2640
2641 for (devfn = 0; devfn < 256; devfn++) {
2642 /* First calculate the correct index */
2643 idx = (ext ? devfn * 2 : devfn) % 256;
2644
2645 if (idx == 0) {
2646 /* First save what we may have and clean up */
2647 if (new_ce) {
2648 tbl[tbl_idx] = new_ce;
2649 __iommu_flush_cache(iommu, new_ce,
2650 VTD_PAGE_SIZE);
2651 pos = 1;
2652 }
2653
2654 if (old_ce)
2655 memunmap(old_ce);
2656
2657 ret = 0;
2658 if (devfn < 0x80)
2659 old_ce_phys = root_entry_lctp(&re);
2660 else
2661 old_ce_phys = root_entry_uctp(&re);
2662
2663 if (!old_ce_phys) {
2664 if (ext && devfn == 0) {
2665 /* No LCTP, try UCTP */
2666 devfn = 0x7f;
2667 continue;
2668 } else {
2669 goto out;
2670 }
2671 }
2672
2673 ret = -ENOMEM;
2674 old_ce = memremap(old_ce_phys, PAGE_SIZE,
2675 MEMREMAP_WB);
2676 if (!old_ce)
2677 goto out;
2678
2679 new_ce = alloc_pgtable_page(iommu->node);
2680 if (!new_ce)
2681 goto out_unmap;
2682
2683 ret = 0;
2684 }
2685
2686 /* Now copy the context entry */
2687 memcpy(&ce, old_ce + idx, sizeof(ce));
2688
2689 if (!context_present(&ce))
2690 continue;
2691
2692 did = context_domain_id(&ce);
2693 if (did >= 0 && did < cap_ndoms(iommu->cap))
2694 set_bit(did, iommu->domain_ids);
2695
2696 set_context_copied(iommu, bus, devfn);
2697 new_ce[idx] = ce;
2698 }
2699
2700 tbl[tbl_idx + pos] = new_ce;
2701
2702 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2703
2704out_unmap:
2705 memunmap(old_ce);
2706
2707out:
2708 return ret;
2709}
2710
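/*
 * Copy the root and context tables left behind by the previous kernel while
 * translation is still enabled. Bail out if the old and new root-table
 * formats (legacy vs. scalable) differ, since changing that setting would
 * require disabling translation. On success the copied context tables are
 * installed in this kernel's root_entry table under iommu->lock.
 */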
2711static int copy_translation_tables(struct intel_iommu *iommu)
2712{
2713 struct context_entry **ctxt_tbls;
2714 struct root_entry *old_rt;
2715 phys_addr_t old_rt_phys;
2716 int ctxt_table_entries;
2717 u64 rtaddr_reg;
2718 int bus, ret;
2719 bool new_ext, ext;
2720
2721 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2722 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2723 new_ext = !!sm_supported(iommu);
2724
2725 /*
2726 * The RTT bit can only be changed when translation is disabled,
2727	 * but disabling translation would open a window for data
2728 * corruption. So bail out and don't copy anything if we would
2729 * have to change the bit.
2730 */
2731 if (new_ext != ext)
2732 return -EINVAL;
2733
2734 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2735 if (!iommu->copied_tables)
2736 return -ENOMEM;
2737
2738 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2739 if (!old_rt_phys)
2740 return -EINVAL;
2741
2742 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2743 if (!old_rt)
2744 return -ENOMEM;
2745
2746 /* This is too big for the stack - allocate it from slab */
2747 ctxt_table_entries = ext ? 512 : 256;
2748 ret = -ENOMEM;
2749 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2750 if (!ctxt_tbls)
2751 goto out_unmap;
2752
2753 for (bus = 0; bus < 256; bus++) {
2754 ret = copy_context_table(iommu, &old_rt[bus],
2755 ctxt_tbls, bus, ext);
2756 if (ret) {
2757 pr_err("%s: Failed to copy context table for bus %d\n",
2758 iommu->name, bus);
2759 continue;
2760 }
2761 }
2762
2763 spin_lock(&iommu->lock);
2764
2765 /* Context tables are copied, now write them to the root_entry table */
2766 for (bus = 0; bus < 256; bus++) {
2767 int idx = ext ? bus * 2 : bus;
2768 u64 val;
2769
2770 if (ctxt_tbls[idx]) {
2771 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2772 iommu->root_entry[bus].lo = val;
2773 }
2774
2775 if (!ext || !ctxt_tbls[idx + 1])
2776 continue;
2777
2778 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2779 iommu->root_entry[bus].hi = val;
2780 }
2781
2782 spin_unlock(&iommu->lock);
2783
2784 kfree(ctxt_tbls);
2785
2786 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2787
2788 ret = 0;
2789
2790out_unmap:
2791 memunmap(old_rt);
2792
2793 return ret;
2794}
2795
2796#ifdef CONFIG_INTEL_IOMMU_SVM
2797static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2798{
2799 struct intel_iommu *iommu = data;
2800 ioasid_t ioasid;
2801
2802 if (!iommu)
2803 return INVALID_IOASID;
2804 /*
2805	 * The VT-d virtual command interface always uses the full 20-bit
2806	 * PASID range. The host can partition the guest PASID range based
2807	 * on policy, but that is outside the guest's control.
2808 */
2809 if (min < PASID_MIN || max > intel_pasid_max_id)
2810 return INVALID_IOASID;
2811
2812 if (vcmd_alloc_pasid(iommu, &ioasid))
2813 return INVALID_IOASID;
2814
2815 return ioasid;
2816}
2817
2818static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2819{
2820 struct intel_iommu *iommu = data;
2821
2822 if (!iommu)
2823 return;
2824 /*
2825	 * The sanity check of the IOASID owner is done at the upper layer,
2826	 * e.g. VFIO. We can only free the PASID when all devices are unbound.
2827 */
2828 if (ioasid_find(NULL, ioasid, NULL)) {
2829 pr_alert("Cannot free active IOASID %d\n", ioasid);
2830 return;
2831 }
2832 vcmd_free_pasid(iommu, ioasid);
2833}
2834
2835static void register_pasid_allocator(struct intel_iommu *iommu)
2836{
2837 /*
2838	 * If we are running in the host, there is no need for a custom
2839	 * allocator because PASIDs are allocated system-wide by the host.
2840 */
2841 if (!cap_caching_mode(iommu->cap))
2842 return;
2843
2844 if (!sm_supported(iommu)) {
2845 pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2846 return;
2847 }
2848
2849 /*
2850	 * Register a custom PASID allocator if we are running in a guest,
2851	 * where guest PASIDs must be obtained via the virtual command
2852	 * interface. There can be multiple vIOMMUs in each guest but only
2853	 * one allocator is active. All vIOMMU allocators will eventually
2854	 * call the same host allocator.
2855 */
2856 if (!vccap_pasid(iommu->vccap))
2857 return;
2858
2859 pr_info("Register custom PASID allocator\n");
2860 iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2861 iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2862 iommu->pasid_allocator.pdata = (void *)iommu;
2863 if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2864 pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2865 /*
2866		 * Disable scalable mode on this IOMMU if there is no custom
2867		 * allocator. Mixing SM-capable and non-SM vIOMMUs is not
2868		 * supported.
2869 */
2870 intel_iommu_sm = 0;
2871 }
2872}
2873#endif
2874
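/*
 * Bring every DMAR unit into a known state: initialize queued invalidation
 * and the domain-ID space, allocate (or, for a pre-enabled IOMMU, copy) the
 * root entry table, set up the static identity domain, and finally enable
 * fault reporting and, where supported, the SVM page request queue for each
 * unit. Translation itself is enabled later by intel_iommu_init().
 */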
2875static int __init init_dmars(void)
2876{
2877 struct dmar_drhd_unit *drhd;
2878 struct intel_iommu *iommu;
2879 int ret;
2880
2881 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2882 if (ret)
2883 goto free_iommu;
2884
2885 for_each_iommu(iommu, drhd) {
2886 if (drhd->ignored) {
2887 iommu_disable_translation(iommu);
2888 continue;
2889 }
2890
2891 /*
2892		 * Find the max PASID size of all IOMMUs in the system.
2893		 * We need to ensure the system PASID table is no bigger
2894		 * than the smallest supported size.
2895 */
2896 if (pasid_supported(iommu)) {
2897 u32 temp = 2 << ecap_pss(iommu->ecap);
2898
2899 intel_pasid_max_id = min_t(u32, temp,
2900 intel_pasid_max_id);
2901 }
2902
2903 intel_iommu_init_qi(iommu);
2904
2905 ret = iommu_init_domains(iommu);
2906 if (ret)
2907 goto free_iommu;
2908
2909 init_translation_status(iommu);
2910
2911 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2912 iommu_disable_translation(iommu);
2913 clear_translation_pre_enabled(iommu);
2914 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2915 iommu->name);
2916 }
2917
2918 /*
2919		 * TBD: we could share the same root and context tables
2920		 * among all IOMMUs; this needs to be split out later.
2922 */
2923 ret = iommu_alloc_root_entry(iommu);
2924 if (ret)
2925 goto free_iommu;
2926
2927 if (translation_pre_enabled(iommu)) {
2928 pr_info("Translation already enabled - trying to copy translation structures\n");
2929
2930 ret = copy_translation_tables(iommu);
2931 if (ret) {
2932 /*
2933 * We found the IOMMU with translation
2934 * enabled - but failed to copy over the
2935 * old root-entry table. Try to proceed
2936 * by disabling translation now and
2937 * allocating a clean root-entry table.
2938 * This might cause DMAR faults, but
2939 * probably the dump will still succeed.
2940 */
2941 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2942 iommu->name);
2943 iommu_disable_translation(iommu);
2944 clear_translation_pre_enabled(iommu);
2945 } else {
2946 pr_info("Copied translation tables from previous kernel for %s\n",
2947 iommu->name);
2948 }
2949 }
2950
2951 if (!ecap_pass_through(iommu->ecap))
2952 hw_pass_through = 0;
2953 intel_svm_check(iommu);
2954 }
2955
2956 /*
2957 * Now that qi is enabled on all iommus, set the root entry and flush
2958 * caches. This is required on some Intel X58 chipsets, otherwise the
2959 * flush_context function will loop forever and the boot hangs.
2960 */
2961 for_each_active_iommu(iommu, drhd) {
2962 iommu_flush_write_buffer(iommu);
2963#ifdef CONFIG_INTEL_IOMMU_SVM
2964 register_pasid_allocator(iommu);
2965#endif
2966 iommu_set_root_entry(iommu);
2967 }
2968
2969#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2970 dmar_map_gfx = 0;
2971#endif
2972
2973 if (!dmar_map_gfx)
2974 iommu_identity_mapping |= IDENTMAP_GFX;
2975
2976 check_tylersburg_isoch();
2977
2978 ret = si_domain_init(hw_pass_through);
2979 if (ret)
2980 goto free_iommu;
2981
2982 /*
2983 * for each drhd
2984 * enable fault log
2985 * global invalidate context cache
2986 * global invalidate iotlb
2987 * enable translation
2988 */
2989 for_each_iommu(iommu, drhd) {
2990 if (drhd->ignored) {
2991 /*
2992 * we always have to disable PMRs or DMA may fail on
2993 * this device
2994 */
2995 if (force_on)
2996 iommu_disable_protect_mem_regions(iommu);
2997 continue;
2998 }
2999
3000 iommu_flush_write_buffer(iommu);
3001
3002#ifdef CONFIG_INTEL_IOMMU_SVM
3003 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3004 /*
3005			 * Calling dmar_alloc_hwirq() with dmar_global_lock held could
3006			 * cause a lock race condition, so temporarily drop the lock here.
3007 */
3008 up_write(&dmar_global_lock);
3009 ret = intel_svm_enable_prq(iommu);
3010 down_write(&dmar_global_lock);
3011 if (ret)
3012 goto free_iommu;
3013 }
3014#endif
3015 ret = dmar_set_interrupt(iommu);
3016 if (ret)
3017 goto free_iommu;
3018 }
3019
3020 return 0;
3021
3022free_iommu:
3023 for_each_active_iommu(iommu, drhd) {
3024 disable_dmar_iommu(iommu);
3025 free_dmar_iommu(iommu);
3026 }
3027 if (si_domain) {
3028 domain_exit(si_domain);
3029 si_domain = NULL;
3030 }
3031
3032 return ret;
3033}
3034
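/*
 * Mark DMAR units that do not need DMA remapping as ignored: units whose
 * device scope contains no active devices, and units that cover only
 * graphics devices when the gfx devices are not to be remapped
 * (dmar_map_gfx == 0).
 */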
3035static void __init init_no_remapping_devices(void)
3036{
3037 struct dmar_drhd_unit *drhd;
3038 struct device *dev;
3039 int i;
3040
3041 for_each_drhd_unit(drhd) {
3042 if (!drhd->include_all) {
3043 for_each_active_dev_scope(drhd->devices,
3044 drhd->devices_cnt, i, dev)
3045 break;
3046 /* ignore DMAR unit if no devices exist */
3047 if (i == drhd->devices_cnt)
3048 drhd->ignored = 1;
3049 }
3050 }
3051
3052 for_each_active_drhd_unit(drhd) {
3053 if (drhd->include_all)
3054 continue;
3055
3056 for_each_active_dev_scope(drhd->devices,
3057 drhd->devices_cnt, i, dev)
3058 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3059 break;
3060 if (i < drhd->devices_cnt)
3061 continue;
3062
3063		/* This IOMMU has *only* gfx devices. Either bypass it or
3064		   mark it as gfx-dedicated, as appropriate. */
3065 drhd->gfx_dedicated = 1;
3066 if (!dmar_map_gfx)
3067 drhd->ignored = 1;
3068 }
3069}
3070
3071#ifdef CONFIG_SUSPEND
3072static int init_iommu_hw(void)
3073{
3074 struct dmar_drhd_unit *drhd;
3075 struct intel_iommu *iommu = NULL;
3076
3077 for_each_active_iommu(iommu, drhd)
3078 if (iommu->qi)
3079 dmar_reenable_qi(iommu);
3080
3081 for_each_iommu(iommu, drhd) {
3082 if (drhd->ignored) {
3083 /*
3084 * we always have to disable PMRs or DMA may fail on
3085 * this device
3086 */
3087 if (force_on)
3088 iommu_disable_protect_mem_regions(iommu);
3089 continue;
3090 }
3091
3092 iommu_flush_write_buffer(iommu);
3093 iommu_set_root_entry(iommu);
3094 iommu_enable_translation(iommu);
3095 iommu_disable_protect_mem_regions(iommu);
3096 }
3097
3098 return 0;
3099}
3100
3101static void iommu_flush_all(void)
3102{
3103 struct dmar_drhd_unit *drhd;
3104 struct intel_iommu *iommu;
3105
3106 for_each_active_iommu(iommu, drhd) {
3107 iommu->flush.flush_context(iommu, 0, 0, 0,
3108 DMA_CCMD_GLOBAL_INVL);
3109 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3110 DMA_TLB_GLOBAL_FLUSH);
3111 }
3112}
3113
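/*
 * Suspend support: iommu_suspend() flushes all caches, disables translation
 * and saves the fault-event registers of every active IOMMU; iommu_resume()
 * re-initializes the hardware via init_iommu_hw() and restores those
 * registers.
 */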
3114static int iommu_suspend(void)
3115{
3116 struct dmar_drhd_unit *drhd;
3117 struct intel_iommu *iommu = NULL;
3118 unsigned long flag;
3119
3120 for_each_active_iommu(iommu, drhd) {
3121 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3122 GFP_KERNEL);
3123 if (!iommu->iommu_state)
3124 goto nomem;
3125 }
3126
3127 iommu_flush_all();
3128
3129 for_each_active_iommu(iommu, drhd) {
3130 iommu_disable_translation(iommu);
3131
3132 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3133
3134 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3135 readl(iommu->reg + DMAR_FECTL_REG);
3136 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3137 readl(iommu->reg + DMAR_FEDATA_REG);
3138 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3139 readl(iommu->reg + DMAR_FEADDR_REG);
3140 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3141 readl(iommu->reg + DMAR_FEUADDR_REG);
3142
3143 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3144 }
3145 return 0;
3146
3147nomem:
3148 for_each_active_iommu(iommu, drhd)
3149 kfree(iommu->iommu_state);
3150
3151 return -ENOMEM;
3152}
3153
3154static void iommu_resume(void)
3155{
3156 struct dmar_drhd_unit *drhd;
3157 struct intel_iommu *iommu = NULL;
3158 unsigned long flag;
3159
3160 if (init_iommu_hw()) {
3161 if (force_on)
3162 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3163 else
3164 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3165 return;
3166 }
3167
3168 for_each_active_iommu(iommu, drhd) {
3169
3170 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3171
3172 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3173 iommu->reg + DMAR_FECTL_REG);
3174 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3175 iommu->reg + DMAR_FEDATA_REG);
3176 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3177 iommu->reg + DMAR_FEADDR_REG);
3178 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3179 iommu->reg + DMAR_FEUADDR_REG);
3180
3181 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3182 }
3183
3184 for_each_active_iommu(iommu, drhd)
3185 kfree(iommu->iommu_state);
3186}
3187
3188static struct syscore_ops iommu_syscore_ops = {
3189 .resume = iommu_resume,
3190 .suspend = iommu_suspend,
3191};
3192
3193static void __init init_iommu_pm_ops(void)
3194{
3195 register_syscore_ops(&iommu_syscore_ops);
3196}
3197
3198#else
3199static inline void init_iommu_pm_ops(void) {}
3200#endif /* CONFIG_SUSPEND */
3201
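/*
 * An RMRR is only acceptable if its base and (end + 1) are page aligned,
 * the range is neither empty nor inverted, and the architecture-specific
 * check passes; otherwise the caller rejects the firmware-provided region
 * and flags the firmware as buggy.
 */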
3202static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3203{
3204 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3205 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3206 rmrr->end_address <= rmrr->base_address ||
3207 arch_rmrr_sanity_check(rmrr))
3208 return -EINVAL;
3209
3210 return 0;
3211}
3212
3213int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3214{
3215 struct acpi_dmar_reserved_memory *rmrr;
3216 struct dmar_rmrr_unit *rmrru;
3217
3218 rmrr = (struct acpi_dmar_reserved_memory *)header;
3219 if (rmrr_sanity_check(rmrr)) {
3220 pr_warn(FW_BUG
3221 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3222 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3223 rmrr->base_address, rmrr->end_address,
3224 dmi_get_system_info(DMI_BIOS_VENDOR),
3225 dmi_get_system_info(DMI_BIOS_VERSION),
3226 dmi_get_system_info(DMI_PRODUCT_VERSION));
3227 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3228 }
3229
3230 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3231 if (!rmrru)
3232 goto out;
3233
3234 rmrru->hdr = header;
3235
3236 rmrru->base_address = rmrr->base_address;
3237 rmrru->end_address = rmrr->end_address;
3238
3239 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3240 ((void *)rmrr) + rmrr->header.length,
3241 &rmrru->devices_cnt);
3242 if (rmrru->devices_cnt && rmrru->devices == NULL)
3243 goto free_rmrru;
3244
3245 list_add(&rmrru->list, &dmar_rmrr_units);
3246
3247 return 0;
3248free_rmrru:
3249 kfree(rmrru);
3250out:
3251 return -ENOMEM;
3252}
3253
3254static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3255{
3256 struct dmar_atsr_unit *atsru;
3257 struct acpi_dmar_atsr *tmp;
3258
3259 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3260 dmar_rcu_check()) {
3261 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3262 if (atsr->segment != tmp->segment)
3263 continue;
3264 if (atsr->header.length != tmp->header.length)
3265 continue;
3266 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3267 return atsru;
3268 }
3269
3270 return NULL;
3271}
3272
3273int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3274{
3275 struct acpi_dmar_atsr *atsr;
3276 struct dmar_atsr_unit *atsru;
3277
3278 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3279 return 0;
3280
3281 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3282 atsru = dmar_find_atsr(atsr);
3283 if (atsru)
3284 return 0;
3285
3286 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3287 if (!atsru)
3288 return -ENOMEM;
3289
3290 /*
3291	 * If the memory was allocated from the slab by the ACPI _DSM method,
3292	 * we need to copy its content because the memory buffer will be
3293	 * freed on return.
3294 */
3295 atsru->hdr = (void *)(atsru + 1);
3296 memcpy(atsru->hdr, hdr, hdr->length);
3297 atsru->include_all = atsr->flags & 0x1;
3298 if (!atsru->include_all) {
3299 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3300 (void *)atsr + atsr->header.length,
3301 &atsru->devices_cnt);
3302 if (atsru->devices_cnt && atsru->devices == NULL) {
3303 kfree(atsru);
3304 return -ENOMEM;
3305 }
3306 }
3307
3308 list_add_rcu(&atsru->list, &dmar_atsr_units);
3309
3310 return 0;
3311}
3312
3313static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3314{
3315 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3316 kfree(atsru);
3317}
3318
3319int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320{
3321 struct acpi_dmar_atsr *atsr;
3322 struct dmar_atsr_unit *atsru;
3323
3324 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3325 atsru = dmar_find_atsr(atsr);
3326 if (atsru) {
3327 list_del_rcu(&atsru->list);
3328 synchronize_rcu();
3329 intel_iommu_free_atsr(atsru);
3330 }
3331
3332 return 0;
3333}
3334
3335int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3336{
3337 int i;
3338 struct device *dev;
3339 struct acpi_dmar_atsr *atsr;
3340 struct dmar_atsr_unit *atsru;
3341
3342 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3343 atsru = dmar_find_atsr(atsr);
3344 if (!atsru)
3345 return 0;
3346
3347 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3348 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3349 i, dev)
3350 return -EBUSY;
3351 }
3352
3353 return 0;
3354}
3355
3356static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3357{
3358 struct dmar_satc_unit *satcu;
3359 struct acpi_dmar_satc *tmp;
3360
3361 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3362 dmar_rcu_check()) {
3363 tmp = (struct acpi_dmar_satc *)satcu->hdr;
3364 if (satc->segment != tmp->segment)
3365 continue;
3366 if (satc->header.length != tmp->header.length)
3367 continue;
3368 if (memcmp(satc, tmp, satc->header.length) == 0)
3369 return satcu;
3370 }
3371
3372 return NULL;
3373}
3374
3375int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3376{
3377 struct acpi_dmar_satc *satc;
3378 struct dmar_satc_unit *satcu;
3379
3380 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3381 return 0;
3382
3383 satc = container_of(hdr, struct acpi_dmar_satc, header);
3384 satcu = dmar_find_satc(satc);
3385 if (satcu)
3386 return 0;
3387
3388 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3389 if (!satcu)
3390 return -ENOMEM;
3391
3392 satcu->hdr = (void *)(satcu + 1);
3393 memcpy(satcu->hdr, hdr, hdr->length);
3394 satcu->atc_required = satc->flags & 0x1;
3395 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3396 (void *)satc + satc->header.length,
3397 &satcu->devices_cnt);
3398 if (satcu->devices_cnt && !satcu->devices) {
3399 kfree(satcu);
3400 return -ENOMEM;
3401 }
3402 list_add_rcu(&satcu->list, &dmar_satc_units);
3403
3404 return 0;
3405}
3406
3407static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3408{
3409 int sp, ret;
3410 struct intel_iommu *iommu = dmaru->iommu;
3411
3412 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3413 if (ret)
3414 goto out;
3415
3416 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3417 pr_warn("%s: Doesn't support hardware pass through.\n",
3418 iommu->name);
3419 return -ENXIO;
3420 }
3421
3422 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3423 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3424 pr_warn("%s: Doesn't support large page.\n",
3425 iommu->name);
3426 return -ENXIO;
3427 }
3428
3429 /*
3430 * Disable translation if already enabled prior to OS handover.
3431 */
3432 if (iommu->gcmd & DMA_GCMD_TE)
3433 iommu_disable_translation(iommu);
3434
3435 ret = iommu_init_domains(iommu);
3436 if (ret == 0)
3437 ret = iommu_alloc_root_entry(iommu);
3438 if (ret)
3439 goto out;
3440
3441 intel_svm_check(iommu);
3442
3443 if (dmaru->ignored) {
3444 /*
3445 * we always have to disable PMRs or DMA may fail on this device
3446 */
3447 if (force_on)
3448 iommu_disable_protect_mem_regions(iommu);
3449 return 0;
3450 }
3451
3452 intel_iommu_init_qi(iommu);
3453 iommu_flush_write_buffer(iommu);
3454
3455#ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457 ret = intel_svm_enable_prq(iommu);
3458 if (ret)
3459 goto disable_iommu;
3460 }
3461#endif
3462 ret = dmar_set_interrupt(iommu);
3463 if (ret)
3464 goto disable_iommu;
3465
3466 iommu_set_root_entry(iommu);
3467 iommu_enable_translation(iommu);
3468
3469 iommu_disable_protect_mem_regions(iommu);
3470 return 0;
3471
3472disable_iommu:
3473 disable_dmar_iommu(iommu);
3474out:
3475 free_dmar_iommu(iommu);
3476 return ret;
3477}
3478
3479int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3480{
3481 int ret = 0;
3482 struct intel_iommu *iommu = dmaru->iommu;
3483
3484 if (!intel_iommu_enabled)
3485 return 0;
3486 if (iommu == NULL)
3487 return -EINVAL;
3488
3489 if (insert) {
3490 ret = intel_iommu_add(dmaru);
3491 } else {
3492 disable_dmar_iommu(iommu);
3493 free_dmar_iommu(iommu);
3494 }
3495
3496 return ret;
3497}
3498
3499static void intel_iommu_free_dmars(void)
3500{
3501 struct dmar_rmrr_unit *rmrru, *rmrr_n;
3502 struct dmar_atsr_unit *atsru, *atsr_n;
3503 struct dmar_satc_unit *satcu, *satc_n;
3504
3505 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3506 list_del(&rmrru->list);
3507 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3508 kfree(rmrru);
3509 }
3510
3511 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3512 list_del(&atsru->list);
3513 intel_iommu_free_atsr(atsru);
3514 }
3515 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3516 list_del(&satcu->list);
3517 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3518 kfree(satcu);
3519 }
3520}
3521
3522static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3523{
3524 struct dmar_satc_unit *satcu;
3525 struct acpi_dmar_satc *satc;
3526 struct device *tmp;
3527 int i;
3528
3529 dev = pci_physfn(dev);
3530 rcu_read_lock();
3531
3532 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3533 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3534 if (satc->segment != pci_domain_nr(dev->bus))
3535 continue;
3536 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3537 if (to_pci_dev(tmp) == dev)
3538 goto out;
3539 }
3540 satcu = NULL;
3541out:
3542 rcu_read_unlock();
3543 return satcu;
3544}
3545
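/*
 * Decide whether ATS may be used for @dev: a matching SATC entry, if any,
 * overrides everything else; otherwise walk up to the root port and allow
 * ATS only if that root port (or an include-all entry) is listed in an ATSR
 * unit for the device's segment. Integrated devices with no upstream bridge
 * are always allowed.
 */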
3546static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3547{
3548 int i, ret = 1;
3549 struct pci_bus *bus;
3550 struct pci_dev *bridge = NULL;
3551 struct device *tmp;
3552 struct acpi_dmar_atsr *atsr;
3553 struct dmar_atsr_unit *atsru;
3554 struct dmar_satc_unit *satcu;
3555
3556 dev = pci_physfn(dev);
3557 satcu = dmar_find_matched_satc_unit(dev);
3558 if (satcu)
3559 /*
3560		 * This device supports ATS because it is listed in the SATC
3561		 * table. When the IOMMU is in legacy mode, ATS is enabled
3562		 * automatically by the HW for devices that require it, so the
3563		 * OS should not enable ATS for this device, to avoid duplicated
3564		 * TLB invalidations.
3565 */
3566 return !(satcu->atc_required && !sm_supported(iommu));
3567
3568 for (bus = dev->bus; bus; bus = bus->parent) {
3569 bridge = bus->self;
3570 /* If it's an integrated device, allow ATS */
3571 if (!bridge)
3572 return 1;
3573 /* Connected via non-PCIe: no ATS */
3574 if (!pci_is_pcie(bridge) ||
3575 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3576 return 0;
3577 /* If we found the root port, look it up in the ATSR */
3578 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3579 break;
3580 }
3581
3582 rcu_read_lock();
3583 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3584 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3585 if (atsr->segment != pci_domain_nr(dev->bus))
3586 continue;
3587
3588 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3589 if (tmp == &bridge->dev)
3590 goto out;
3591
3592 if (atsru->include_all)
3593 goto out;
3594 }
3595 ret = 0;
3596out:
3597 rcu_read_unlock();
3598
3599 return ret;
3600}
3601
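/*
 * Keep the RMRR/ATSR/SATC device-scope lists in sync with PCI hotplug:
 * newly added devices are inserted into every matching scope, and removed
 * devices are dropped from it.
 */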
3602int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3603{
3604 int ret;
3605 struct dmar_rmrr_unit *rmrru;
3606 struct dmar_atsr_unit *atsru;
3607 struct dmar_satc_unit *satcu;
3608 struct acpi_dmar_atsr *atsr;
3609 struct acpi_dmar_reserved_memory *rmrr;
3610 struct acpi_dmar_satc *satc;
3611
3612 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3613 return 0;
3614
3615 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3616 rmrr = container_of(rmrru->hdr,
3617 struct acpi_dmar_reserved_memory, header);
3618 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3619 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3620 ((void *)rmrr) + rmrr->header.length,
3621 rmrr->segment, rmrru->devices,
3622 rmrru->devices_cnt);
3623 if (ret < 0)
3624 return ret;
3625 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3626 dmar_remove_dev_scope(info, rmrr->segment,
3627 rmrru->devices, rmrru->devices_cnt);
3628 }
3629 }
3630
3631 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3632 if (atsru->include_all)
3633 continue;
3634
3635 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3636 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3637 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3638 (void *)atsr + atsr->header.length,
3639 atsr->segment, atsru->devices,
3640 atsru->devices_cnt);
3641 if (ret > 0)
3642 break;
3643 else if (ret < 0)
3644 return ret;
3645 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3646 if (dmar_remove_dev_scope(info, atsr->segment,
3647 atsru->devices, atsru->devices_cnt))
3648 break;
3649 }
3650 }
3651 list_for_each_entry(satcu, &dmar_satc_units, list) {
3652 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3653 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3654 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3655 (void *)satc + satc->header.length,
3656 satc->segment, satcu->devices,
3657 satcu->devices_cnt);
3658 if (ret > 0)
3659 break;
3660 else if (ret < 0)
3661 return ret;
3662 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3663 if (dmar_remove_dev_scope(info, satc->segment,
3664 satcu->devices, satcu->devices_cnt))
3665 break;
3666 }
3667 }
3668
3669 return 0;
3670}
3671
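/*
 * Memory hotplug notifier for the static identity domain: extend the 1:1
 * map when a memory block goes online, and unmap the range (flushing the
 * IOTLB on every active IOMMU) when it goes offline or onlining is
 * cancelled.
 */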
3672static int intel_iommu_memory_notifier(struct notifier_block *nb,
3673 unsigned long val, void *v)
3674{
3675 struct memory_notify *mhp = v;
3676 unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3677 unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3678 mhp->nr_pages - 1);
3679
3680 switch (val) {
3681 case MEM_GOING_ONLINE:
3682 if (iommu_domain_identity_map(si_domain,
3683 start_vpfn, last_vpfn)) {
3684 pr_warn("Failed to build identity map for [%lx-%lx]\n",
3685 start_vpfn, last_vpfn);
3686 return NOTIFY_BAD;
3687 }
3688 break;
3689
3690 case MEM_OFFLINE:
3691 case MEM_CANCEL_ONLINE:
3692 {
3693 struct dmar_drhd_unit *drhd;
3694 struct intel_iommu *iommu;
3695 LIST_HEAD(freelist);
3696
3697 domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3698
3699 rcu_read_lock();
3700 for_each_active_iommu(iommu, drhd)
3701 iommu_flush_iotlb_psi(iommu, si_domain,
3702 start_vpfn, mhp->nr_pages,
3703 list_empty(&freelist), 0);
3704 rcu_read_unlock();
3705 put_pages_list(&freelist);
3706 }
3707 break;
3708 }
3709
3710 return NOTIFY_OK;
3711}
3712
3713static struct notifier_block intel_iommu_memory_nb = {
3714 .notifier_call = intel_iommu_memory_notifier,
3715 .priority = 0
3716};
3717
3718static void intel_disable_iommus(void)
3719{
3720 struct intel_iommu *iommu = NULL;
3721 struct dmar_drhd_unit *drhd;
3722
3723 for_each_iommu(iommu, drhd)
3724 iommu_disable_translation(iommu);
3725}
3726
3727void intel_iommu_shutdown(void)
3728{
3729 struct dmar_drhd_unit *drhd;
3730 struct intel_iommu *iommu = NULL;
3731
3732 if (no_iommu || dmar_disabled)
3733 return;
3734
3735 down_write(&dmar_global_lock);
3736
3737 /* Disable PMRs explicitly here. */
3738 for_each_iommu(iommu, drhd)
3739 iommu_disable_protect_mem_regions(iommu);
3740
3741 /* Make sure the IOMMUs are switched off */
3742 intel_disable_iommus();
3743
3744 up_write(&dmar_global_lock);
3745}
3746
3747static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3748{
3749 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3750
3751 return container_of(iommu_dev, struct intel_iommu, iommu);
3752}
3753
3754static ssize_t version_show(struct device *dev,
3755 struct device_attribute *attr, char *buf)
3756{
3757 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3758 u32 ver = readl(iommu->reg + DMAR_VER_REG);
3759 return sprintf(buf, "%d:%d\n",
3760 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3761}
3762static DEVICE_ATTR_RO(version);
3763
3764static ssize_t address_show(struct device *dev,
3765 struct device_attribute *attr, char *buf)
3766{
3767 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3768 return sprintf(buf, "%llx\n", iommu->reg_phys);
3769}
3770static DEVICE_ATTR_RO(address);
3771
3772static ssize_t cap_show(struct device *dev,
3773 struct device_attribute *attr, char *buf)
3774{
3775 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3776 return sprintf(buf, "%llx\n", iommu->cap);
3777}
3778static DEVICE_ATTR_RO(cap);
3779
3780static ssize_t ecap_show(struct device *dev,
3781 struct device_attribute *attr, char *buf)
3782{
3783 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3784 return sprintf(buf, "%llx\n", iommu->ecap);
3785}
3786static DEVICE_ATTR_RO(ecap);
3787
3788static ssize_t domains_supported_show(struct device *dev,
3789 struct device_attribute *attr, char *buf)
3790{
3791 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3792 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3793}
3794static DEVICE_ATTR_RO(domains_supported);
3795
3796static ssize_t domains_used_show(struct device *dev,
3797 struct device_attribute *attr, char *buf)
3798{
3799 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3800 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3801 cap_ndoms(iommu->cap)));
3802}
3803static DEVICE_ATTR_RO(domains_used);
3804
3805static struct attribute *intel_iommu_attrs[] = {
3806 &dev_attr_version.attr,
3807 &dev_attr_address.attr,
3808 &dev_attr_cap.attr,
3809 &dev_attr_ecap.attr,
3810 &dev_attr_domains_supported.attr,
3811 &dev_attr_domains_used.attr,
3812 NULL,
3813};
3814
3815static struct attribute_group intel_iommu_group = {
3816 .name = "intel-iommu",
3817 .attrs = intel_iommu_attrs,
3818};
3819
3820const struct attribute_group *intel_iommu_groups[] = {
3821 &intel_iommu_group,
3822 NULL,
3823};
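/*
 * These attributes are registered via iommu_device_sysfs_add() from
 * intel_iommu_init() and typically show up as, e.g.,
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */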
3824
3825static inline bool has_external_pci(void)
3826{
3827 struct pci_dev *pdev = NULL;
3828
3829 for_each_pci_dev(pdev)
3830 if (pdev->external_facing) {
3831 pci_dev_put(pdev);
3832 return true;
3833 }
3834
3835 return false;
3836}
3837
3838static int __init platform_optin_force_iommu(void)
3839{
3840 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3841 return 0;
3842
3843 if (no_iommu || dmar_disabled)
3844 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3845
3846 /*
3847 * If Intel-IOMMU is disabled by default, we will apply identity
3848 * map for all devices except those marked as being untrusted.
3849 */
3850 if (dmar_disabled)
3851 iommu_set_default_passthrough(false);
3852
3853 dmar_disabled = 0;
3854 no_iommu = 0;
3855
3856 return 1;
3857}
3858
3859static int __init probe_acpi_namespace_devices(void)
3860{
3861 struct dmar_drhd_unit *drhd;
3862 /* To avoid a -Wunused-but-set-variable warning. */
3863 struct intel_iommu *iommu __maybe_unused;
3864 struct device *dev;
3865 int i, ret = 0;
3866
3867 for_each_active_iommu(iommu, drhd) {
3868 for_each_active_dev_scope(drhd->devices,
3869 drhd->devices_cnt, i, dev) {
3870 struct acpi_device_physical_node *pn;
3871 struct iommu_group *group;
3872 struct acpi_device *adev;
3873
3874 if (dev->bus != &acpi_bus_type)
3875 continue;
3876
3877 adev = to_acpi_device(dev);
3878 mutex_lock(&adev->physical_node_lock);
3879 list_for_each_entry(pn,
3880 &adev->physical_node_list, node) {
3881 group = iommu_group_get(pn->dev);
3882 if (group) {
3883 iommu_group_put(group);
3884 continue;
3885 }
3886
3887 ret = iommu_probe_device(pn->dev);
3888 if (ret)
3889 break;
3890 }
3891 mutex_unlock(&adev->physical_node_lock);
3892
3893 if (ret)
3894 return ret;
3895 }
3896 }
3897
3898 return 0;
3899}
3900
3901static __init int tboot_force_iommu(void)
3902{
3903 if (!tboot_enabled())
3904 return 0;
3905
3906 if (no_iommu || dmar_disabled)
3907 pr_warn("Forcing Intel-IOMMU to enabled\n");
3908
3909 dmar_disabled = 0;
3910 no_iommu = 0;
3911
3912 return 1;
3913}
3914
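/*
 * Main entry point: parse the DMAR table and device scopes, set up every
 * IOMMU via init_dmars(), register the suspend/resume hooks, sysfs entries
 * and the memory-hotplug notifier, probe ACPI namespace devices, and
 * finally enable DMA translation on units where it is not already enabled.
 */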
3915int __init intel_iommu_init(void)
3916{
3917 int ret = -ENODEV;
3918 struct dmar_drhd_unit *drhd;
3919 struct intel_iommu *iommu;
3920
3921 /*
3922 * Intel IOMMU is required for a TXT/tboot launch or platform
3923 * opt in, so enforce that.
3924 */
3925 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3926 platform_optin_force_iommu();
3927
3928 down_write(&dmar_global_lock);
3929 if (dmar_table_init()) {
3930 if (force_on)
3931 panic("tboot: Failed to initialize DMAR table\n");
3932 goto out_free_dmar;
3933 }
3934
3935 if (dmar_dev_scope_init() < 0) {
3936 if (force_on)
3937 panic("tboot: Failed to initialize DMAR device scope\n");
3938 goto out_free_dmar;
3939 }
3940
3941 up_write(&dmar_global_lock);
3942
3943 /*
3944 * The bus notifier takes the dmar_global_lock, so lockdep will
3945 * complain later when we register it under the lock.
3946 */
3947 dmar_register_bus_notifier();
3948
3949 down_write(&dmar_global_lock);
3950
3951 if (!no_iommu)
3952 intel_iommu_debugfs_init();
3953
3954 if (no_iommu || dmar_disabled) {
3955 /*
3956		 * We exit the function here to ensure the IOMMU's remapping and
3957		 * mempool aren't set up, which means the IOMMU's PMRs won't be
3958		 * disabled via the call to init_dmars(). So disable them
3959		 * explicitly here. The PMRs were set up by tboot prior to calling
3960		 * SENTER, but the kernel is expected to reset/tear down the PMRs.
3962 */
3963 if (intel_iommu_tboot_noforce) {
3964 for_each_iommu(iommu, drhd)
3965 iommu_disable_protect_mem_regions(iommu);
3966 }
3967
3968 /*
3969 * Make sure the IOMMUs are switched off, even when we
3970 * boot into a kexec kernel and the previous kernel left
3971 * them enabled
3972 */
3973 intel_disable_iommus();
3974 goto out_free_dmar;
3975 }
3976
3977 if (list_empty(&dmar_rmrr_units))
3978 pr_info("No RMRR found\n");
3979
3980 if (list_empty(&dmar_atsr_units))
3981 pr_info("No ATSR found\n");
3982
3983 if (list_empty(&dmar_satc_units))
3984 pr_info("No SATC found\n");
3985
3986 init_no_remapping_devices();
3987
3988 ret = init_dmars();
3989 if (ret) {
3990 if (force_on)
3991 panic("tboot: Failed to initialize DMARs\n");
3992 pr_err("Initialization failed\n");
3993 goto out_free_dmar;
3994 }
3995 up_write(&dmar_global_lock);
3996
3997 init_iommu_pm_ops();
3998
3999 down_read(&dmar_global_lock);
4000 for_each_active_iommu(iommu, drhd) {
4001 /*
4002 * The flush queue implementation does not perform
4003 * page-selective invalidations that are required for efficient
4004 * TLB flushes in virtual environments. The benefit of batching
4005 * is likely to be much lower than the overhead of synchronizing
4006 * the virtual and physical IOMMU page-tables.
4007 */
4008 if (cap_caching_mode(iommu->cap)) {
4009 pr_info_once("IOMMU batching disallowed due to virtualization\n");
4010 iommu_set_dma_strict();
4011 }
4012 iommu_device_sysfs_add(&iommu->iommu, NULL,
4013 intel_iommu_groups,
4014 "%s", iommu->name);
4015 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4016 }
4017 up_read(&dmar_global_lock);
4018
4019 if (si_domain && !hw_pass_through)
4020 register_memory_notifier(&intel_iommu_memory_nb);
4021
4022 down_read(&dmar_global_lock);
4023 if (probe_acpi_namespace_devices())
4024 pr_warn("ACPI name space devices didn't probe correctly\n");
4025
4026 /* Finally, we enable the DMA remapping hardware. */
4027 for_each_iommu(iommu, drhd) {
4028 if (!drhd->ignored && !translation_pre_enabled(iommu))
4029 iommu_enable_translation(iommu);
4030
4031 iommu_disable_protect_mem_regions(iommu);
4032 }
4033 up_read(&dmar_global_lock);
4034
4035 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4036
4037 intel_iommu_enabled = 1;
4038
4039 return 0;
4040
4041out_free_dmar:
4042 intel_iommu_free_dmars();
4043 up_write(&dmar_global_lock);
4044 return ret;
4045}
4046
4047static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4048{
4049 struct device_domain_info *info = opaque;
4050
4051 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4052 return 0;
4053}
4054
4055/*
4056 * NB - intel-iommu lacks any sort of reference counting for the users of
4057 * dependent devices. If multiple endpoints have intersecting dependent
4058 * devices, unbinding the driver from any one of them will possibly leave
4059 * the others unable to operate.
4060 */
4061static void domain_context_clear(struct device_domain_info *info)
4062{
4063 if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4064 return;
4065
4066 pci_for_each_dma_alias(to_pci_dev(info->dev),
4067 &domain_context_clear_one_cb, info);
4068}
4069
4070static void dmar_remove_one_dev_info(struct device *dev)
4071{
4072 struct device_domain_info *info = dev_iommu_priv_get(dev);
4073 struct dmar_domain *domain = info->domain;
4074 struct intel_iommu *iommu = info->iommu;
4075 unsigned long flags;
4076
4077 if (!dev_is_real_dma_subdevice(info->dev)) {
4078 if (dev_is_pci(info->dev) && sm_supported(iommu))
4079 intel_pasid_tear_down_entry(iommu, info->dev,
4080 PASID_RID2PASID, false);
4081
4082 iommu_disable_pci_caps(info);
4083 domain_context_clear(info);
4084 }
4085
4086 spin_lock_irqsave(&domain->lock, flags);
4087 list_del(&info->link);
4088 spin_unlock_irqrestore(&domain->lock, flags);
4089
4090 domain_detach_iommu(domain, iommu);
4091 info->domain = NULL;
4092}
4093
4094/*
4095 * Clear the page table pointer in context or pasid table entries so that
4096 * all DMA requests without PASID from the device are blocked. If the page
4097 * table has been set, clean up the data structures.
4098 */
4099static void device_block_translation(struct device *dev)
4100{
4101 struct device_domain_info *info = dev_iommu_priv_get(dev);
4102 struct intel_iommu *iommu = info->iommu;
4103 unsigned long flags;
4104
4105 iommu_disable_pci_caps(info);
4106 if (!dev_is_real_dma_subdevice(dev)) {
4107 if (sm_supported(iommu))
4108 intel_pasid_tear_down_entry(iommu, dev,
4109 PASID_RID2PASID, false);
4110 else
4111 domain_context_clear(info);
4112 }
4113
4114 if (!info->domain)
4115 return;
4116
4117 spin_lock_irqsave(&info->domain->lock, flags);
4118 list_del(&info->link);
4119 spin_unlock_irqrestore(&info->domain->lock, flags);
4120
4121 domain_detach_iommu(info->domain, iommu);
4122 info->domain = NULL;
4123}
4124
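/*
 * Initialize a domain created through the IOMMU API: derive the AGAW from
 * the requested guest address width and allocate the top-level page
 * directory.
 */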
4125static int md_domain_init(struct dmar_domain *domain, int guest_width)
4126{
4127 int adjust_width;
4128
4129 /* calculate AGAW */
4130 domain->gaw = guest_width;
4131 adjust_width = guestwidth_to_adjustwidth(guest_width);
4132 domain->agaw = width_to_agaw(adjust_width);
4133
4134 domain->iommu_coherency = false;
4135 domain->iommu_superpage = 0;
4136 domain->max_addr = 0;
4137
4138 /* always allocate the top pgd */
4139 domain->pgd = alloc_pgtable_page(domain->nid);
4140 if (!domain->pgd)
4141 return -ENOMEM;
4142 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4143 return 0;
4144}
4145
4146static int blocking_domain_attach_dev(struct iommu_domain *domain,
4147 struct device *dev)
4148{
4149 device_block_translation(dev);
4150 return 0;
4151}
4152
4153static struct iommu_domain blocking_domain = {
4154 .ops = &(const struct iommu_domain_ops) {
4155 .attach_dev = blocking_domain_attach_dev,
4156 .free = intel_iommu_domain_free
4157 }
4158};
4159
4160static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4161{
4162 struct dmar_domain *dmar_domain;
4163 struct iommu_domain *domain;
4164
4165 switch (type) {
4166 case IOMMU_DOMAIN_BLOCKED:
4167 return &blocking_domain;
4168 case IOMMU_DOMAIN_DMA:
4169 case IOMMU_DOMAIN_DMA_FQ:
4170 case IOMMU_DOMAIN_UNMANAGED:
4171 dmar_domain = alloc_domain(type);
4172 if (!dmar_domain) {
4173 pr_err("Can't allocate dmar_domain\n");
4174 return NULL;
4175 }
4176 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 pr_err("Domain initialization failed\n");
4178 domain_exit(dmar_domain);
4179 return NULL;
4180 }
4181
4182 domain = &dmar_domain->domain;
4183 domain->geometry.aperture_start = 0;
4184 domain->geometry.aperture_end =
4185 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 domain->geometry.force_aperture = true;
4187
4188 return domain;
4189 case IOMMU_DOMAIN_IDENTITY:
4190 return &si_domain->domain;
4191 case IOMMU_DOMAIN_SVA:
4192 return intel_svm_domain_alloc();
4193 default:
4194 return NULL;
4195 }
4196
4197 return NULL;
4198}
4199
4200static void intel_iommu_domain_free(struct iommu_domain *domain)
4201{
4202 if (domain != &si_domain->domain && domain != &blocking_domain)
4203 domain_exit(to_dmar_domain(domain));
4204}
4205
4206static int prepare_domain_attach_device(struct iommu_domain *domain,
4207 struct device *dev)
4208{
4209 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4210 struct intel_iommu *iommu;
4211 int addr_width;
4212
4213 iommu = device_to_iommu(dev, NULL, NULL);
4214 if (!iommu)
4215 return -ENODEV;
4216
4217 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4218 return -EINVAL;
4219
4220 /* check if this iommu agaw is sufficient for max mapped address */
4221 addr_width = agaw_to_width(iommu->agaw);
4222 if (addr_width > cap_mgaw(iommu->cap))
4223 addr_width = cap_mgaw(iommu->cap);
4224
4225 if (dmar_domain->max_addr > (1LL << addr_width))
4226 return -EINVAL;
4227 dmar_domain->gaw = addr_width;
4228
4229 /*
4230 * Knock out extra levels of page tables if necessary
4231 */
4232 while (iommu->agaw < dmar_domain->agaw) {
4233 struct dma_pte *pte;
4234
4235 pte = dmar_domain->pgd;
4236 if (dma_pte_present(pte)) {
4237 dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4238 free_pgtable_page(pte);
4239 }
4240 dmar_domain->agaw--;
4241 }
4242
4243 return 0;
4244}
4245
4246static int intel_iommu_attach_device(struct iommu_domain *domain,
4247 struct device *dev)
4248{
4249 struct device_domain_info *info = dev_iommu_priv_get(dev);
4250 int ret;
4251
4252 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4253 device_is_rmrr_locked(dev)) {
4254 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
4255 return -EPERM;
4256 }
4257
4258 if (info->domain)
4259 device_block_translation(dev);
4260
4261 ret = prepare_domain_attach_device(domain, dev);
4262 if (ret)
4263 return ret;
4264
4265 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
4266}
4267
4268static int intel_iommu_map(struct iommu_domain *domain,
4269 unsigned long iova, phys_addr_t hpa,
4270 size_t size, int iommu_prot, gfp_t gfp)
4271{
4272 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4273 u64 max_addr;
4274 int prot = 0;
4275
4276 if (iommu_prot & IOMMU_READ)
4277 prot |= DMA_PTE_READ;
4278 if (iommu_prot & IOMMU_WRITE)
4279 prot |= DMA_PTE_WRITE;
4280 if (dmar_domain->set_pte_snp)
4281 prot |= DMA_PTE_SNP;
4282
4283 max_addr = iova + size;
4284 if (dmar_domain->max_addr < max_addr) {
4285 u64 end;
4286
4287 /* check if minimum agaw is sufficient for mapped address */
4288 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4289 if (end < max_addr) {
4290			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4291			       __func__, dmar_domain->gaw, max_addr);
4293 return -EFAULT;
4294 }
4295 dmar_domain->max_addr = max_addr;
4296 }
4297 /* Round up size to next multiple of PAGE_SIZE, if it and
4298 the low bits of hpa would take us onto the next page */
4299 size = aligned_nrpages(hpa, size);
4300 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4301 hpa >> VTD_PAGE_SHIFT, size, prot);
4302}
4303
4304static int intel_iommu_map_pages(struct iommu_domain *domain,
4305 unsigned long iova, phys_addr_t paddr,
4306 size_t pgsize, size_t pgcount,
4307 int prot, gfp_t gfp, size_t *mapped)
4308{
4309 unsigned long pgshift = __ffs(pgsize);
4310 size_t size = pgcount << pgshift;
4311 int ret;
4312
4313 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4314 return -EINVAL;
4315
4316 if (!IS_ALIGNED(iova | paddr, pgsize))
4317 return -EINVAL;
4318
4319 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4320 if (!ret && mapped)
4321 *mapped = size;
4322
4323 return ret;
4324}
4325
4326static size_t intel_iommu_unmap(struct iommu_domain *domain,
4327 unsigned long iova, size_t size,
4328 struct iommu_iotlb_gather *gather)
4329{
4330 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4331 unsigned long start_pfn, last_pfn;
4332 int level = 0;
4333
4334 /* Cope with horrid API which requires us to unmap more than the
4335 size argument if it happens to be a large-page mapping. */
4336 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4337
4338 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4339 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4340
4341 start_pfn = iova >> VTD_PAGE_SHIFT;
4342 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4343
4344 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4345
4346 if (dmar_domain->max_addr == iova + size)
4347 dmar_domain->max_addr = iova;
4348
4349 iommu_iotlb_gather_add_page(domain, gather, iova, size);
4350
4351 return size;
4352}
4353
4354static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4355 unsigned long iova,
4356 size_t pgsize, size_t pgcount,
4357 struct iommu_iotlb_gather *gather)
4358{
4359 unsigned long pgshift = __ffs(pgsize);
4360 size_t size = pgcount << pgshift;
4361
4362 return intel_iommu_unmap(domain, iova, size, gather);
4363}
4364
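/*
 * Flush the IOTLB entries gathered by intel_iommu_unmap() on every IOMMU
 * the domain is attached to, then release the page-table pages queued on
 * the gather freelist.
 */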
4365static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4366 struct iommu_iotlb_gather *gather)
4367{
4368 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4369 unsigned long iova_pfn = IOVA_PFN(gather->start);
4370 size_t size = gather->end - gather->start;
4371 struct iommu_domain_info *info;
4372 unsigned long start_pfn;
4373 unsigned long nrpages;
4374 unsigned long i;
4375
4376 nrpages = aligned_nrpages(gather->start, size);
4377 start_pfn = mm_to_dma_pfn(iova_pfn);
4378
4379 xa_for_each(&dmar_domain->iommu_array, i, info)
4380 iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4381 start_pfn, nrpages,
4382 list_empty(&gather->freelist), 0);
4383
4384 put_pages_list(&gather->freelist);
4385}
4386
4387static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4388 dma_addr_t iova)
4389{
4390 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4391 struct dma_pte *pte;
4392 int level = 0;
4393 u64 phys = 0;
4394
4395 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4396 if (pte && dma_pte_present(pte))
4397 phys = dma_pte_addr(pte) +
4398 (iova & (BIT_MASK(level_to_offset_bits(level) +
4399 VTD_PAGE_SHIFT) - 1));
4400
4401 return phys;
4402}
4403
4404static bool domain_support_force_snooping(struct dmar_domain *domain)
4405{
4406 struct device_domain_info *info;
4407 bool support = true;
4408
4409 assert_spin_locked(&domain->lock);
4410 list_for_each_entry(info, &domain->devices, link) {
4411 if (!ecap_sc_support(info->iommu->ecap)) {
4412 support = false;
4413 break;
4414 }
4415 }
4416
4417 return support;
4418}
4419
4420static void domain_set_force_snooping(struct dmar_domain *domain)
4421{
4422 struct device_domain_info *info;
4423
4424 assert_spin_locked(&domain->lock);
4425 /*
4426	 * The second-level page table supports per-PTE snoop control. The
4427	 * iommu_map() interface will handle this by setting the SNP bit.
4428 */
4429 if (!domain->use_first_level) {
4430 domain->set_pte_snp = true;
4431 return;
4432 }
4433
4434 list_for_each_entry(info, &domain->devices, link)
4435 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4436 PASID_RID2PASID);
4437}
4438
4439static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4440{
4441 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4442 unsigned long flags;
4443
4444 if (dmar_domain->force_snooping)
4445 return true;
4446
4447 spin_lock_irqsave(&dmar_domain->lock, flags);
4448 if (!domain_support_force_snooping(dmar_domain)) {
4449 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4450 return false;
4451 }
4452
4453 domain_set_force_snooping(dmar_domain);
4454 dmar_domain->force_snooping = true;
4455 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4456
4457 return true;
4458}
4459
4460static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4461{
4462 struct device_domain_info *info = dev_iommu_priv_get(dev);
4463
4464 switch (cap) {
4465 case IOMMU_CAP_CACHE_COHERENCY:
4466 return true;
4467 case IOMMU_CAP_INTR_REMAP:
4468 return irq_remapping_enabled == 1;
4469 case IOMMU_CAP_PRE_BOOT_PROTECTION:
4470 return dmar_platform_optin();
4471 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
4472 return ecap_sc_support(info->iommu->ecap);
4473 default:
4474 return false;
4475 }
4476}
4477
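/*
 * Set up the per-device IOMMU data: find the IOMMU translating the device,
 * allocate its device_domain_info, record ATS/PASID/PRI capabilities for
 * PCI devices and, in scalable mode, allocate the device's PASID table.
 */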
4478static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4479{
4480 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4481 struct device_domain_info *info;
4482 struct intel_iommu *iommu;
4483 u8 bus, devfn;
4484 int ret;
4485
4486 iommu = device_to_iommu(dev, &bus, &devfn);
4487 if (!iommu || !iommu->iommu.ops)
4488 return ERR_PTR(-ENODEV);
4489
4490 info = kzalloc(sizeof(*info), GFP_KERNEL);
4491 if (!info)
4492 return ERR_PTR(-ENOMEM);
4493
4494 if (dev_is_real_dma_subdevice(dev)) {
4495 info->bus = pdev->bus->number;
4496 info->devfn = pdev->devfn;
4497 info->segment = pci_domain_nr(pdev->bus);
4498 } else {
4499 info->bus = bus;
4500 info->devfn = devfn;
4501 info->segment = iommu->segment;
4502 }
4503
4504 info->dev = dev;
4505 info->iommu = iommu;
4506 if (dev_is_pci(dev)) {
4507 if (ecap_dev_iotlb_support(iommu->ecap) &&
4508 pci_ats_supported(pdev) &&
4509 dmar_ats_supported(pdev, iommu)) {
4510 info->ats_supported = 1;
4511 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4512 }
4513 if (sm_supported(iommu)) {
4514 if (pasid_supported(iommu)) {
4515 int features = pci_pasid_features(pdev);
4516
4517 if (features >= 0)
4518 info->pasid_supported = features | 1;
4519 }
4520
4521 if (info->ats_supported && ecap_prs(iommu->ecap) &&
4522 pci_pri_supported(pdev))
4523 info->pri_supported = 1;
4524 }
4525 }
4526
4527 dev_iommu_priv_set(dev, info);
4528
4529 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
4530 ret = intel_pasid_alloc_table(dev);
4531 if (ret) {
4532 dev_err(dev, "PASID table allocation failed\n");
4533 dev_iommu_priv_set(dev, NULL);
4534 kfree(info);
4535 return ERR_PTR(ret);
4536 }
4537 }
4538
4539 return &iommu->iommu;
4540}
4541
4542static void intel_iommu_release_device(struct device *dev)
4543{
4544 struct device_domain_info *info = dev_iommu_priv_get(dev);
4545
4546 dmar_remove_one_dev_info(dev);
4547 intel_pasid_free_table(dev);
4548 dev_iommu_priv_set(dev, NULL);
4549 kfree(info);
4550 set_dma_ops(dev, NULL);
4551}
4552
4553static void intel_iommu_probe_finalize(struct device *dev)
4554{
4555 set_dma_ops(dev, NULL);
4556 iommu_setup_dma_ops(dev, 0, U64_MAX);
4557}
4558
4559static void intel_iommu_get_resv_regions(struct device *device,
4560 struct list_head *head)
4561{
4562 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4563 struct iommu_resv_region *reg;
4564 struct dmar_rmrr_unit *rmrr;
4565 struct device *i_dev;
4566 int i;
4567
4568 rcu_read_lock();
4569 for_each_rmrr_units(rmrr) {
4570 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4571 i, i_dev) {
4572 struct iommu_resv_region *resv;
4573 enum iommu_resv_type type;
4574 size_t length;
4575
4576 if (i_dev != device &&
4577 !is_downstream_to_pci_bridge(device, i_dev))
4578 continue;
4579
4580 length = rmrr->end_address - rmrr->base_address + 1;
4581
4582 type = device_rmrr_is_relaxable(device) ?
4583 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4584
4585 resv = iommu_alloc_resv_region(rmrr->base_address,
4586 length, prot, type,
4587 GFP_ATOMIC);
4588 if (!resv)
4589 break;
4590
4591 list_add_tail(&resv->list, head);
4592 }
4593 }
4594 rcu_read_unlock();
4595
4596#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4597 if (dev_is_pci(device)) {
4598 struct pci_dev *pdev = to_pci_dev(device);
4599
4600 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4601 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4602 IOMMU_RESV_DIRECT_RELAXABLE,
4603 GFP_KERNEL);
4604 if (reg)
4605				list_add_tail(&reg->list, head);
4606 }
4607 }
4608#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4609
4610 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4611 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4612 0, IOMMU_RESV_MSI, GFP_KERNEL);
4613 if (!reg)
4614 return;
4615	list_add_tail(&reg->list, head);
4616}
4617
4618static struct iommu_group *intel_iommu_device_group(struct device *dev)
4619{
4620 if (dev_is_pci(dev))
4621 return pci_device_group(dev);
4622 return generic_device_group(dev);
4623}
4624
4625static int intel_iommu_enable_sva(struct device *dev)
4626{
4627 struct device_domain_info *info = dev_iommu_priv_get(dev);
4628 struct intel_iommu *iommu;
4629 int ret;
4630
4631 if (!info || dmar_disabled)
4632 return -EINVAL;
4633
4634 iommu = info->iommu;
4635 if (!iommu)
4636 return -EINVAL;
4637
4638 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4639 return -ENODEV;
4640
4641 if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4642 return -EINVAL;
4643
4644 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4645 if (!ret)
4646 ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4647
4648 return ret;
4649}
4650
4651static int intel_iommu_disable_sva(struct device *dev)
4652{
4653 struct device_domain_info *info = dev_iommu_priv_get(dev);
4654 struct intel_iommu *iommu = info->iommu;
4655 int ret;
4656
4657 ret = iommu_unregister_device_fault_handler(dev);
4658 if (!ret)
4659 ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4660
4661 return ret;
4662}
4663
4664static int intel_iommu_enable_iopf(struct device *dev)
4665{
4666 struct device_domain_info *info = dev_iommu_priv_get(dev);
4667
4668 if (info && info->pri_supported)
4669 return 0;
4670
4671 return -ENODEV;
4672}
4673
4674static int
4675intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4676{
4677 switch (feat) {
4678 case IOMMU_DEV_FEAT_IOPF:
4679 return intel_iommu_enable_iopf(dev);
4680
4681 case IOMMU_DEV_FEAT_SVA:
4682 return intel_iommu_enable_sva(dev);
4683
4684 default:
4685 return -ENODEV;
4686 }
4687}
4688
4689static int
4690intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4691{
4692 switch (feat) {
4693 case IOMMU_DEV_FEAT_IOPF:
4694 return 0;
4695
4696 case IOMMU_DEV_FEAT_SVA:
4697 return intel_iommu_disable_sva(dev);
4698
4699 default:
4700 return -ENODEV;
4701 }
4702}
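
/*
 * Illustrative sketch, not part of this driver: a device driver wanting
 * shared virtual addressing reaches the two handlers above through the
 * generic iommu_dev_enable_feature()/iommu_dev_disable_feature() calls,
 * typically enabling IOPF before SVA and disabling in the reverse order.
 * Error handling is trimmed and the ordering reflects the iommu core
 * documentation rather than anything specific to this file:
 *
 *	int ret;
 *
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *	if (ret)
 *		return ret;
 *	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	if (ret) {
 *		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 *		return ret;
 *	}
 *	...
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
 *	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
 */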
4703
4704static bool intel_iommu_is_attach_deferred(struct device *dev)
4705{
4706 struct device_domain_info *info = dev_iommu_priv_get(dev);
4707
4708 return translation_pre_enabled(info->iommu) && !info->domain;
4709}
4710
4711/*
4712 * Check that the device does not live on an external facing PCI port that is
4713 * marked as untrusted. Such devices should not be able to apply quirks and
4714 * thus not be able to bypass the IOMMU restrictions.
4715 */
4716static bool risky_device(struct pci_dev *pdev)
4717{
4718 if (pdev->untrusted) {
4719 pci_info(pdev,
4720 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4721 pdev->vendor, pdev->device);
4722 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4723 return true;
4724 }
4725 return false;
4726}
4727
4728static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4729 unsigned long iova, size_t size)
4730{
4731 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4732 unsigned long pages = aligned_nrpages(iova, size);
4733 unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4734 struct iommu_domain_info *info;
4735 unsigned long i;
4736
4737 xa_for_each(&dmar_domain->iommu_array, i, info)
4738 __mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4739}
4740
4741static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid)
4742{
4743 struct intel_iommu *iommu = device_to_iommu(dev, NULL, NULL);
4744 struct iommu_domain *domain;
4745
4746 /* Domain type specific cleanup: */
4747 domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0);
4748 if (domain) {
4749 switch (domain->type) {
4750 case IOMMU_DOMAIN_SVA:
4751 intel_svm_remove_dev_pasid(dev, pasid);
4752 break;
4753 default:
4754 /* should never reach here */
4755 WARN_ON(1);
4756 break;
4757 }
4758 }
4759
4760 intel_pasid_tear_down_entry(iommu, dev, pasid, false);
4761}
4762
4763const struct iommu_ops intel_iommu_ops = {
4764 .capable = intel_iommu_capable,
4765 .domain_alloc = intel_iommu_domain_alloc,
4766 .probe_device = intel_iommu_probe_device,
4767 .probe_finalize = intel_iommu_probe_finalize,
4768 .release_device = intel_iommu_release_device,
4769 .get_resv_regions = intel_iommu_get_resv_regions,
4770 .device_group = intel_iommu_device_group,
4771 .dev_enable_feat = intel_iommu_dev_enable_feat,
4772 .dev_disable_feat = intel_iommu_dev_disable_feat,
4773 .is_attach_deferred = intel_iommu_is_attach_deferred,
4774 .def_domain_type = device_def_domain_type,
4775 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4776 .pgsize_bitmap = SZ_4K,
4777#ifdef CONFIG_INTEL_IOMMU_SVM
4778 .page_response = intel_svm_page_response,
4779#endif
4780 .default_domain_ops = &(const struct iommu_domain_ops) {
4781 .attach_dev = intel_iommu_attach_device,
4782 .map_pages = intel_iommu_map_pages,
4783 .unmap_pages = intel_iommu_unmap_pages,
4784 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4785 .flush_iotlb_all = intel_flush_iotlb_all,
4786 .iotlb_sync = intel_iommu_tlb_sync,
4787 .iova_to_phys = intel_iommu_iova_to_phys,
4788 .free = intel_iommu_domain_free,
4789 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4790 }
4791};
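
/*
 * Illustrative sketch, not part of this driver: the ops table above is
 * invoked through the generic IOMMU API rather than called directly. A
 * caller setting up an unmanaged domain for a hypothetical PCI device
 * pdev would roughly do:
 *
 *	struct iommu_domain *domain = iommu_domain_alloc(&pci_bus_type);
 *	int ret;
 *
 *	if (!domain)
 *		return -ENOMEM;
 *	ret = iommu_attach_device(domain, &pdev->dev);
 *	if (ret) {
 *		iommu_domain_free(domain);
 *		return ret;
 *	}
 *	... DMA mappings made with iommu_map()/iommu_unmap() then land in
 *	... intel_iommu_map_pages()/intel_iommu_unmap_pages() above via
 *	... default_domain_ops ...
 *	iommu_detach_device(domain, &pdev->dev);
 *	iommu_domain_free(domain);
 */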
4792
4793static void quirk_iommu_igfx(struct pci_dev *dev)
4794{
4795 if (risky_device(dev))
4796 return;
4797
4798 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4799 dmar_map_gfx = 0;
4800}
4801
4802/* G4x/GM45 integrated gfx dmar support is totally busted. */
4803DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4804DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4805DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4806DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4807DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4808DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4809DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4810
4811/* Broadwell igfx malfunctions with dmar */
4812DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4813DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4814DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4815DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4816DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4817DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4818DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4819DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4820DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4821DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4822DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4823DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4824DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4825DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4826DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4827DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4828DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4829DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4830DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4831DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4832DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4833DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4834DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4835DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4836
4837static void quirk_iommu_rwbf(struct pci_dev *dev)
4838{
4839 if (risky_device(dev))
4840 return;
4841
4842 /*
4843 * Mobile 4 Series Chipset neglects to set RWBF capability,
4844 * but needs it. Same seems to hold for the desktop versions.
4845 */
4846 pci_info(dev, "Forcing write-buffer flush capability\n");
4847 rwbf_quirk = 1;
4848}
4849
4850DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4851DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4852DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4853DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4854DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4855DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4856DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4857
4858#define GGC 0x52
4859#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4860#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4861#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4862#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4863#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4864#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4865#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4866#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
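
/*
 * Illustrative sketch, not part of this driver: the masks above select the
 * VT/stolen-memory bits of the GGC config word as defined here. A
 * hypothetical helper testing only the VT enable bit could look like:
 *
 *	static bool ggc_vt_enabled(struct pci_dev *pdev)
 *	{
 *		u16 ggc;
 *
 *		if (pci_read_config_word(pdev, GGC, &ggc))
 *			return false;
 *		return ggc & GGC_MEMORY_VT_ENABLED;
 *	}
 */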
4867
4868static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4869{
4870 unsigned short ggc;
4871
4872 if (risky_device(dev))
4873 return;
4874
4875 if (pci_read_config_word(dev, GGC, &ggc))
4876 return;
4877
4878 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4879 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4880 dmar_map_gfx = 0;
4881 } else if (dmar_map_gfx) {
4882 /* we have to ensure the gfx device is idle before we flush */
4883 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4884 iommu_set_dma_strict();
4885 }
4886}
4887DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4888DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4889DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4890DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4891
4892static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4893{
4894 unsigned short ver;
4895
4896 if (!IS_GFX_DEVICE(dev))
4897 return;
4898
4899 ver = (dev->device >> 8) & 0xff;
4900 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4901 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4902 ver != 0x9a && ver != 0xa7)
4903 return;
4904
4905 if (risky_device(dev))
4906 return;
4907
4908 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4909 iommu_skip_te_disable = 1;
4910}
4911DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4912
4913/* On Tylersburg chipsets, some BIOSes have been known to enable the
4914 ISOCH DMAR unit for the Azalia sound device, but not give it any
4915 TLB entries, which causes it to deadlock. Check for that. We do
4916 this in a function called from init_dmars(), instead of in a PCI
4917 quirk, because we don't want to print the obnoxious "BIOS broken"
4918 message if VT-d is actually disabled.
4919*/
4920static void __init check_tylersburg_isoch(void)
4921{
4922 struct pci_dev *pdev;
4923 uint32_t vtisochctrl;
4924
4925 /* If there's no Azalia in the system anyway, forget it. */
4926 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4927 if (!pdev)
4928 return;
4929
4930 if (risky_device(pdev)) {
4931 pci_dev_put(pdev);
4932 return;
4933 }
4934
4935 pci_dev_put(pdev);
4936
4937 /* System Management Registers. Might be hidden, in which case
4938 we can't do the sanity check. But that's OK, because the
4939 known-broken BIOSes _don't_ actually hide it, so far. */
4940 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4941 if (!pdev)
4942 return;
4943
4944 if (risky_device(pdev)) {
4945 pci_dev_put(pdev);
4946 return;
4947 }
4948
4949 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4950 pci_dev_put(pdev);
4951 return;
4952 }
4953
4954 pci_dev_put(pdev);
4955
4956 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4957 if (vtisochctrl & 1)
4958 return;
4959
4960 /* Drop all bits other than the number of TLB entries */
4961 vtisochctrl &= 0x1c;
4962
4963 /* If we have the recommended number of TLB entries (16), fine. */
4964 if (vtisochctrl == 0x10)
4965 return;
4966
4967 /* Zero TLB entries? You get to ride the short bus to school. */
4968 if (!vtisochctrl) {
4969 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4970 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4971 dmi_get_system_info(DMI_BIOS_VENDOR),
4972 dmi_get_system_info(DMI_BIOS_VERSION),
4973 dmi_get_system_info(DMI_PRODUCT_VERSION));
4974 iommu_identity_mapping |= IDENTMAP_AZALIA;
4975 return;
4976 }
4977
4978 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4979 vtisochctrl);
4980}
4981
4982/*
4983 * Here we deal with a device TLB defect where the device may inadvertently issue
4984 * an ATS invalidation completion before posted writes initiated with a translated
4985 * address that used translations matching the invalidation address range,
4986 * violating the invalidation completion ordering.
4987 * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4988 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4989 * under the control of the trusted/privileged host device driver must use this
4990 * quirk.
4991 * Device TLBs are invalidated under the following six conditions:
4992 * 1. Device driver does DMA API unmap IOVA
4993 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4994 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4995 * exit_mmap() due to crash
4996 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4997 * VM has to free pages that were unmapped
4998 * 5. Userspace driver unmaps a DMA buffer
4999 * 6. Cache invalidation in vSVA usage (upcoming)
5000 *
5001 * For #1 and #2, device drivers are responsible for stopping DMA traffic
5002 * before unmap/unbind. For #3, the iommu driver is notified through mmu_notifier
5003 * and invalidates the TLB the same way as a normal user unmap, which will use this
5004 * quirk. The dTLB invalidation after a PASID cache flush does not need this quirk.
5005 *
5006 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
5007 */
5008void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
5009 unsigned long address, unsigned long mask,
5010 u32 pasid, u16 qdep)
5011{
5012 u16 sid;
5013
5014 if (likely(!info->dtlb_extra_inval))
5015 return;
5016
5017 sid = PCI_DEVID(info->bus, info->devfn);
5018 if (pasid == PASID_RID2PASID) {
5019 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
5020 qdep, address, mask);
5021 } else {
5022 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
5023 pasid, qdep, address, mask);
5024 }
5025}
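
/*
 * Illustrative sketch, not part of this driver: a hedged, hypothetical call
 * site applying the quirk for the request-without-PASID (RID2PASID) entry.
 * The address/mask pair and the use of info->ats_qdep are illustrative
 * assumptions, not a reference to an actual caller:
 *
 *	quirk_extra_dev_tlb_flush(info, 0, MAX_AGAW_PFN_WIDTH,
 *				  PASID_RID2PASID, info->ats_qdep);
 */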