1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-pages.h"
31#include "pasid.h"
32#include "cap_audit.h"
33#include "perfmon.h"
34
35#define ROOT_SIZE VTD_PAGE_SIZE
36#define CONTEXT_SIZE VTD_PAGE_SIZE
37
38#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
39#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
40#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
41#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
42
43#define IOAPIC_RANGE_START (0xfee00000)
44#define IOAPIC_RANGE_END (0xfeefffff)
45#define IOVA_START_ADDR (0x1000)
46
47#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
48
49#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
50#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
51
52/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
53 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
54#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
55 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
56#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
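/*
 * Worked example (illustrative only): with gaw == 48 and VTD_PAGE_SHIFT == 12,
 *
 *   __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xf_ffff_ffff
 *   DOMAIN_MAX_ADDR(48)  == 0xf_ffff_ffff << 12 == 0xffff_ffff_f000
 *
 * i.e. the last addressable page frame and the address of its first byte.
 * On 64-bit kernels the min_t() clamp in DOMAIN_MAX_PFN is effectively a
 * no-op for any practical gaw.
 */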
57
58static void __init check_tylersburg_isoch(void);
59static int rwbf_quirk;
60
61/*
62 * Set to 1 to panic the kernel if VT-d cannot be successfully enabled
63 * (used when the kernel is launched with TXT)
64 */
65static int force_on = 0;
66static int intel_iommu_tboot_noforce;
67static int no_platform_optin;
68
69#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
70
71/*
72 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
73 * if marked present.
74 */
75static phys_addr_t root_entry_lctp(struct root_entry *re)
76{
77 if (!(re->lo & 1))
78 return 0;
79
80 return re->lo & VTD_PAGE_MASK;
81}
82
83/*
84 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
85 * if marked present.
86 */
87static phys_addr_t root_entry_uctp(struct root_entry *re)
88{
89 if (!(re->hi & 1))
90 return 0;
91
92 return re->hi & VTD_PAGE_MASK;
93}
94
95static int device_rid_cmp_key(const void *key, const struct rb_node *node)
96{
97 struct device_domain_info *info =
98 rb_entry(node, struct device_domain_info, node);
99 const u16 *rid_lhs = key;
100
101 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
102 return -1;
103
104 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
105 return 1;
106
107 return 0;
108}
109
110static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
111{
112 struct device_domain_info *info =
113 rb_entry(lhs, struct device_domain_info, node);
114 u16 key = PCI_DEVID(info->bus, info->devfn);
115
116 return device_rid_cmp_key(&key, rhs);
117}
118
119/*
120 * Looks up an IOMMU-probed device using its source ID.
121 *
122 * Returns the pointer to the device if there is a match. Otherwise,
123 * returns NULL.
124 *
125 * Note that this helper doesn't guarantee that the device won't be
126 * released by the iommu subsystem after being returned. The caller
127 * should use its own synchronization mechanism to avoid the device
128 * being released during its use, if that is a possibility.
129 */
130struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
131{
132 struct device_domain_info *info = NULL;
133 struct rb_node *node;
134 unsigned long flags;
135
136 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
137 node = rb_find(&rid, &iommu->device_rbtree, device_rid_cmp_key);
138 if (node)
139 info = rb_entry(node, struct device_domain_info, node);
140 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
141
142 return info ? info->dev : NULL;
143}
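/*
 * Usage sketch (illustrative, not a real caller): a fault handler that has
 * extracted a 16-bit source ID from a fault record could resolve it with
 *
 *	struct device *dev = device_rbtree_find(iommu, source_id);
 *
 * where source_id == PCI_DEVID(bus, devfn), e.g. 0x0300 for device 03:00.0.
 * As noted above, any further use of dev must be synchronized against
 * device release by the caller.
 */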
144
145static int device_rbtree_insert(struct intel_iommu *iommu,
146 struct device_domain_info *info)
147{
148 struct rb_node *curr;
149 unsigned long flags;
150
151 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
152 curr = rb_find_add(&info->node, &iommu->device_rbtree, device_rid_cmp);
153 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
154 if (WARN_ON(curr))
155 return -EEXIST;
156
157 return 0;
158}
159
160static void device_rbtree_remove(struct device_domain_info *info)
161{
162 struct intel_iommu *iommu = info->iommu;
163 unsigned long flags;
164
165 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
166 rb_erase(&info->node, &iommu->device_rbtree);
167 spin_unlock_irqrestore(&iommu->device_rbtree_lock, flags);
168}
169
170struct dmar_rmrr_unit {
171 struct list_head list; /* list of rmrr units */
172 struct acpi_dmar_header *hdr; /* ACPI header */
173 u64 base_address; /* reserved base address*/
174 u64 end_address; /* reserved end address */
175 struct dmar_dev_scope *devices; /* target devices */
176 int devices_cnt; /* target device count */
177};
178
179struct dmar_atsr_unit {
180 struct list_head list; /* list of ATSR units */
181 struct acpi_dmar_header *hdr; /* ACPI header */
182 struct dmar_dev_scope *devices; /* target devices */
183 int devices_cnt; /* target device count */
184 u8 include_all:1; /* include all ports */
185};
186
187struct dmar_satc_unit {
188 struct list_head list; /* list of SATC units */
189 struct acpi_dmar_header *hdr; /* ACPI header */
190 struct dmar_dev_scope *devices; /* target devices */
191 struct intel_iommu *iommu; /* the corresponding iommu */
192 int devices_cnt; /* target device count */
193 u8 atc_required:1; /* ATS is required */
194};
195
196static LIST_HEAD(dmar_atsr_units);
197static LIST_HEAD(dmar_rmrr_units);
198static LIST_HEAD(dmar_satc_units);
199
200#define for_each_rmrr_units(rmrr) \
201 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
202
203static void intel_iommu_domain_free(struct iommu_domain *domain);
204
205int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
206int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
207
208int intel_iommu_enabled = 0;
209EXPORT_SYMBOL_GPL(intel_iommu_enabled);
210
211static int intel_iommu_superpage = 1;
212static int iommu_identity_mapping;
213static int iommu_skip_te_disable;
214static int disable_igfx_iommu;
215
216#define IDENTMAP_AZALIA 4
217
218const struct iommu_ops intel_iommu_ops;
219static const struct iommu_dirty_ops intel_dirty_ops;
220
221static bool translation_pre_enabled(struct intel_iommu *iommu)
222{
223 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
224}
225
226static void clear_translation_pre_enabled(struct intel_iommu *iommu)
227{
228 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
229}
230
231static void init_translation_status(struct intel_iommu *iommu)
232{
233 u32 gsts;
234
235 gsts = readl(iommu->reg + DMAR_GSTS_REG);
236 if (gsts & DMA_GSTS_TES)
237 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
238}
239
240static int __init intel_iommu_setup(char *str)
241{
242 if (!str)
243 return -EINVAL;
244
245 while (*str) {
246 if (!strncmp(str, "on", 2)) {
247 dmar_disabled = 0;
248 pr_info("IOMMU enabled\n");
249 } else if (!strncmp(str, "off", 3)) {
250 dmar_disabled = 1;
251 no_platform_optin = 1;
252 pr_info("IOMMU disabled\n");
253 } else if (!strncmp(str, "igfx_off", 8)) {
254 disable_igfx_iommu = 1;
255 pr_info("Disable GFX device mapping\n");
256 } else if (!strncmp(str, "forcedac", 8)) {
257 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
258 iommu_dma_forcedac = true;
259 } else if (!strncmp(str, "strict", 6)) {
260 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
261 iommu_set_dma_strict();
262 } else if (!strncmp(str, "sp_off", 6)) {
263 pr_info("Disable supported super page\n");
264 intel_iommu_superpage = 0;
265 } else if (!strncmp(str, "sm_on", 5)) {
266 pr_info("Enable scalable mode if hardware supports\n");
267 intel_iommu_sm = 1;
268 } else if (!strncmp(str, "sm_off", 6)) {
269 pr_info("Scalable mode is disallowed\n");
270 intel_iommu_sm = 0;
271 } else if (!strncmp(str, "tboot_noforce", 13)) {
272 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
273 intel_iommu_tboot_noforce = 1;
274 } else {
275 pr_notice("Unknown option - '%s'\n", str);
276 }
277
278 str += strcspn(str, ",");
279 while (*str == ',')
280 str++;
281 }
282
283 return 1;
284}
285__setup("intel_iommu=", intel_iommu_setup);
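/*
 * Example (illustrative): booting with "intel_iommu=on,sm_on,igfx_off" is
 * parsed one comma-separated token at a time by the loop above: "on" clears
 * dmar_disabled, "sm_on" sets intel_iommu_sm, and "igfx_off" sets
 * disable_igfx_iommu.
 */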
286
287static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
288{
289 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
290
291 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
292}
293
294/*
295 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
296 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
297 * the returned SAGAW.
298 */
299static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
300{
301 unsigned long fl_sagaw, sl_sagaw;
302
303 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
304 sl_sagaw = cap_sagaw(iommu->cap);
305
306 /* Second level only. */
307 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
308 return sl_sagaw;
309
310 /* First level only. */
311 if (!ecap_slts(iommu->ecap))
312 return fl_sagaw;
313
314 return fl_sagaw & sl_sagaw;
315}
316
317static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
318{
319 unsigned long sagaw;
320 int agaw;
321
322 sagaw = __iommu_calculate_sagaw(iommu);
323 for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
324 if (test_bit(agaw, &sagaw))
325 break;
326 }
327
328 return agaw;
329}
330
331/*
332 * Calculate max SAGAW for each iommu.
333 */
334int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
335{
336 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
337}
338
339/*
340 * Calculate agaw for each iommu.
341 * "SAGAW" may be different across iommus; use a default agaw, and fall
342 * back to a smaller supported agaw for iommus that don't support it.
343 */
344int iommu_calculate_agaw(struct intel_iommu *iommu)
345{
346 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
347}
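/*
 * Worked example (illustrative): DEFAULT_DOMAIN_ADDRESS_WIDTH is 57, so the
 * search starts at width_to_agaw(57) == 3. If the IOMMU only reports 4-level
 * support (SAGAW bit 2 set, bit 3 clear), __iommu_calculate_agaw() steps down
 * and returns agaw 2, i.e. a 48-bit address width.
 */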
348
349static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
350{
351 return sm_supported(iommu) ?
352 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
353}
354
355/* Return the super pagesize bitmap if supported. */
356static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
357{
358 unsigned long bitmap = 0;
359
360 /*
361 * 1-level super page supports page size of 2MiB, 2-level super page
362 * supports page size of both 2MiB and 1GiB.
363 */
364 if (domain->iommu_superpage == 1)
365 bitmap |= SZ_2M;
366 else if (domain->iommu_superpage == 2)
367 bitmap |= SZ_2M | SZ_1G;
368
369 return bitmap;
370}
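/*
 * Example (illustrative): iommu_superpage == 1 yields SZ_2M (0x200000);
 * iommu_superpage == 2 yields SZ_2M | SZ_1G (0x40200000). The 4KiB base
 * page size is accounted for separately by the caller.
 */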
371
372struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
373 u8 devfn, int alloc)
374{
375 struct root_entry *root = &iommu->root_entry[bus];
376 struct context_entry *context;
377 u64 *entry;
378
379 /*
380 * Unless the caller requested to allocate a new entry,
381 * returning a copied context entry makes no sense.
382 */
383 if (!alloc && context_copied(iommu, bus, devfn))
384 return NULL;
385
386 entry = &root->lo;
387 if (sm_supported(iommu)) {
388 if (devfn >= 0x80) {
389 devfn -= 0x80;
390 entry = &root->hi;
391 }
392 devfn *= 2;
393 }
394 if (*entry & 1)
395 context = phys_to_virt(*entry & VTD_PAGE_MASK);
396 else {
397 unsigned long phy_addr;
398 if (!alloc)
399 return NULL;
400
401 context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
402 if (!context)
403 return NULL;
404
405 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
406 phy_addr = virt_to_phys((void *)context);
407 *entry = phy_addr | 1;
408 __iommu_flush_cache(iommu, entry, sizeof(*entry));
409 }
410 return &context[devfn];
411}
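/*
 * Example (illustrative): in scalable mode each half of the root entry covers
 * 128 devfns. For devfn 0x85 the code above switches to root->hi, rebases
 * devfn to 0x05 and doubles it, so the (double-sized) context entry is
 * returned at index 0x0a of the upper context table.
 */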
412
413/**
414 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
415 * sub-hierarchy of a candidate PCI-PCI bridge
416 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
417 * @bridge: the candidate PCI-PCI bridge
418 *
419 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
420 */
421static bool
422is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
423{
424 struct pci_dev *pdev, *pbridge;
425
426 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
427 return false;
428
429 pdev = to_pci_dev(dev);
430 pbridge = to_pci_dev(bridge);
431
432 if (pbridge->subordinate &&
433 pbridge->subordinate->number <= pdev->bus->number &&
434 pbridge->subordinate->busn_res.end >= pdev->bus->number)
435 return true;
436
437 return false;
438}
439
440static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
441{
442 struct dmar_drhd_unit *drhd;
443 u32 vtbar;
444 int rc;
445
446 /* We know that this device on this chipset has its own IOMMU.
447 * If we find it under a different IOMMU, then the BIOS is lying
448 * to us. Hope that the IOMMU for this device is actually
449 * disabled, and it needs no translation...
450 */
451 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
452 if (rc) {
453 /* "can't" happen */
454 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
455 return false;
456 }
457 vtbar &= 0xffff0000;
458
459 /* we know that this iommu should be at offset 0xa000 from vtbar */
460 drhd = dmar_find_matched_drhd_unit(pdev);
461 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
462 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
463 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
464 return true;
465 }
466
467 return false;
468}
469
470static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
471{
472 if (!iommu || iommu->drhd->ignored)
473 return true;
474
475 if (dev_is_pci(dev)) {
476 struct pci_dev *pdev = to_pci_dev(dev);
477
478 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
479 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
480 quirk_ioat_snb_local_iommu(pdev))
481 return true;
482 }
483
484 return false;
485}
486
487static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
488{
489 struct dmar_drhd_unit *drhd = NULL;
490 struct pci_dev *pdev = NULL;
491 struct intel_iommu *iommu;
492 struct device *tmp;
493 u16 segment = 0;
494 int i;
495
496 if (!dev)
497 return NULL;
498
499 if (dev_is_pci(dev)) {
500 struct pci_dev *pf_pdev;
501
502 pdev = pci_real_dma_dev(to_pci_dev(dev));
503
504 /* VFs aren't listed in scope tables; we need to look up
505 * the PF instead to find the IOMMU. */
506 pf_pdev = pci_physfn(pdev);
507 dev = &pf_pdev->dev;
508 segment = pci_domain_nr(pdev->bus);
509 } else if (has_acpi_companion(dev))
510 dev = &ACPI_COMPANION(dev)->dev;
511
512 rcu_read_lock();
513 for_each_iommu(iommu, drhd) {
514 if (pdev && segment != drhd->segment)
515 continue;
516
517 for_each_active_dev_scope(drhd->devices,
518 drhd->devices_cnt, i, tmp) {
519 if (tmp == dev) {
520 /* For a VF use its original BDF# not that of the PF
521 * which we used for the IOMMU lookup. Strictly speaking
522 * we could do this for all PCI devices; we only need to
523 * get the BDF# from the scope table for ACPI matches. */
524 if (pdev && pdev->is_virtfn)
525 goto got_pdev;
526
527 if (bus && devfn) {
528 *bus = drhd->devices[i].bus;
529 *devfn = drhd->devices[i].devfn;
530 }
531 goto out;
532 }
533
534 if (is_downstream_to_pci_bridge(dev, tmp))
535 goto got_pdev;
536 }
537
538 if (pdev && drhd->include_all) {
539got_pdev:
540 if (bus && devfn) {
541 *bus = pdev->bus->number;
542 *devfn = pdev->devfn;
543 }
544 goto out;
545 }
546 }
547 iommu = NULL;
548out:
549 if (iommu_is_dummy(iommu, dev))
550 iommu = NULL;
551
552 rcu_read_unlock();
553
554 return iommu;
555}
556
557static void domain_flush_cache(struct dmar_domain *domain,
558 void *addr, int size)
559{
560 if (!domain->iommu_coherency)
561 clflush_cache_range(addr, size);
562}
563
564static void free_context_table(struct intel_iommu *iommu)
565{
566 struct context_entry *context;
567 int i;
568
569 if (!iommu->root_entry)
570 return;
571
572 for (i = 0; i < ROOT_ENTRY_NR; i++) {
573 context = iommu_context_addr(iommu, i, 0, 0);
574 if (context)
575 iommu_free_page(context);
576
577 if (!sm_supported(iommu))
578 continue;
579
580 context = iommu_context_addr(iommu, i, 0x80, 0);
581 if (context)
582 iommu_free_page(context);
583 }
584
585 iommu_free_page(iommu->root_entry);
586 iommu->root_entry = NULL;
587}
588
589#ifdef CONFIG_DMAR_DEBUG
590static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
591 u8 bus, u8 devfn, struct dma_pte *parent, int level)
592{
593 struct dma_pte *pte;
594 int offset;
595
596 while (1) {
597 offset = pfn_level_offset(pfn, level);
598 pte = &parent[offset];
599
600 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
601
602 if (!dma_pte_present(pte)) {
603 pr_info("page table not present at level %d\n", level - 1);
604 break;
605 }
606
607 if (level == 1 || dma_pte_superpage(pte))
608 break;
609
610 parent = phys_to_virt(dma_pte_addr(pte));
611 level--;
612 }
613}
614
615void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
616 unsigned long long addr, u32 pasid)
617{
618 struct pasid_dir_entry *dir, *pde;
619 struct pasid_entry *entries, *pte;
620 struct context_entry *ctx_entry;
621 struct root_entry *rt_entry;
622 int i, dir_index, index, level;
623 u8 devfn = source_id & 0xff;
624 u8 bus = source_id >> 8;
625 struct dma_pte *pgtable;
626
627 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
628
629 /* root entry dump */
630 if (!iommu->root_entry) {
631 pr_info("root table is not present\n");
632 return;
633 }
634 rt_entry = &iommu->root_entry[bus];
635
636 if (sm_supported(iommu))
637 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
638 rt_entry->hi, rt_entry->lo);
639 else
640 pr_info("root entry: 0x%016llx\n", rt_entry->lo);
641
642 /* context entry dump */
643 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
644 if (!ctx_entry) {
645 pr_info("context table is not present\n");
646 return;
647 }
648
649 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
650 ctx_entry->hi, ctx_entry->lo);
651
652 /* legacy mode does not require PASID entries */
653 if (!sm_supported(iommu)) {
654 if (!context_present(ctx_entry)) {
655 pr_info("legacy mode page table is not present\n");
656 return;
657 }
658 level = agaw_to_level(ctx_entry->hi & 7);
659 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
660 goto pgtable_walk;
661 }
662
663 if (!context_present(ctx_entry)) {
664 pr_info("pasid directory table is not present\n");
665 return;
666 }
667
668 /* get the pointer to pasid directory entry */
669 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
670
671 /* For request-without-pasid, get the pasid from context entry */
672 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
673 pasid = IOMMU_NO_PASID;
674
675 dir_index = pasid >> PASID_PDE_SHIFT;
676 pde = &dir[dir_index];
677 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
678
679 /* get the pointer to the pasid table entry */
680 entries = get_pasid_table_from_pde(pde);
681 if (!entries) {
682 pr_info("pasid table is not present\n");
683 return;
684 }
685 index = pasid & PASID_PTE_MASK;
686 pte = &entries[index];
687 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
688 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
689
690 if (!pasid_pte_is_present(pte)) {
691 pr_info("scalable mode page table is not present\n");
692 return;
693 }
694
695 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
696 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
697 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
698 } else {
699 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
700 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
701 }
702
703pgtable_walk:
704 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
705}
706#endif
707
708static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
709 unsigned long pfn, int *target_level,
710 gfp_t gfp)
711{
712 struct dma_pte *parent, *pte;
713 int level = agaw_to_level(domain->agaw);
714 int offset;
715
716 if (!domain_pfn_supported(domain, pfn))
717 /* Address beyond IOMMU's addressing capabilities. */
718 return NULL;
719
720 parent = domain->pgd;
721
722 while (1) {
723 void *tmp_page;
724
725 offset = pfn_level_offset(pfn, level);
726 pte = &parent[offset];
727 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
728 break;
729 if (level == *target_level)
730 break;
731
732 if (!dma_pte_present(pte)) {
733 uint64_t pteval, tmp;
734
735 tmp_page = iommu_alloc_page_node(domain->nid, gfp);
736
737 if (!tmp_page)
738 return NULL;
739
740 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
741 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
742 if (domain->use_first_level)
743 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
744
745 tmp = 0ULL;
746 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
747 /* Someone else set it while we were thinking; use theirs. */
748 iommu_free_page(tmp_page);
749 else
750 domain_flush_cache(domain, pte, sizeof(*pte));
751 }
752 if (level == 1)
753 break;
754
755 parent = phys_to_virt(dma_pte_addr(pte));
756 level--;
757 }
758
759 if (!*target_level)
760 *target_level = level;
761
762 return pte;
763}
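/*
 * Worked example (illustrative): for a 4-level table (agaw 2, level 4) and
 * pfn 0x1234567, pfn_level_offset() picks a 9-bit index at each level:
 *
 *	level 4: (0x1234567 >> 27) & 0x1ff == 0x000
 *	level 3: (0x1234567 >> 18) & 0x1ff == 0x048
 *	level 2: (0x1234567 >>  9) & 0x1ff == 0x1a2
 *	level 1: (0x1234567 >>  0) & 0x1ff == 0x167
 *
 * Passing *target_level == 1 walks all the way down and returns the leaf
 * PTE, allocating any missing intermediate tables on the way.
 */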
764
765/* return address's pte at specific level */
766static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
767 unsigned long pfn,
768 int level, int *large_page)
769{
770 struct dma_pte *parent, *pte;
771 int total = agaw_to_level(domain->agaw);
772 int offset;
773
774 parent = domain->pgd;
775 while (level <= total) {
776 offset = pfn_level_offset(pfn, total);
777 pte = &parent[offset];
778 if (level == total)
779 return pte;
780
781 if (!dma_pte_present(pte)) {
782 *large_page = total;
783 break;
784 }
785
786 if (dma_pte_superpage(pte)) {
787 *large_page = total;
788 return pte;
789 }
790
791 parent = phys_to_virt(dma_pte_addr(pte));
792 total--;
793 }
794 return NULL;
795}
796
797/* clear last level pte, a tlb flush should follow */
798static void dma_pte_clear_range(struct dmar_domain *domain,
799 unsigned long start_pfn,
800 unsigned long last_pfn)
801{
802 unsigned int large_page;
803 struct dma_pte *first_pte, *pte;
804
805 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
806 WARN_ON(start_pfn > last_pfn))
807 return;
808
809 /* we don't need lock here; nobody else touches the iova range */
810 do {
811 large_page = 1;
812 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
813 if (!pte) {
814 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
815 continue;
816 }
817 do {
818 dma_clear_pte(pte);
819 start_pfn += lvl_to_nr_pages(large_page);
820 pte++;
821 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
822
823 domain_flush_cache(domain, first_pte,
824 (void *)pte - (void *)first_pte);
825
826 } while (start_pfn && start_pfn <= last_pfn);
827}
828
829static void dma_pte_free_level(struct dmar_domain *domain, int level,
830 int retain_level, struct dma_pte *pte,
831 unsigned long pfn, unsigned long start_pfn,
832 unsigned long last_pfn)
833{
834 pfn = max(start_pfn, pfn);
835 pte = &pte[pfn_level_offset(pfn, level)];
836
837 do {
838 unsigned long level_pfn;
839 struct dma_pte *level_pte;
840
841 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
842 goto next;
843
844 level_pfn = pfn & level_mask(level);
845 level_pte = phys_to_virt(dma_pte_addr(pte));
846
847 if (level > 2) {
848 dma_pte_free_level(domain, level - 1, retain_level,
849 level_pte, level_pfn, start_pfn,
850 last_pfn);
851 }
852
853 /*
854 * Free the page table if we're below the level we want to
855 * retain and the range covers the entire table.
856 */
857 if (level < retain_level && !(start_pfn > level_pfn ||
858 last_pfn < level_pfn + level_size(level) - 1)) {
859 dma_clear_pte(pte);
860 domain_flush_cache(domain, pte, sizeof(*pte));
861 iommu_free_page(level_pte);
862 }
863next:
864 pfn += level_size(level);
865 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
866}
867
868/*
869 * clear last level (leaf) ptes and free page table pages below the
870 * level we wish to keep intact.
871 */
872static void dma_pte_free_pagetable(struct dmar_domain *domain,
873 unsigned long start_pfn,
874 unsigned long last_pfn,
875 int retain_level)
876{
877 dma_pte_clear_range(domain, start_pfn, last_pfn);
878
879 /* We don't need lock here; nobody else touches the iova range */
880 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
881 domain->pgd, 0, start_pfn, last_pfn);
882
883 /* free pgd */
884 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
885 iommu_free_page(domain->pgd);
886 domain->pgd = NULL;
887 }
888}
889
890/* When a page at a given level is being unlinked from its parent, we don't
891 need to *modify* it at all. All we need to do is make a list of all the
892 pages which can be freed just as soon as we've flushed the IOTLB and we
893 know the hardware page-walk will no longer touch them.
894 The 'pte' argument is the *parent* PTE, pointing to the page that is to
895 be freed. */
896static void dma_pte_list_pagetables(struct dmar_domain *domain,
897 int level, struct dma_pte *pte,
898 struct list_head *freelist)
899{
900 struct page *pg;
901
902 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
903 list_add_tail(&pg->lru, freelist);
904
905 if (level == 1)
906 return;
907
908 pte = page_address(pg);
909 do {
910 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
911 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
912 pte++;
913 } while (!first_pte_in_page(pte));
914}
915
916static void dma_pte_clear_level(struct dmar_domain *domain, int level,
917 struct dma_pte *pte, unsigned long pfn,
918 unsigned long start_pfn, unsigned long last_pfn,
919 struct list_head *freelist)
920{
921 struct dma_pte *first_pte = NULL, *last_pte = NULL;
922
923 pfn = max(start_pfn, pfn);
924 pte = &pte[pfn_level_offset(pfn, level)];
925
926 do {
927 unsigned long level_pfn = pfn & level_mask(level);
928
929 if (!dma_pte_present(pte))
930 goto next;
931
932 /* If range covers entire pagetable, free it */
933 if (start_pfn <= level_pfn &&
934 last_pfn >= level_pfn + level_size(level) - 1) {
935 /* These subordinate page tables are going away entirely. Don't
936 bother to clear them; we're just going to *free* them. */
937 if (level > 1 && !dma_pte_superpage(pte))
938 dma_pte_list_pagetables(domain, level - 1, pte, freelist);
939
940 dma_clear_pte(pte);
941 if (!first_pte)
942 first_pte = pte;
943 last_pte = pte;
944 } else if (level > 1) {
945 /* Recurse down into a level that isn't *entirely* obsolete */
946 dma_pte_clear_level(domain, level - 1,
947 phys_to_virt(dma_pte_addr(pte)),
948 level_pfn, start_pfn, last_pfn,
949 freelist);
950 }
951next:
952 pfn = level_pfn + level_size(level);
953 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
954
955 if (first_pte)
956 domain_flush_cache(domain, first_pte,
957 (void *)++last_pte - (void *)first_pte);
958}
959
960/* We can't just free the pages because the IOMMU may still be walking
961 the page tables, and may have cached the intermediate levels. The
962 pages can only be freed after the IOTLB flush has been done. */
963static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
964 unsigned long last_pfn, struct list_head *freelist)
965{
966 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
967 WARN_ON(start_pfn > last_pfn))
968 return;
969
970 /* we don't need lock here; nobody else touches the iova range */
971 dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
972 domain->pgd, 0, start_pfn, last_pfn, freelist);
973
974 /* free pgd */
975 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
976 struct page *pgd_page = virt_to_page(domain->pgd);
977 list_add_tail(&pgd_page->lru, freelist);
978 domain->pgd = NULL;
979 }
980}
981
982/* iommu handling */
983static int iommu_alloc_root_entry(struct intel_iommu *iommu)
984{
985 struct root_entry *root;
986
987 root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
988 if (!root) {
989 pr_err("Allocating root entry for %s failed\n",
990 iommu->name);
991 return -ENOMEM;
992 }
993
994 __iommu_flush_cache(iommu, root, ROOT_SIZE);
995 iommu->root_entry = root;
996
997 return 0;
998}
999
1000static void iommu_set_root_entry(struct intel_iommu *iommu)
1001{
1002 u64 addr;
1003 u32 sts;
1004 unsigned long flag;
1005
1006 addr = virt_to_phys(iommu->root_entry);
1007 if (sm_supported(iommu))
1008 addr |= DMA_RTADDR_SMT;
1009
1010 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1011 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1012
1013 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1014
1015 /* Make sure hardware complete it */
1016 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1017 readl, (sts & DMA_GSTS_RTPS), sts);
1018
1019 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1020
1021 /*
1022 * Hardware invalidates all DMA remapping hardware translation
1023 * caches as part of SRTP flow.
1024 */
1025 if (cap_esrtps(iommu->cap))
1026 return;
1027
1028 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1029 if (sm_supported(iommu))
1030 qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1031 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1032}
1033
1034void iommu_flush_write_buffer(struct intel_iommu *iommu)
1035{
1036 u32 val;
1037 unsigned long flag;
1038
1039 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1040 return;
1041
1042 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1043 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1044
1045 /* Make sure hardware complete it */
1046 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1047 readl, (!(val & DMA_GSTS_WBFS)), val);
1048
1049 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1050}
1051
1052/* return value determines if we need a write buffer flush */
1053static void __iommu_flush_context(struct intel_iommu *iommu,
1054 u16 did, u16 source_id, u8 function_mask,
1055 u64 type)
1056{
1057 u64 val = 0;
1058 unsigned long flag;
1059
1060 switch (type) {
1061 case DMA_CCMD_GLOBAL_INVL:
1062 val = DMA_CCMD_GLOBAL_INVL;
1063 break;
1064 case DMA_CCMD_DOMAIN_INVL:
1065 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1066 break;
1067 case DMA_CCMD_DEVICE_INVL:
1068 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1069 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1070 break;
1071 default:
1072 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1073 iommu->name, type);
1074 return;
1075 }
1076 val |= DMA_CCMD_ICC;
1077
1078 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1079 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1080
1081 /* Make sure hardware complete it */
1082 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1083 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1084
1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086}
1087
1088void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1089 unsigned int size_order, u64 type)
1090{
1091 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1092 u64 val = 0, val_iva = 0;
1093 unsigned long flag;
1094
1095 switch (type) {
1096 case DMA_TLB_GLOBAL_FLUSH:
1097 /* global flush doesn't need to set IVA_REG */
1098 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1099 break;
1100 case DMA_TLB_DSI_FLUSH:
1101 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1102 break;
1103 case DMA_TLB_PSI_FLUSH:
1104 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1105 /* IH bit is passed in as part of address */
1106 val_iva = size_order | addr;
1107 break;
1108 default:
1109 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1110 iommu->name, type);
1111 return;
1112 }
1113
1114 if (cap_write_drain(iommu->cap))
1115 val |= DMA_TLB_WRITE_DRAIN;
1116
1117 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1118 /* Note: Only uses first TLB reg currently */
1119 if (val_iva)
1120 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1121 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1122
1123 /* Make sure hardware complete it */
1124 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1125 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1126
1127 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1128
1129 /* check IOTLB invalidation granularity */
1130 if (DMA_TLB_IAIG(val) == 0)
1131 pr_err("Flush IOTLB failed\n");
1132 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1133 pr_debug("TLB flush request %Lx, actual %Lx\n",
1134 (unsigned long long)DMA_TLB_IIRG(type),
1135 (unsigned long long)DMA_TLB_IAIG(val));
1136}
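/*
 * Example (illustrative): a page-selective invalidation of 8 contiguous 4KiB
 * pages starting at IOVA 0x200000 would be issued with size_order == 3
 * (2^3 pages) and addr == 0x200000, so val_iva carries the aligned base
 * address with the order encoded in its low bits.
 */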
1137
1138static struct device_domain_info *
1139domain_lookup_dev_info(struct dmar_domain *domain,
1140 struct intel_iommu *iommu, u8 bus, u8 devfn)
1141{
1142 struct device_domain_info *info;
1143 unsigned long flags;
1144
1145 spin_lock_irqsave(&domain->lock, flags);
1146 list_for_each_entry(info, &domain->devices, link) {
1147 if (info->iommu == iommu && info->bus == bus &&
1148 info->devfn == devfn) {
1149 spin_unlock_irqrestore(&domain->lock, flags);
1150 return info;
1151 }
1152 }
1153 spin_unlock_irqrestore(&domain->lock, flags);
1154
1155 return NULL;
1156}
1157
1158/*
1159 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1160 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1161 * check because it applies only to the built-in QAT devices and it doesn't
1162 * grant additional privileges.
1163 */
1164#define BUGGY_QAT_DEVID_MASK 0x4940
1165static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1166{
1167 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1168 return false;
1169
1170 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1171 return false;
1172
1173 return true;
1174}
1175
1176static void iommu_enable_pci_caps(struct device_domain_info *info)
1177{
1178 struct pci_dev *pdev;
1179
1180 if (!dev_is_pci(info->dev))
1181 return;
1182
1183 pdev = to_pci_dev(info->dev);
1184 if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1185 !pci_enable_ats(pdev, VTD_PAGE_SHIFT))
1186 info->ats_enabled = 1;
1187}
1188
1189static void iommu_disable_pci_caps(struct device_domain_info *info)
1190{
1191 struct pci_dev *pdev;
1192
1193 if (!dev_is_pci(info->dev))
1194 return;
1195
1196 pdev = to_pci_dev(info->dev);
1197
1198 if (info->ats_enabled) {
1199 pci_disable_ats(pdev);
1200 info->ats_enabled = 0;
1201 }
1202}
1203
1204static void intel_flush_iotlb_all(struct iommu_domain *domain)
1205{
1206 cache_tag_flush_all(to_dmar_domain(domain));
1207}
1208
1209static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1210{
1211 u32 pmen;
1212 unsigned long flags;
1213
1214 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1215 return;
1216
1217 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1218 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1219 pmen &= ~DMA_PMEN_EPM;
1220 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1221
1222 /* wait for the protected region status bit to clear */
1223 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1224 readl, !(pmen & DMA_PMEN_PRS), pmen);
1225
1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227}
1228
1229static void iommu_enable_translation(struct intel_iommu *iommu)
1230{
1231 u32 sts;
1232 unsigned long flags;
1233
1234 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1235 iommu->gcmd |= DMA_GCMD_TE;
1236 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1237
1238 /* Make sure hardware complete it */
1239 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1240 readl, (sts & DMA_GSTS_TES), sts);
1241
1242 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1243}
1244
1245static void iommu_disable_translation(struct intel_iommu *iommu)
1246{
1247 u32 sts;
1248 unsigned long flag;
1249
1250 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1251 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1252 return;
1253
1254 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1255 iommu->gcmd &= ~DMA_GCMD_TE;
1256 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1257
1258 /* Make sure hardware complete it */
1259 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260 readl, (!(sts & DMA_GSTS_TES)), sts);
1261
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263}
1264
1265static int iommu_init_domains(struct intel_iommu *iommu)
1266{
1267 u32 ndomains;
1268
1269 ndomains = cap_ndoms(iommu->cap);
1270 pr_debug("%s: Number of Domains supported <%d>\n",
1271 iommu->name, ndomains);
1272
1273 spin_lock_init(&iommu->lock);
1274
1275 iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1276 if (!iommu->domain_ids)
1277 return -ENOMEM;
1278
1279 /*
1280 * If Caching mode is set, then invalid translations are tagged
1281 * with domain-id 0, hence we need to pre-allocate it. We also
1282 * use domain-id 0 as a marker for non-allocated domain-id, so
1283 * make sure it is not used for a real domain.
1284 */
1285 set_bit(0, iommu->domain_ids);
1286
1287 /*
1288 * VT-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1289 * entry for first-level or pass-through translation modes should
1290 * be programmed with a domain id different from those used for
1291 * second-level or nested translation. We reserve a domain id for
1292 * this purpose. This domain id is also used for identity domain
1293 * in legacy mode.
1294 */
1295 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1296
1297 return 0;
1298}
1299
1300static void disable_dmar_iommu(struct intel_iommu *iommu)
1301{
1302 if (!iommu->domain_ids)
1303 return;
1304
1305 /*
1306 * All iommu domains must have been detached from the devices,
1307 * hence there should be no domain IDs in use.
1308 */
1309 if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1310 > NUM_RESERVED_DID))
1311 return;
1312
1313 if (iommu->gcmd & DMA_GCMD_TE)
1314 iommu_disable_translation(iommu);
1315}
1316
1317static void free_dmar_iommu(struct intel_iommu *iommu)
1318{
1319 if (iommu->domain_ids) {
1320 bitmap_free(iommu->domain_ids);
1321 iommu->domain_ids = NULL;
1322 }
1323
1324 if (iommu->copied_tables) {
1325 bitmap_free(iommu->copied_tables);
1326 iommu->copied_tables = NULL;
1327 }
1328
1329 /* free context mapping */
1330 free_context_table(iommu);
1331
1332 if (ecap_prs(iommu->ecap))
1333 intel_iommu_finish_prq(iommu);
1334}
1335
1336/*
1337 * Check and return whether first level is used by default for
1338 * DMA translation.
1339 */
1340static bool first_level_by_default(struct intel_iommu *iommu)
1341{
1342 /* Only SL is available in legacy mode */
1343 if (!sm_supported(iommu))
1344 return false;
1345
1346 /* Only one level (either FL or SL) is available, just use it */
1347 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1348 return ecap_flts(iommu->ecap);
1349
1350 return true;
1351}
1352
1353int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1354{
1355 struct iommu_domain_info *info, *curr;
1356 unsigned long ndomains;
1357 int num, ret = -ENOSPC;
1358
1359 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1360 return 0;
1361
1362 info = kzalloc(sizeof(*info), GFP_KERNEL);
1363 if (!info)
1364 return -ENOMEM;
1365
1366 spin_lock(&iommu->lock);
1367 curr = xa_load(&domain->iommu_array, iommu->seq_id);
1368 if (curr) {
1369 curr->refcnt++;
1370 spin_unlock(&iommu->lock);
1371 kfree(info);
1372 return 0;
1373 }
1374
1375 ndomains = cap_ndoms(iommu->cap);
1376 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1377 if (num >= ndomains) {
1378 pr_err("%s: No free domain ids\n", iommu->name);
1379 goto err_unlock;
1380 }
1381
1382 set_bit(num, iommu->domain_ids);
1383 info->refcnt = 1;
1384 info->did = num;
1385 info->iommu = iommu;
1386 curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1387 NULL, info, GFP_ATOMIC);
1388 if (curr) {
1389 ret = xa_err(curr) ? : -EBUSY;
1390 goto err_clear;
1391 }
1392
1393 spin_unlock(&iommu->lock);
1394 return 0;
1395
1396err_clear:
1397 clear_bit(info->did, iommu->domain_ids);
1398err_unlock:
1399 spin_unlock(&iommu->lock);
1400 kfree(info);
1401 return ret;
1402}
1403
1404void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1405{
1406 struct iommu_domain_info *info;
1407
1408 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1409 return;
1410
1411 spin_lock(&iommu->lock);
1412 info = xa_load(&domain->iommu_array, iommu->seq_id);
1413 if (--info->refcnt == 0) {
1414 clear_bit(info->did, iommu->domain_ids);
1415 xa_erase(&domain->iommu_array, iommu->seq_id);
1416 domain->nid = NUMA_NO_NODE;
1417 kfree(info);
1418 }
1419 spin_unlock(&iommu->lock);
1420}
1421
1422static void domain_exit(struct dmar_domain *domain)
1423{
1424 if (domain->pgd) {
1425 LIST_HEAD(freelist);
1426
1427 domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1428 iommu_put_pages_list(&freelist);
1429 }
1430
1431 if (WARN_ON(!list_empty(&domain->devices)))
1432 return;
1433
1434 kfree(domain->qi_batch);
1435 kfree(domain);
1436}
1437
1438/*
1439 * For kdump cases, old valid entries may be cached due to the
1440 * in-flight DMA and copied pgtable, but there is no unmapping
1441 * behaviour for them, thus we need an explicit cache flush for
1442 * the newly-mapped device. For kdump, at this point, the device
1443 * is supposed to have finished reset at its driver probe stage, so no
1444 * in-flight DMA will exist, and we don't need to worry about it
1445 * hereafter.
1446 */
1447static void copied_context_tear_down(struct intel_iommu *iommu,
1448 struct context_entry *context,
1449 u8 bus, u8 devfn)
1450{
1451 u16 did_old;
1452
1453 if (!context_copied(iommu, bus, devfn))
1454 return;
1455
1456 assert_spin_locked(&iommu->lock);
1457
1458 did_old = context_domain_id(context);
1459 context_clear_entry(context);
1460
1461 if (did_old < cap_ndoms(iommu->cap)) {
1462 iommu->flush.flush_context(iommu, did_old,
1463 PCI_DEVID(bus, devfn),
1464 DMA_CCMD_MASK_NOBIT,
1465 DMA_CCMD_DEVICE_INVL);
1466 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1467 DMA_TLB_DSI_FLUSH);
1468 }
1469
1470 clear_context_copied(iommu, bus, devfn);
1471}
1472
1473/*
1474 * It's a non-present to present mapping. If hardware doesn't cache
1475 * non-present entries we only need to flush the write-buffer. If it
1476 * _does_ cache non-present entries, then it does so in the special
1477 * domain #0, which we have to flush:
1478 */
1479static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1480 u8 bus, u8 devfn)
1481{
1482 if (cap_caching_mode(iommu->cap)) {
1483 iommu->flush.flush_context(iommu, 0,
1484 PCI_DEVID(bus, devfn),
1485 DMA_CCMD_MASK_NOBIT,
1486 DMA_CCMD_DEVICE_INVL);
1487 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1488 } else {
1489 iommu_flush_write_buffer(iommu);
1490 }
1491}
1492
1493static int domain_context_mapping_one(struct dmar_domain *domain,
1494 struct intel_iommu *iommu,
1495 u8 bus, u8 devfn)
1496{
1497 struct device_domain_info *info =
1498 domain_lookup_dev_info(domain, iommu, bus, devfn);
1499 u16 did = domain_id_iommu(domain, iommu);
1500 int translation = CONTEXT_TT_MULTI_LEVEL;
1501 struct dma_pte *pgd = domain->pgd;
1502 struct context_entry *context;
1503 int ret;
1504
1505 pr_debug("Set context mapping for %02x:%02x.%d\n",
1506 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1507
1508 spin_lock(&iommu->lock);
1509 ret = -ENOMEM;
1510 context = iommu_context_addr(iommu, bus, devfn, 1);
1511 if (!context)
1512 goto out_unlock;
1513
1514 ret = 0;
1515 if (context_present(context) && !context_copied(iommu, bus, devfn))
1516 goto out_unlock;
1517
1518 copied_context_tear_down(iommu, context, bus, devfn);
1519 context_clear_entry(context);
1520 context_set_domain_id(context, did);
1521
1522 if (info && info->ats_supported)
1523 translation = CONTEXT_TT_DEV_IOTLB;
1524 else
1525 translation = CONTEXT_TT_MULTI_LEVEL;
1526
1527 context_set_address_root(context, virt_to_phys(pgd));
1528 context_set_address_width(context, domain->agaw);
1529 context_set_translation_type(context, translation);
1530 context_set_fault_enable(context);
1531 context_set_present(context);
1532 if (!ecap_coherent(iommu->ecap))
1533 clflush_cache_range(context, sizeof(*context));
1534 context_present_cache_flush(iommu, did, bus, devfn);
1535 ret = 0;
1536
1537out_unlock:
1538 spin_unlock(&iommu->lock);
1539
1540 return ret;
1541}
1542
1543static int domain_context_mapping_cb(struct pci_dev *pdev,
1544 u16 alias, void *opaque)
1545{
1546 struct device_domain_info *info = dev_iommu_priv_get(&pdev->dev);
1547 struct intel_iommu *iommu = info->iommu;
1548 struct dmar_domain *domain = opaque;
1549
1550 return domain_context_mapping_one(domain, iommu,
1551 PCI_BUS_NUM(alias), alias & 0xff);
1552}
1553
1554static int
1555domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1556{
1557 struct device_domain_info *info = dev_iommu_priv_get(dev);
1558 struct intel_iommu *iommu = info->iommu;
1559 u8 bus = info->bus, devfn = info->devfn;
1560
1561 if (!dev_is_pci(dev))
1562 return domain_context_mapping_one(domain, iommu, bus, devfn);
1563
1564 return pci_for_each_dma_alias(to_pci_dev(dev),
1565 domain_context_mapping_cb, domain);
1566}
1567
1568/* Return largest possible superpage level for a given mapping */
1569static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1570 unsigned long phy_pfn, unsigned long pages)
1571{
1572 int support, level = 1;
1573 unsigned long pfnmerge;
1574
1575 support = domain->iommu_superpage;
1576
1577 /* To use a large page, the virtual *and* physical addresses
1578 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1579 of them will mean we have to use smaller pages. So just
1580 merge them and check both at once. */
1581 pfnmerge = iov_pfn | phy_pfn;
1582
1583 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1584 pages >>= VTD_STRIDE_SHIFT;
1585 if (!pages)
1586 break;
1587 pfnmerge >>= VTD_STRIDE_SHIFT;
1588 level++;
1589 support--;
1590 }
1591 return level;
1592}
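/*
 * Worked example (illustrative): with iommu_superpage == 2, mapping 0x400
 * (1024) pages at iov_pfn 0x200 and phy_pfn 0x800 gives pfnmerge == 0xa00,
 * whose low 9 bits are zero, so one step to 2MiB is possible and the loop
 * returns level 2. A further step to 1GiB would additionally require 2^18
 * pages and 1GiB-aligned pfns.
 */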
1593
1594/*
1595 * Ensure that old small page tables are removed to make room for superpage(s).
1596 * We're going to add new large pages, so make sure we don't remove their parent
1597 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1598 */
1599static void switch_to_super_page(struct dmar_domain *domain,
1600 unsigned long start_pfn,
1601 unsigned long end_pfn, int level)
1602{
1603 unsigned long lvl_pages = lvl_to_nr_pages(level);
1604 struct dma_pte *pte = NULL;
1605
1606 while (start_pfn <= end_pfn) {
1607 if (!pte)
1608 pte = pfn_to_dma_pte(domain, start_pfn, &level,
1609 GFP_ATOMIC);
1610
1611 if (dma_pte_present(pte)) {
1612 dma_pte_free_pagetable(domain, start_pfn,
1613 start_pfn + lvl_pages - 1,
1614 level + 1);
1615
1616 cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
1617 end_pfn << VTD_PAGE_SHIFT, 0);
1618 }
1619
1620 pte++;
1621 start_pfn += lvl_pages;
1622 if (first_pte_in_page(pte))
1623 pte = NULL;
1624 }
1625}
1626
1627static int
1628__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1629 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1630 gfp_t gfp)
1631{
1632 struct dma_pte *first_pte = NULL, *pte = NULL;
1633 unsigned int largepage_lvl = 0;
1634 unsigned long lvl_pages = 0;
1635 phys_addr_t pteval;
1636 u64 attr;
1637
1638 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1639 return -EINVAL;
1640
1641 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1642 return -EINVAL;
1643
1644 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1645 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1646 return -EINVAL;
1647 }
1648
1649 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1650 attr |= DMA_FL_PTE_PRESENT;
1651 if (domain->use_first_level) {
1652 attr |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1653 if (prot & DMA_PTE_WRITE)
1654 attr |= DMA_FL_PTE_DIRTY;
1655 }
1656
1657 domain->has_mappings = true;
1658
1659 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1660
1661 while (nr_pages > 0) {
1662 uint64_t tmp;
1663
1664 if (!pte) {
1665 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1666 phys_pfn, nr_pages);
1667
1668 pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
1669 gfp);
1670 if (!pte)
1671 return -ENOMEM;
1672 first_pte = pte;
1673
1674 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1675
1676 /* It is a large page */
1677 if (largepage_lvl > 1) {
1678 unsigned long end_pfn;
1679 unsigned long pages_to_remove;
1680
1681 pteval |= DMA_PTE_LARGE_PAGE;
1682 pages_to_remove = min_t(unsigned long, nr_pages,
1683 nr_pte_to_next_page(pte) * lvl_pages);
1684 end_pfn = iov_pfn + pages_to_remove - 1;
1685 switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
1686 } else {
1687 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1688 }
1689
1690 }
1691 /* We don't need lock here, nobody else
1692 * touches the iova range
1693 */
1694 tmp = 0ULL;
1695 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1696 static int dumps = 5;
1697 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1698 iov_pfn, tmp, (unsigned long long)pteval);
1699 if (dumps) {
1700 dumps--;
1701 debug_dma_dump_mappings(NULL);
1702 }
1703 WARN_ON(1);
1704 }
1705
1706 nr_pages -= lvl_pages;
1707 iov_pfn += lvl_pages;
1708 phys_pfn += lvl_pages;
1709 pteval += lvl_pages * VTD_PAGE_SIZE;
1710
1711 /* If the next PTE would be the first in a new page, then we
1712 * need to flush the cache on the entries we've just written.
1713 * And then we'll need to recalculate 'pte', so clear it and
1714 * let it get set again in the if (!pte) block above.
1715 *
1716 * If we're done (!nr_pages) we need to flush the cache too.
1717 *
1718 * Also if we've been setting superpages, we may need to
1719 * recalculate 'pte' and switch back to smaller pages for the
1720 * end of the mapping, if the trailing size is not enough to
1721 * use another superpage (i.e. nr_pages < lvl_pages).
1722 */
1723 pte++;
1724 if (!nr_pages || first_pte_in_page(pte) ||
1725 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1726 domain_flush_cache(domain, first_pte,
1727 (void *)pte - (void *)first_pte);
1728 pte = NULL;
1729 }
1730 }
1731
1732 return 0;
1733}
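/*
 * Usage sketch (illustrative, phys being an example 2MiB-aligned physical
 * base address): mapping 16MiB of contiguous memory at IOVA 0x10000000
 * could look like
 *
 *	__domain_mapping(domain, 0x10000000 >> VTD_PAGE_SHIFT,
 *			 phys >> VTD_PAGE_SHIFT, SZ_16M >> VTD_PAGE_SHIFT,
 *			 DMA_PTE_READ | DMA_PTE_WRITE, GFP_KERNEL);
 *
 * On a domain with 2MiB superpage support this is carved into eight 2MiB
 * PTEs instead of 4096 4KiB PTEs; the caller remains responsible for the
 * subsequent IOTLB flush.
 */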
1734
1735static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1736{
1737 struct intel_iommu *iommu = info->iommu;
1738 struct context_entry *context;
1739 u16 did;
1740
1741 spin_lock(&iommu->lock);
1742 context = iommu_context_addr(iommu, bus, devfn, 0);
1743 if (!context) {
1744 spin_unlock(&iommu->lock);
1745 return;
1746 }
1747
1748 did = context_domain_id(context);
1749 context_clear_entry(context);
1750 __iommu_flush_cache(iommu, context, sizeof(*context));
1751 spin_unlock(&iommu->lock);
1752 intel_context_flush_present(info, context, did, true);
1753}
1754
1755int __domain_setup_first_level(struct intel_iommu *iommu,
1756 struct device *dev, ioasid_t pasid,
1757 u16 did, pgd_t *pgd, int flags,
1758 struct iommu_domain *old)
1759{
1760 if (!old)
1761 return intel_pasid_setup_first_level(iommu, dev, pgd,
1762 pasid, did, flags);
1763 return intel_pasid_replace_first_level(iommu, dev, pgd, pasid, did,
1764 iommu_domain_did(old, iommu),
1765 flags);
1766}
1767
1768static int domain_setup_second_level(struct intel_iommu *iommu,
1769 struct dmar_domain *domain,
1770 struct device *dev, ioasid_t pasid,
1771 struct iommu_domain *old)
1772{
1773 if (!old)
1774 return intel_pasid_setup_second_level(iommu, domain,
1775 dev, pasid);
1776 return intel_pasid_replace_second_level(iommu, domain, dev,
1777 iommu_domain_did(old, iommu),
1778 pasid);
1779}
1780
1781static int domain_setup_passthrough(struct intel_iommu *iommu,
1782 struct device *dev, ioasid_t pasid,
1783 struct iommu_domain *old)
1784{
1785 if (!old)
1786 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1787 return intel_pasid_replace_pass_through(iommu, dev,
1788 iommu_domain_did(old, iommu),
1789 pasid);
1790}
1791
1792static int domain_setup_first_level(struct intel_iommu *iommu,
1793 struct dmar_domain *domain,
1794 struct device *dev,
1795 u32 pasid, struct iommu_domain *old)
1796{
1797 struct dma_pte *pgd = domain->pgd;
1798 int level, flags = 0;
1799
1800 level = agaw_to_level(domain->agaw);
1801 if (level != 4 && level != 5)
1802 return -EINVAL;
1803
1804 if (level == 5)
1805 flags |= PASID_FLAG_FL5LP;
1806
1807 if (domain->force_snooping)
1808 flags |= PASID_FLAG_PAGE_SNOOP;
1809
1810 return __domain_setup_first_level(iommu, dev, pasid,
1811 domain_id_iommu(domain, iommu),
1812 (pgd_t *)pgd, flags, old);
1813}
1814
1815static int dmar_domain_attach_device(struct dmar_domain *domain,
1816 struct device *dev)
1817{
1818 struct device_domain_info *info = dev_iommu_priv_get(dev);
1819 struct intel_iommu *iommu = info->iommu;
1820 unsigned long flags;
1821 int ret;
1822
1823 ret = domain_attach_iommu(domain, iommu);
1824 if (ret)
1825 return ret;
1826
1827 info->domain = domain;
1828 spin_lock_irqsave(&domain->lock, flags);
1829 list_add(&info->link, &domain->devices);
1830 spin_unlock_irqrestore(&domain->lock, flags);
1831
1832 if (dev_is_real_dma_subdevice(dev))
1833 return 0;
1834
1835 if (!sm_supported(iommu))
1836 ret = domain_context_mapping(domain, dev);
1837 else if (domain->use_first_level)
1838 ret = domain_setup_first_level(iommu, domain, dev,
1839 IOMMU_NO_PASID, NULL);
1840 else
1841 ret = domain_setup_second_level(iommu, domain, dev,
1842 IOMMU_NO_PASID, NULL);
1843
1844 if (ret)
1845 goto out_block_translation;
1846
1847 iommu_enable_pci_caps(info);
1848
1849 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1850 if (ret)
1851 goto out_block_translation;
1852
1853 return 0;
1854
1855out_block_translation:
1856 device_block_translation(dev);
1857 return ret;
1858}
1859
1860/**
1861 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1862 * is relaxable (i.e. is allowed to be not enforced under some conditions)
1863 * @dev: device handle
1864 *
1865 * We assume that PCI USB devices with RMRRs have them largely
1866 * for historical reasons and that the RMRR space is not actively used post
1867 * boot. This exclusion may change if vendors begin to abuse it.
1868 *
1869 * The same exception is made for graphics devices, with the requirement that
1870 * any use of the RMRR regions will be torn down before assigning the device
1871 * to a guest.
1872 *
1873 * Return: true if the RMRR is relaxable, false otherwise
1874 */
1875static bool device_rmrr_is_relaxable(struct device *dev)
1876{
1877 struct pci_dev *pdev;
1878
1879 if (!dev_is_pci(dev))
1880 return false;
1881
1882 pdev = to_pci_dev(dev);
1883 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1884 return true;
1885 else
1886 return false;
1887}
1888
1889static int device_def_domain_type(struct device *dev)
1890{
1891 struct device_domain_info *info = dev_iommu_priv_get(dev);
1892 struct intel_iommu *iommu = info->iommu;
1893
1894 /*
1895 * Hardware does not support the passthrough translation mode.
1896 * Always use a dynamic mapping domain.
1897 */
1898 if (!ecap_pass_through(iommu->ecap))
1899 return IOMMU_DOMAIN_DMA;
1900
1901 if (dev_is_pci(dev)) {
1902 struct pci_dev *pdev = to_pci_dev(dev);
1903
1904 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1905 return IOMMU_DOMAIN_IDENTITY;
1906 }
1907
1908 return 0;
1909}
1910
1911static void intel_iommu_init_qi(struct intel_iommu *iommu)
1912{
1913 /*
1914 * Start from a sane iommu hardware state.
1915 * If the queued invalidation is already initialized by us
1916 * (for example, while enabling interrupt-remapping) then
1917 * we already got things rolling from a sane state.
1918 */
1919 if (!iommu->qi) {
1920 /*
1921 * Clear any previous faults.
1922 */
1923 dmar_fault(-1, iommu);
1924 /*
1925 * Disable queued invalidation if supported and already enabled
1926 * before OS handover.
1927 */
1928 dmar_disable_qi(iommu);
1929 }
1930
1931 if (dmar_enable_qi(iommu)) {
1932 /*
1933 * Queued Invalidate not enabled, use Register Based Invalidate
1934 */
1935 iommu->flush.flush_context = __iommu_flush_context;
1936 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1937 pr_info("%s: Using Register based invalidation\n",
1938 iommu->name);
1939 } else {
1940 iommu->flush.flush_context = qi_flush_context;
1941 iommu->flush.flush_iotlb = qi_flush_iotlb;
1942 pr_info("%s: Using Queued invalidation\n", iommu->name);
1943 }
1944}
1945
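/*
 * Copy one bus worth of context entries from the previous kernel's tables.
 *
 * With a legacy root table there is a single 256-entry context table per
 * bus. With a scalable-mode (extended) root table, each root entry holds
 * two pointers: the lower context table (LCTP) covers devfn 0x00-0x7f and
 * the upper context table (UCTP) covers devfn 0x80-0xff, and each context
 * entry is twice as large (256 bits). Hence the index math below:
 * tbl_idx = bus * 2 and idx = (devfn * 2) % 256, so e.g. devfn 0x41 maps
 * to entry 0x82 of the lower table, while devfn 0x80 wraps around to
 * entry 0 of the upper table.
 */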
1946static int copy_context_table(struct intel_iommu *iommu,
1947 struct root_entry *old_re,
1948 struct context_entry **tbl,
1949 int bus, bool ext)
1950{
1951 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1952 struct context_entry *new_ce = NULL, ce;
1953 struct context_entry *old_ce = NULL;
1954 struct root_entry re;
1955 phys_addr_t old_ce_phys;
1956
1957 tbl_idx = ext ? bus * 2 : bus;
1958 memcpy(&re, old_re, sizeof(re));
1959
1960 for (devfn = 0; devfn < 256; devfn++) {
1961 /* First calculate the correct index */
1962 idx = (ext ? devfn * 2 : devfn) % 256;
1963
1964 if (idx == 0) {
1965 /* First save what we may have and clean up */
1966 if (new_ce) {
1967 tbl[tbl_idx] = new_ce;
1968 __iommu_flush_cache(iommu, new_ce,
1969 VTD_PAGE_SIZE);
1970 pos = 1;
1971 }
1972
1973 if (old_ce)
1974 memunmap(old_ce);
1975
1976 ret = 0;
1977 if (devfn < 0x80)
1978 old_ce_phys = root_entry_lctp(&re);
1979 else
1980 old_ce_phys = root_entry_uctp(&re);
1981
1982 if (!old_ce_phys) {
1983 if (ext && devfn == 0) {
1984 /* No LCTP, try UCTP */
1985 devfn = 0x7f;
1986 continue;
1987 } else {
1988 goto out;
1989 }
1990 }
1991
1992 ret = -ENOMEM;
1993 old_ce = memremap(old_ce_phys, PAGE_SIZE,
1994 MEMREMAP_WB);
1995 if (!old_ce)
1996 goto out;
1997
1998 new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
1999 if (!new_ce)
2000 goto out_unmap;
2001
2002 ret = 0;
2003 }
2004
2005 /* Now copy the context entry */
2006 memcpy(&ce, old_ce + idx, sizeof(ce));
2007
2008 if (!context_present(&ce))
2009 continue;
2010
2011 did = context_domain_id(&ce);
2012 if (did >= 0 && did < cap_ndoms(iommu->cap))
2013 set_bit(did, iommu->domain_ids);
2014
2015 set_context_copied(iommu, bus, devfn);
2016 new_ce[idx] = ce;
2017 }
2018
2019 tbl[tbl_idx + pos] = new_ce;
2020
2021 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2022
2023out_unmap:
2024 memunmap(old_ce);
2025
2026out:
2027 return ret;
2028}
2029
2030static int copy_translation_tables(struct intel_iommu *iommu)
2031{
2032 struct context_entry **ctxt_tbls;
2033 struct root_entry *old_rt;
2034 phys_addr_t old_rt_phys;
2035 int ctxt_table_entries;
2036 u64 rtaddr_reg;
2037 int bus, ret;
2038 bool new_ext, ext;
2039
2040 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2041 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2042 new_ext = !!sm_supported(iommu);
2043
2044 /*
2045 * The RTT bit can only be changed when translation is disabled,
 2046	 * but disabling translation would open a window for data
2047 * corruption. So bail out and don't copy anything if we would
2048 * have to change the bit.
2049 */
2050 if (new_ext != ext)
2051 return -EINVAL;
2052
2053 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2054 if (!iommu->copied_tables)
2055 return -ENOMEM;
2056
2057 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2058 if (!old_rt_phys)
2059 return -EINVAL;
2060
2061 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2062 if (!old_rt)
2063 return -ENOMEM;
2064
2065 /* This is too big for the stack - allocate it from slab */
2066 ctxt_table_entries = ext ? 512 : 256;
2067 ret = -ENOMEM;
2068 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2069 if (!ctxt_tbls)
2070 goto out_unmap;
2071
2072 for (bus = 0; bus < 256; bus++) {
2073 ret = copy_context_table(iommu, &old_rt[bus],
2074 ctxt_tbls, bus, ext);
2075 if (ret) {
2076 pr_err("%s: Failed to copy context table for bus %d\n",
2077 iommu->name, bus);
2078 continue;
2079 }
2080 }
2081
2082 spin_lock(&iommu->lock);
2083
2084 /* Context tables are copied, now write them to the root_entry table */
2085 for (bus = 0; bus < 256; bus++) {
2086 int idx = ext ? bus * 2 : bus;
2087 u64 val;
2088
2089 if (ctxt_tbls[idx]) {
2090 val = virt_to_phys(ctxt_tbls[idx]) | 1;
2091 iommu->root_entry[bus].lo = val;
2092 }
2093
2094 if (!ext || !ctxt_tbls[idx + 1])
2095 continue;
2096
2097 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2098 iommu->root_entry[bus].hi = val;
2099 }
2100
2101 spin_unlock(&iommu->lock);
2102
2103 kfree(ctxt_tbls);
2104
2105 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2106
2107 ret = 0;
2108
2109out_unmap:
2110 memunmap(old_rt);
2111
2112 return ret;
2113}
2114
2115static int __init init_dmars(void)
2116{
2117 struct dmar_drhd_unit *drhd;
2118 struct intel_iommu *iommu;
2119 int ret;
2120
2121 ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2122 if (ret)
2123 goto free_iommu;
2124
2125 for_each_iommu(iommu, drhd) {
2126 if (drhd->ignored) {
2127 iommu_disable_translation(iommu);
2128 continue;
2129 }
2130
2131 /*
 2132		 * Find the max PASID size of all IOMMUs in the system.
 2133		 * We need to ensure the system PASID table is no bigger
 2134		 * than the smallest supported size.
2135 */
2136 if (pasid_supported(iommu)) {
2137 u32 temp = 2 << ecap_pss(iommu->ecap);
2138
2139 intel_pasid_max_id = min_t(u32, temp,
2140 intel_pasid_max_id);
2141 }
2142
2143 intel_iommu_init_qi(iommu);
2144
2145 ret = iommu_init_domains(iommu);
2146 if (ret)
2147 goto free_iommu;
2148
2149 init_translation_status(iommu);
2150
2151 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2152 iommu_disable_translation(iommu);
2153 clear_translation_pre_enabled(iommu);
2154 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2155 iommu->name);
2156 }
2157
2158 /*
2159 * TBD:
2160 * we could share the same root & context tables
 2161		 * among all IOMMUs. Need to split it later.
2162 */
2163 ret = iommu_alloc_root_entry(iommu);
2164 if (ret)
2165 goto free_iommu;
2166
2167 if (translation_pre_enabled(iommu)) {
2168 pr_info("Translation already enabled - trying to copy translation structures\n");
2169
2170 ret = copy_translation_tables(iommu);
2171 if (ret) {
2172 /*
2173 * We found the IOMMU with translation
2174 * enabled - but failed to copy over the
2175 * old root-entry table. Try to proceed
2176 * by disabling translation now and
2177 * allocating a clean root-entry table.
2178 * This might cause DMAR faults, but
2179 * probably the dump will still succeed.
2180 */
2181 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2182 iommu->name);
2183 iommu_disable_translation(iommu);
2184 clear_translation_pre_enabled(iommu);
2185 } else {
2186 pr_info("Copied translation tables from previous kernel for %s\n",
2187 iommu->name);
2188 }
2189 }
2190
2191 intel_svm_check(iommu);
2192 }
2193
2194 /*
2195 * Now that qi is enabled on all iommus, set the root entry and flush
 2196	 * caches. This is required on some Intel X58 chipsets; otherwise the
2197 * flush_context function will loop forever and the boot hangs.
2198 */
2199 for_each_active_iommu(iommu, drhd) {
2200 iommu_flush_write_buffer(iommu);
2201 iommu_set_root_entry(iommu);
2202 }
2203
2204 check_tylersburg_isoch();
2205
2206 /*
2207 * for each drhd
2208 * enable fault log
2209 * global invalidate context cache
2210 * global invalidate iotlb
2211 * enable translation
2212 */
2213 for_each_iommu(iommu, drhd) {
2214 if (drhd->ignored) {
2215 /*
2216 * we always have to disable PMRs or DMA may fail on
2217 * this device
2218 */
2219 if (force_on)
2220 iommu_disable_protect_mem_regions(iommu);
2221 continue;
2222 }
2223
2224 iommu_flush_write_buffer(iommu);
2225
2226 if (ecap_prs(iommu->ecap)) {
2227 /*
 2228			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
 2229			 * could cause a lock race condition, so drop the lock here.
2230 */
2231 up_write(&dmar_global_lock);
2232 ret = intel_iommu_enable_prq(iommu);
2233 down_write(&dmar_global_lock);
2234 if (ret)
2235 goto free_iommu;
2236 }
2237
2238 ret = dmar_set_interrupt(iommu);
2239 if (ret)
2240 goto free_iommu;
2241 }
2242
2243 return 0;
2244
2245free_iommu:
2246 for_each_active_iommu(iommu, drhd) {
2247 disable_dmar_iommu(iommu);
2248 free_dmar_iommu(iommu);
2249 }
2250
2251 return ret;
2252}
2253
2254static void __init init_no_remapping_devices(void)
2255{
2256 struct dmar_drhd_unit *drhd;
2257 struct device *dev;
2258 int i;
2259
2260 for_each_drhd_unit(drhd) {
2261 if (!drhd->include_all) {
2262 for_each_active_dev_scope(drhd->devices,
2263 drhd->devices_cnt, i, dev)
2264 break;
2265 /* ignore DMAR unit if no devices exist */
2266 if (i == drhd->devices_cnt)
2267 drhd->ignored = 1;
2268 }
2269 }
2270
2271 for_each_active_drhd_unit(drhd) {
2272 if (drhd->include_all)
2273 continue;
2274
2275 for_each_active_dev_scope(drhd->devices,
2276 drhd->devices_cnt, i, dev)
2277 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2278 break;
2279 if (i < drhd->devices_cnt)
2280 continue;
2281
 2282		/* This IOMMU has *only* gfx devices. Mark it as gfx-dedicated
 2283		 and, if the integrated graphics IOMMU is disabled, bypass it entirely. */
2284 drhd->gfx_dedicated = 1;
2285 if (disable_igfx_iommu)
2286 drhd->ignored = 1;
2287 }
2288}
2289
2290#ifdef CONFIG_SUSPEND
2291static int init_iommu_hw(void)
2292{
2293 struct dmar_drhd_unit *drhd;
2294 struct intel_iommu *iommu = NULL;
2295 int ret;
2296
2297 for_each_active_iommu(iommu, drhd) {
2298 if (iommu->qi) {
2299 ret = dmar_reenable_qi(iommu);
2300 if (ret)
2301 return ret;
2302 }
2303 }
2304
2305 for_each_iommu(iommu, drhd) {
2306 if (drhd->ignored) {
2307 /*
2308 * we always have to disable PMRs or DMA may fail on
2309 * this device
2310 */
2311 if (force_on)
2312 iommu_disable_protect_mem_regions(iommu);
2313 continue;
2314 }
2315
2316 iommu_flush_write_buffer(iommu);
2317 iommu_set_root_entry(iommu);
2318 iommu_enable_translation(iommu);
2319 iommu_disable_protect_mem_regions(iommu);
2320 }
2321
2322 return 0;
2323}
2324
2325static void iommu_flush_all(void)
2326{
2327 struct dmar_drhd_unit *drhd;
2328 struct intel_iommu *iommu;
2329
2330 for_each_active_iommu(iommu, drhd) {
2331 iommu->flush.flush_context(iommu, 0, 0, 0,
2332 DMA_CCMD_GLOBAL_INVL);
2333 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2334 DMA_TLB_GLOBAL_FLUSH);
2335 }
2336}
2337
2338static int iommu_suspend(void)
2339{
2340 struct dmar_drhd_unit *drhd;
2341 struct intel_iommu *iommu = NULL;
2342 unsigned long flag;
2343
2344 iommu_flush_all();
2345
2346 for_each_active_iommu(iommu, drhd) {
2347 iommu_disable_translation(iommu);
2348
2349 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350
2351 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2352 readl(iommu->reg + DMAR_FECTL_REG);
2353 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2354 readl(iommu->reg + DMAR_FEDATA_REG);
2355 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2356 readl(iommu->reg + DMAR_FEADDR_REG);
2357 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2358 readl(iommu->reg + DMAR_FEUADDR_REG);
2359
2360 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361 }
2362 return 0;
2363}
2364
2365static void iommu_resume(void)
2366{
2367 struct dmar_drhd_unit *drhd;
2368 struct intel_iommu *iommu = NULL;
2369 unsigned long flag;
2370
2371 if (init_iommu_hw()) {
2372 if (force_on)
2373 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
2374 else
2375 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2376 return;
2377 }
2378
2379 for_each_active_iommu(iommu, drhd) {
2380
2381 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2382
2383 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
2384 iommu->reg + DMAR_FECTL_REG);
2385 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
2386 iommu->reg + DMAR_FEDATA_REG);
2387 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
2388 iommu->reg + DMAR_FEADDR_REG);
2389 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2390 iommu->reg + DMAR_FEUADDR_REG);
2391
2392 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2393 }
2394}
2395
2396static struct syscore_ops iommu_syscore_ops = {
2397 .resume = iommu_resume,
2398 .suspend = iommu_suspend,
2399};
2400
2401static void __init init_iommu_pm_ops(void)
2402{
2403 register_syscore_ops(&iommu_syscore_ops);
2404}
2405
2406#else
2407static inline void init_iommu_pm_ops(void) {}
 2408#endif /* CONFIG_SUSPEND */
2409
2410static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2411{
2412 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2413 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2414 rmrr->end_address <= rmrr->base_address ||
2415 arch_rmrr_sanity_check(rmrr))
2416 return -EINVAL;
2417
2418 return 0;
2419}
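/*
 * As a purely illustrative example, an RMRR of [0x7c000000, 0x7c0fffff]
 * satisfies the alignment and ordering checks above (base and end + 1 are
 * page aligned, end > base); arch_rmrr_sanity_check() additionally vets
 * the range against the firmware (E820) memory map. A failing descriptor
 * is not dropped here: the caller below warns about it as a firmware bug
 * and taints the kernel, but still records the region.
 */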
2420
2421int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2422{
2423 struct acpi_dmar_reserved_memory *rmrr;
2424 struct dmar_rmrr_unit *rmrru;
2425
2426 rmrr = (struct acpi_dmar_reserved_memory *)header;
2427 if (rmrr_sanity_check(rmrr)) {
2428 pr_warn(FW_BUG
2429 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2430 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2431 rmrr->base_address, rmrr->end_address,
2432 dmi_get_system_info(DMI_BIOS_VENDOR),
2433 dmi_get_system_info(DMI_BIOS_VERSION),
2434 dmi_get_system_info(DMI_PRODUCT_VERSION));
2435 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2436 }
2437
2438 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2439 if (!rmrru)
2440 goto out;
2441
2442 rmrru->hdr = header;
2443
2444 rmrru->base_address = rmrr->base_address;
2445 rmrru->end_address = rmrr->end_address;
2446
2447 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
2448 ((void *)rmrr) + rmrr->header.length,
2449 &rmrru->devices_cnt);
2450 if (rmrru->devices_cnt && rmrru->devices == NULL)
2451 goto free_rmrru;
2452
2453 list_add(&rmrru->list, &dmar_rmrr_units);
2454
2455 return 0;
2456free_rmrru:
2457 kfree(rmrru);
2458out:
2459 return -ENOMEM;
2460}
2461
2462static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2463{
2464 struct dmar_atsr_unit *atsru;
2465 struct acpi_dmar_atsr *tmp;
2466
2467 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2468 dmar_rcu_check()) {
2469 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2470 if (atsr->segment != tmp->segment)
2471 continue;
2472 if (atsr->header.length != tmp->header.length)
2473 continue;
2474 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2475 return atsru;
2476 }
2477
2478 return NULL;
2479}
2480
2481int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2482{
2483 struct acpi_dmar_atsr *atsr;
2484 struct dmar_atsr_unit *atsru;
2485
2486 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2487 return 0;
2488
2489 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2490 atsru = dmar_find_atsr(atsr);
2491 if (atsru)
2492 return 0;
2493
2494 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2495 if (!atsru)
2496 return -ENOMEM;
2497
2498 /*
 2499	 * If memory is allocated from the slab by an ACPI _DSM method, we need to
2500 * copy the memory content because the memory buffer will be freed
2501 * on return.
2502 */
2503 atsru->hdr = (void *)(atsru + 1);
2504 memcpy(atsru->hdr, hdr, hdr->length);
2505 atsru->include_all = atsr->flags & 0x1;
2506 if (!atsru->include_all) {
2507 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
2508 (void *)atsr + atsr->header.length,
2509 &atsru->devices_cnt);
2510 if (atsru->devices_cnt && atsru->devices == NULL) {
2511 kfree(atsru);
2512 return -ENOMEM;
2513 }
2514 }
2515
2516 list_add_rcu(&atsru->list, &dmar_atsr_units);
2517
2518 return 0;
2519}
2520
2521static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2522{
2523 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
2524 kfree(atsru);
2525}
2526
2527int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2528{
2529 struct acpi_dmar_atsr *atsr;
2530 struct dmar_atsr_unit *atsru;
2531
2532 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2533 atsru = dmar_find_atsr(atsr);
2534 if (atsru) {
2535 list_del_rcu(&atsru->list);
2536 synchronize_rcu();
2537 intel_iommu_free_atsr(atsru);
2538 }
2539
2540 return 0;
2541}
2542
2543int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2544{
2545 int i;
2546 struct device *dev;
2547 struct acpi_dmar_atsr *atsr;
2548 struct dmar_atsr_unit *atsru;
2549
2550 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2551 atsru = dmar_find_atsr(atsr);
2552 if (!atsru)
2553 return 0;
2554
2555 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2556 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2557 i, dev)
2558 return -EBUSY;
2559 }
2560
2561 return 0;
2562}
2563
2564static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2565{
2566 struct dmar_satc_unit *satcu;
2567 struct acpi_dmar_satc *tmp;
2568
2569 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2570 dmar_rcu_check()) {
2571 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2572 if (satc->segment != tmp->segment)
2573 continue;
2574 if (satc->header.length != tmp->header.length)
2575 continue;
2576 if (memcmp(satc, tmp, satc->header.length) == 0)
2577 return satcu;
2578 }
2579
2580 return NULL;
2581}
2582
2583int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2584{
2585 struct acpi_dmar_satc *satc;
2586 struct dmar_satc_unit *satcu;
2587
2588 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2589 return 0;
2590
2591 satc = container_of(hdr, struct acpi_dmar_satc, header);
2592 satcu = dmar_find_satc(satc);
2593 if (satcu)
2594 return 0;
2595
2596 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2597 if (!satcu)
2598 return -ENOMEM;
2599
2600 satcu->hdr = (void *)(satcu + 1);
2601 memcpy(satcu->hdr, hdr, hdr->length);
2602 satcu->atc_required = satc->flags & 0x1;
2603 satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
2604 (void *)satc + satc->header.length,
2605 &satcu->devices_cnt);
2606 if (satcu->devices_cnt && !satcu->devices) {
2607 kfree(satcu);
2608 return -ENOMEM;
2609 }
2610 list_add_rcu(&satcu->list, &dmar_satc_units);
2611
2612 return 0;
2613}
2614
2615static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2616{
2617 struct intel_iommu *iommu = dmaru->iommu;
2618 int ret;
2619
2620 ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
2621 if (ret)
2622 goto out;
2623
2624 /*
2625 * Disable translation if already enabled prior to OS handover.
2626 */
2627 if (iommu->gcmd & DMA_GCMD_TE)
2628 iommu_disable_translation(iommu);
2629
2630 ret = iommu_init_domains(iommu);
2631 if (ret == 0)
2632 ret = iommu_alloc_root_entry(iommu);
2633 if (ret)
2634 goto out;
2635
2636 intel_svm_check(iommu);
2637
2638 if (dmaru->ignored) {
2639 /*
2640 * we always have to disable PMRs or DMA may fail on this device
2641 */
2642 if (force_on)
2643 iommu_disable_protect_mem_regions(iommu);
2644 return 0;
2645 }
2646
2647 intel_iommu_init_qi(iommu);
2648 iommu_flush_write_buffer(iommu);
2649
2650 if (ecap_prs(iommu->ecap)) {
2651 ret = intel_iommu_enable_prq(iommu);
2652 if (ret)
2653 goto disable_iommu;
2654 }
2655
2656 ret = dmar_set_interrupt(iommu);
2657 if (ret)
2658 goto disable_iommu;
2659
2660 iommu_set_root_entry(iommu);
2661 iommu_enable_translation(iommu);
2662
2663 iommu_disable_protect_mem_regions(iommu);
2664 return 0;
2665
2666disable_iommu:
2667 disable_dmar_iommu(iommu);
2668out:
2669 free_dmar_iommu(iommu);
2670 return ret;
2671}
2672
2673int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2674{
2675 int ret = 0;
2676 struct intel_iommu *iommu = dmaru->iommu;
2677
2678 if (!intel_iommu_enabled)
2679 return 0;
2680 if (iommu == NULL)
2681 return -EINVAL;
2682
2683 if (insert) {
2684 ret = intel_iommu_add(dmaru);
2685 } else {
2686 disable_dmar_iommu(iommu);
2687 free_dmar_iommu(iommu);
2688 }
2689
2690 return ret;
2691}
2692
2693static void intel_iommu_free_dmars(void)
2694{
2695 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2696 struct dmar_atsr_unit *atsru, *atsr_n;
2697 struct dmar_satc_unit *satcu, *satc_n;
2698
2699 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2700 list_del(&rmrru->list);
2701 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
2702 kfree(rmrru);
2703 }
2704
2705 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2706 list_del(&atsru->list);
2707 intel_iommu_free_atsr(atsru);
2708 }
2709 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2710 list_del(&satcu->list);
2711 dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
2712 kfree(satcu);
2713 }
2714}
2715
2716static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2717{
2718 struct dmar_satc_unit *satcu;
2719 struct acpi_dmar_satc *satc;
2720 struct device *tmp;
2721 int i;
2722
2723 dev = pci_physfn(dev);
2724 rcu_read_lock();
2725
2726 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2727 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2728 if (satc->segment != pci_domain_nr(dev->bus))
2729 continue;
2730 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2731 if (to_pci_dev(tmp) == dev)
2732 goto out;
2733 }
2734 satcu = NULL;
2735out:
2736 rcu_read_unlock();
2737 return satcu;
2738}
2739
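/*
 * Decide whether ATS may be enabled for @dev behind @iommu:
 *  - if the device is covered by a SATC entry, follow the SATC policy
 *    (in legacy mode the hardware enables ATS itself for devices that
 *    require it, so the OS must not enable it a second time);
 *  - otherwise walk up to the root port and allow ATS only if that root
 *    port is listed in an ATSR unit (or the ATSR is INCLUDE_ALL);
 *  - integrated devices without an upstream bridge are always allowed.
 */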
2740static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2741{
2742 int i, ret = 1;
2743 struct pci_bus *bus;
2744 struct pci_dev *bridge = NULL;
2745 struct device *tmp;
2746 struct acpi_dmar_atsr *atsr;
2747 struct dmar_atsr_unit *atsru;
2748 struct dmar_satc_unit *satcu;
2749
2750 dev = pci_physfn(dev);
2751 satcu = dmar_find_matched_satc_unit(dev);
2752 if (satcu)
2753 /*
 2754		 * This device supports ATS because it is listed in the SATC
 2755		 * table. When the IOMMU is in legacy mode, ATS is enabled
 2756		 * automatically by hardware for devices that require it,
 2757		 * hence the OS should not enable ATS on this device again,
 2758		 * to avoid duplicated TLB invalidations.
2759 */
2760 return !(satcu->atc_required && !sm_supported(iommu));
2761
2762 for (bus = dev->bus; bus; bus = bus->parent) {
2763 bridge = bus->self;
2764 /* If it's an integrated device, allow ATS */
2765 if (!bridge)
2766 return 1;
2767 /* Connected via non-PCIe: no ATS */
2768 if (!pci_is_pcie(bridge) ||
2769 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2770 return 0;
2771 /* If we found the root port, look it up in the ATSR */
2772 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
2773 break;
2774 }
2775
2776 rcu_read_lock();
2777 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2778 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2779 if (atsr->segment != pci_domain_nr(dev->bus))
2780 continue;
2781
2782 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2783 if (tmp == &bridge->dev)
2784 goto out;
2785
2786 if (atsru->include_all)
2787 goto out;
2788 }
2789 ret = 0;
2790out:
2791 rcu_read_unlock();
2792
2793 return ret;
2794}
2795
2796int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2797{
2798 int ret;
2799 struct dmar_rmrr_unit *rmrru;
2800 struct dmar_atsr_unit *atsru;
2801 struct dmar_satc_unit *satcu;
2802 struct acpi_dmar_atsr *atsr;
2803 struct acpi_dmar_reserved_memory *rmrr;
2804 struct acpi_dmar_satc *satc;
2805
2806 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2807 return 0;
2808
2809 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2810 rmrr = container_of(rmrru->hdr,
2811 struct acpi_dmar_reserved_memory, header);
2812 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2813 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
2814 ((void *)rmrr) + rmrr->header.length,
2815 rmrr->segment, rmrru->devices,
2816 rmrru->devices_cnt);
2817 if (ret < 0)
2818 return ret;
2819 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2820 dmar_remove_dev_scope(info, rmrr->segment,
2821 rmrru->devices, rmrru->devices_cnt);
2822 }
2823 }
2824
2825 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2826 if (atsru->include_all)
2827 continue;
2828
2829 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2830 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2831 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
2832 (void *)atsr + atsr->header.length,
2833 atsr->segment, atsru->devices,
2834 atsru->devices_cnt);
2835 if (ret > 0)
2836 break;
2837 else if (ret < 0)
2838 return ret;
2839 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2840 if (dmar_remove_dev_scope(info, atsr->segment,
2841 atsru->devices, atsru->devices_cnt))
2842 break;
2843 }
2844 }
2845 list_for_each_entry(satcu, &dmar_satc_units, list) {
2846 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2847 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2848 ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
2849 (void *)satc + satc->header.length,
2850 satc->segment, satcu->devices,
2851 satcu->devices_cnt);
2852 if (ret > 0)
2853 break;
2854 else if (ret < 0)
2855 return ret;
2856 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2857 if (dmar_remove_dev_scope(info, satc->segment,
2858 satcu->devices, satcu->devices_cnt))
2859 break;
2860 }
2861 }
2862
2863 return 0;
2864}
2865
2866static void intel_disable_iommus(void)
2867{
2868 struct intel_iommu *iommu = NULL;
2869 struct dmar_drhd_unit *drhd;
2870
2871 for_each_iommu(iommu, drhd)
2872 iommu_disable_translation(iommu);
2873}
2874
2875void intel_iommu_shutdown(void)
2876{
2877 struct dmar_drhd_unit *drhd;
2878 struct intel_iommu *iommu = NULL;
2879
2880 if (no_iommu || dmar_disabled)
2881 return;
2882
2883 down_write(&dmar_global_lock);
2884
2885 /* Disable PMRs explicitly here. */
2886 for_each_iommu(iommu, drhd)
2887 iommu_disable_protect_mem_regions(iommu);
2888
2889 /* Make sure the IOMMUs are switched off */
2890 intel_disable_iommus();
2891
2892 up_write(&dmar_global_lock);
2893}
2894
2895static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2896{
2897 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2898
2899 return container_of(iommu_dev, struct intel_iommu, iommu);
2900}
2901
2902static ssize_t version_show(struct device *dev,
2903 struct device_attribute *attr, char *buf)
2904{
2905 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2906 u32 ver = readl(iommu->reg + DMAR_VER_REG);
2907 return sysfs_emit(buf, "%d:%d\n",
2908 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2909}
2910static DEVICE_ATTR_RO(version);
2911
2912static ssize_t address_show(struct device *dev,
2913 struct device_attribute *attr, char *buf)
2914{
2915 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2916 return sysfs_emit(buf, "%llx\n", iommu->reg_phys);
2917}
2918static DEVICE_ATTR_RO(address);
2919
2920static ssize_t cap_show(struct device *dev,
2921 struct device_attribute *attr, char *buf)
2922{
2923 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2924 return sysfs_emit(buf, "%llx\n", iommu->cap);
2925}
2926static DEVICE_ATTR_RO(cap);
2927
2928static ssize_t ecap_show(struct device *dev,
2929 struct device_attribute *attr, char *buf)
2930{
2931 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2932 return sysfs_emit(buf, "%llx\n", iommu->ecap);
2933}
2934static DEVICE_ATTR_RO(ecap);
2935
2936static ssize_t domains_supported_show(struct device *dev,
2937 struct device_attribute *attr, char *buf)
2938{
2939 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2940 return sysfs_emit(buf, "%ld\n", cap_ndoms(iommu->cap));
2941}
2942static DEVICE_ATTR_RO(domains_supported);
2943
2944static ssize_t domains_used_show(struct device *dev,
2945 struct device_attribute *attr, char *buf)
2946{
2947 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2948 return sysfs_emit(buf, "%d\n",
2949 bitmap_weight(iommu->domain_ids,
2950 cap_ndoms(iommu->cap)));
2951}
2952static DEVICE_ATTR_RO(domains_used);
2953
2954static struct attribute *intel_iommu_attrs[] = {
2955 &dev_attr_version.attr,
2956 &dev_attr_address.attr,
2957 &dev_attr_cap.attr,
2958 &dev_attr_ecap.attr,
2959 &dev_attr_domains_supported.attr,
2960 &dev_attr_domains_used.attr,
2961 NULL,
2962};
2963
2964static struct attribute_group intel_iommu_group = {
2965 .name = "intel-iommu",
2966 .attrs = intel_iommu_attrs,
2967};
2968
2969const struct attribute_group *intel_iommu_groups[] = {
2970 &intel_iommu_group,
2971 NULL,
2972};
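/*
 * These attributes are typically exposed under
 * /sys/class/iommu/dmar<N>/intel-iommu/ (version, address, cap, ecap,
 * domains_supported and domains_used) once the IOMMU device is
 * registered via iommu_device_sysfs_add() in intel_iommu_init().
 */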
2973
2974static bool has_external_pci(void)
2975{
2976 struct pci_dev *pdev = NULL;
2977
2978 for_each_pci_dev(pdev)
2979 if (pdev->external_facing) {
2980 pci_dev_put(pdev);
2981 return true;
2982 }
2983
2984 return false;
2985}
2986
2987static int __init platform_optin_force_iommu(void)
2988{
2989 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2990 return 0;
2991
2992 if (no_iommu || dmar_disabled)
2993 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2994
2995 /*
2996 * If Intel-IOMMU is disabled by default, we will apply identity
2997 * map for all devices except those marked as being untrusted.
2998 */
2999 if (dmar_disabled)
3000 iommu_set_default_passthrough(false);
3001
3002 dmar_disabled = 0;
3003 no_iommu = 0;
3004
3005 return 1;
3006}
3007
3008static int __init probe_acpi_namespace_devices(void)
3009{
3010 struct dmar_drhd_unit *drhd;
3011 /* To avoid a -Wunused-but-set-variable warning. */
3012 struct intel_iommu *iommu __maybe_unused;
3013 struct device *dev;
3014 int i, ret = 0;
3015
3016 for_each_active_iommu(iommu, drhd) {
3017 for_each_active_dev_scope(drhd->devices,
3018 drhd->devices_cnt, i, dev) {
3019 struct acpi_device_physical_node *pn;
3020 struct acpi_device *adev;
3021
3022 if (dev->bus != &acpi_bus_type)
3023 continue;
3024
3025 adev = to_acpi_device(dev);
3026 mutex_lock(&adev->physical_node_lock);
3027 list_for_each_entry(pn,
3028 &adev->physical_node_list, node) {
3029 ret = iommu_probe_device(pn->dev);
3030 if (ret)
3031 break;
3032 }
3033 mutex_unlock(&adev->physical_node_lock);
3034
3035 if (ret)
3036 return ret;
3037 }
3038 }
3039
3040 return 0;
3041}
3042
3043static __init int tboot_force_iommu(void)
3044{
3045 if (!tboot_enabled())
3046 return 0;
3047
3048 if (no_iommu || dmar_disabled)
3049 pr_warn("Forcing Intel-IOMMU to enabled\n");
3050
3051 dmar_disabled = 0;
3052 no_iommu = 0;
3053
3054 return 1;
3055}
3056
3057int __init intel_iommu_init(void)
3058{
3059 int ret = -ENODEV;
3060 struct dmar_drhd_unit *drhd;
3061 struct intel_iommu *iommu;
3062
3063 /*
3064 * Intel IOMMU is required for a TXT/tboot launch or platform
3065 * opt in, so enforce that.
3066 */
3067 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3068 platform_optin_force_iommu();
3069
3070 down_write(&dmar_global_lock);
3071 if (dmar_table_init()) {
3072 if (force_on)
3073 panic("tboot: Failed to initialize DMAR table\n");
3074 goto out_free_dmar;
3075 }
3076
3077 if (dmar_dev_scope_init() < 0) {
3078 if (force_on)
3079 panic("tboot: Failed to initialize DMAR device scope\n");
3080 goto out_free_dmar;
3081 }
3082
3083 up_write(&dmar_global_lock);
3084
3085 /*
3086 * The bus notifier takes the dmar_global_lock, so lockdep will
3087 * complain later when we register it under the lock.
3088 */
3089 dmar_register_bus_notifier();
3090
3091 down_write(&dmar_global_lock);
3092
3093 if (!no_iommu)
3094 intel_iommu_debugfs_init();
3095
3096 if (no_iommu || dmar_disabled) {
3097 /*
 3098		 * We exit the function here to ensure the IOMMU's remapping and
 3099		 * mempool aren't set up, which means that the IOMMU's PMRs
 3100		 * won't be disabled via the call to init_dmars(). So disable
 3101		 * them explicitly here. The PMRs were set up by tboot prior to
 3102		 * calling SENTER, but the kernel is expected to reset/tear
 3103		 * down the PMRs.
3104 */
3105 if (intel_iommu_tboot_noforce) {
3106 for_each_iommu(iommu, drhd)
3107 iommu_disable_protect_mem_regions(iommu);
3108 }
3109
3110 /*
3111 * Make sure the IOMMUs are switched off, even when we
3112 * boot into a kexec kernel and the previous kernel left
3113 * them enabled
3114 */
3115 intel_disable_iommus();
3116 goto out_free_dmar;
3117 }
3118
3119 if (list_empty(&dmar_rmrr_units))
3120 pr_info("No RMRR found\n");
3121
3122 if (list_empty(&dmar_atsr_units))
3123 pr_info("No ATSR found\n");
3124
3125 if (list_empty(&dmar_satc_units))
3126 pr_info("No SATC found\n");
3127
3128 init_no_remapping_devices();
3129
3130 ret = init_dmars();
3131 if (ret) {
3132 if (force_on)
3133 panic("tboot: Failed to initialize DMARs\n");
3134 pr_err("Initialization failed\n");
3135 goto out_free_dmar;
3136 }
3137 up_write(&dmar_global_lock);
3138
3139 init_iommu_pm_ops();
3140
3141 down_read(&dmar_global_lock);
3142 for_each_active_iommu(iommu, drhd) {
3143 /*
3144 * The flush queue implementation does not perform
3145 * page-selective invalidations that are required for efficient
3146 * TLB flushes in virtual environments. The benefit of batching
3147 * is likely to be much lower than the overhead of synchronizing
3148 * the virtual and physical IOMMU page-tables.
3149 */
3150 if (cap_caching_mode(iommu->cap) &&
3151 !first_level_by_default(iommu)) {
3152 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3153 iommu_set_dma_strict();
3154 }
3155 iommu_device_sysfs_add(&iommu->iommu, NULL,
3156 intel_iommu_groups,
3157 "%s", iommu->name);
3158 /*
3159 * The iommu device probe is protected by the iommu_probe_device_lock.
3160 * Release the dmar_global_lock before entering the device probe path
3161 * to avoid unnecessary lock order splat.
3162 */
3163 up_read(&dmar_global_lock);
3164 iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
3165 down_read(&dmar_global_lock);
3166
3167 iommu_pmu_register(iommu);
3168 }
3169
3170 if (probe_acpi_namespace_devices())
3171 pr_warn("ACPI name space devices didn't probe correctly\n");
3172
3173 /* Finally, we enable the DMA remapping hardware. */
3174 for_each_iommu(iommu, drhd) {
3175 if (!drhd->ignored && !translation_pre_enabled(iommu))
3176 iommu_enable_translation(iommu);
3177
3178 iommu_disable_protect_mem_regions(iommu);
3179 }
3180 up_read(&dmar_global_lock);
3181
3182 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3183
3184 intel_iommu_enabled = 1;
3185
3186 return 0;
3187
3188out_free_dmar:
3189 intel_iommu_free_dmars();
3190 up_write(&dmar_global_lock);
3191 return ret;
3192}
3193
3194static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3195{
3196 struct device_domain_info *info = opaque;
3197
3198 domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
3199 return 0;
3200}
3201
3202/*
3203 * NB - intel-iommu lacks any sort of reference counting for the users of
3204 * dependent devices. If multiple endpoints have intersecting dependent
3205 * devices, unbinding the driver from any one of them will possibly leave
3206 * the others unable to operate.
3207 */
3208static void domain_context_clear(struct device_domain_info *info)
3209{
3210 if (!dev_is_pci(info->dev)) {
3211 domain_context_clear_one(info, info->bus, info->devfn);
3212 return;
3213 }
3214
3215 pci_for_each_dma_alias(to_pci_dev(info->dev),
3216 &domain_context_clear_one_cb, info);
3217}
3218
3219/*
3220 * Clear the page table pointer in context or pasid table entries so that
3221 * all DMA requests without PASID from the device are blocked. If the page
3222 * table has been set, clean up the data structures.
3223 */
3224void device_block_translation(struct device *dev)
3225{
3226 struct device_domain_info *info = dev_iommu_priv_get(dev);
3227 struct intel_iommu *iommu = info->iommu;
3228 unsigned long flags;
3229
3230 if (info->domain)
3231 cache_tag_unassign_domain(info->domain, dev, IOMMU_NO_PASID);
3232
3233 iommu_disable_pci_caps(info);
3234 if (!dev_is_real_dma_subdevice(dev)) {
3235 if (sm_supported(iommu))
3236 intel_pasid_tear_down_entry(iommu, dev,
3237 IOMMU_NO_PASID, false);
3238 else
3239 domain_context_clear(info);
3240 }
3241
3242 if (!info->domain)
3243 return;
3244
3245 spin_lock_irqsave(&info->domain->lock, flags);
3246 list_del(&info->link);
3247 spin_unlock_irqrestore(&info->domain->lock, flags);
3248
3249 domain_detach_iommu(info->domain, iommu);
3250 info->domain = NULL;
3251}
3252
3253static int blocking_domain_attach_dev(struct iommu_domain *domain,
3254 struct device *dev)
3255{
3256 device_block_translation(dev);
3257 return 0;
3258}
3259
3260static struct iommu_domain blocking_domain = {
3261 .type = IOMMU_DOMAIN_BLOCKED,
3262 .ops = &(const struct iommu_domain_ops) {
3263 .attach_dev = blocking_domain_attach_dev,
3264 }
3265};
3266
3267static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3268{
3269 if (!intel_iommu_superpage)
3270 return 0;
3271
3272 if (first_stage)
3273 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3274
3275 return fls(cap_super_page_val(iommu->cap));
3276}
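/*
 * The returned value is the number of supported superpage levels, e.g. a
 * second-stage SLLPS field advertising both 2MiB and 1GiB pages yields
 * cap_super_page_val() == 0x3 and therefore fls() == 2, while first-stage
 * tables report 2 when FL1GP is set and 1 (2MiB only) otherwise. A return
 * value of 0 means superpages are disabled or unsupported.
 */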
3277
3278static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3279{
3280 struct device_domain_info *info = dev_iommu_priv_get(dev);
3281 struct intel_iommu *iommu = info->iommu;
3282 struct dmar_domain *domain;
3283 int addr_width;
3284
3285 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3286 if (!domain)
3287 return ERR_PTR(-ENOMEM);
3288
3289 INIT_LIST_HEAD(&domain->devices);
3290 INIT_LIST_HEAD(&domain->dev_pasids);
3291 INIT_LIST_HEAD(&domain->cache_tags);
3292 spin_lock_init(&domain->lock);
3293 spin_lock_init(&domain->cache_lock);
3294 xa_init(&domain->iommu_array);
3295
3296 domain->nid = dev_to_node(dev);
3297 domain->use_first_level = first_stage;
3298
3299 /* calculate the address width */
3300 addr_width = agaw_to_width(iommu->agaw);
3301 if (addr_width > cap_mgaw(iommu->cap))
3302 addr_width = cap_mgaw(iommu->cap);
3303 domain->gaw = addr_width;
3304 domain->agaw = iommu->agaw;
3305 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3306
3307 /* iommu memory access coherency */
3308 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3309
3310 /* pagesize bitmap */
3311 domain->domain.pgsize_bitmap = SZ_4K;
3312 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3313 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3314
3315 /*
3316 * IOVA aperture: First-level translation restricts the input-address
3317 * to a canonical address (i.e., address bits 63:N have the same value
3318 * as address bit [N-1], where N is 48-bits with 4-level paging and
3319 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3320 */
3321 domain->domain.geometry.force_aperture = true;
3322 domain->domain.geometry.aperture_start = 0;
3323 if (first_stage)
3324 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3325 else
3326 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
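	/*
	 * For example, with a 48-bit guest address width this yields an
	 * aperture end of __DOMAIN_MAX_ADDR(47) == 0x7fffffffffff for a
	 * first-stage domain and __DOMAIN_MAX_ADDR(48) == 0xffffffffffff
	 * for a second-stage domain.
	 */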
3327
3328 /* always allocate the top pgd */
3329 domain->pgd = iommu_alloc_page_node(domain->nid, GFP_KERNEL);
3330 if (!domain->pgd) {
3331 kfree(domain);
3332 return ERR_PTR(-ENOMEM);
3333 }
3334 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3335
3336 return domain;
3337}
3338
3339static struct iommu_domain *
3340intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3341 const struct iommu_user_data *user_data)
3342{
3343 struct device_domain_info *info = dev_iommu_priv_get(dev);
3344 bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING;
3345 bool nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3346 struct intel_iommu *iommu = info->iommu;
3347 struct dmar_domain *dmar_domain;
3348 struct iommu_domain *domain;
3349 bool first_stage;
3350
3351 if (flags &
3352 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3353 | IOMMU_HWPT_FAULT_ID_VALID)))
3354 return ERR_PTR(-EOPNOTSUPP);
3355 if (nested_parent && !nested_supported(iommu))
3356 return ERR_PTR(-EOPNOTSUPP);
3357 if (user_data || (dirty_tracking && !ssads_supported(iommu)))
3358 return ERR_PTR(-EOPNOTSUPP);
3359
3360 /*
3361 * Always allocate the guest compatible page table unless
3362 * IOMMU_HWPT_ALLOC_NEST_PARENT or IOMMU_HWPT_ALLOC_DIRTY_TRACKING
3363 * is specified.
3364 */
3365 if (nested_parent || dirty_tracking) {
3366 if (!sm_supported(iommu) || !ecap_slts(iommu->ecap))
3367 return ERR_PTR(-EOPNOTSUPP);
3368 first_stage = false;
3369 } else {
3370 first_stage = first_level_by_default(iommu);
3371 }
3372
3373 dmar_domain = paging_domain_alloc(dev, first_stage);
3374 if (IS_ERR(dmar_domain))
3375 return ERR_CAST(dmar_domain);
3376 domain = &dmar_domain->domain;
3377 domain->type = IOMMU_DOMAIN_UNMANAGED;
3378 domain->owner = &intel_iommu_ops;
3379 domain->ops = intel_iommu_ops.default_domain_ops;
3380
3381 if (nested_parent) {
3382 dmar_domain->nested_parent = true;
3383 INIT_LIST_HEAD(&dmar_domain->s1_domains);
3384 spin_lock_init(&dmar_domain->s1_lock);
3385 }
3386
3387 if (dirty_tracking) {
3388 if (dmar_domain->use_first_level) {
3389 iommu_domain_free(domain);
3390 return ERR_PTR(-EOPNOTSUPP);
3391 }
3392 domain->dirty_ops = &intel_dirty_ops;
3393 }
3394
3395 return domain;
3396}
3397
3398static void intel_iommu_domain_free(struct iommu_domain *domain)
3399{
3400 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3401
3402 WARN_ON(dmar_domain->nested_parent &&
3403 !list_empty(&dmar_domain->s1_domains));
3404 domain_exit(dmar_domain);
3405}
3406
3407int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3408{
3409 struct device_domain_info *info = dev_iommu_priv_get(dev);
3410 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3411 struct intel_iommu *iommu = info->iommu;
3412 int addr_width;
3413
3414 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
3415 return -EPERM;
3416
3417 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3418 return -EINVAL;
3419
3420 if (domain->dirty_ops && !ssads_supported(iommu))
3421 return -EINVAL;
3422
3423 if (dmar_domain->iommu_coherency !=
3424 iommu_paging_structure_coherency(iommu))
3425 return -EINVAL;
3426
3427 if (dmar_domain->iommu_superpage !=
3428 iommu_superpage_capability(iommu, dmar_domain->use_first_level))
3429 return -EINVAL;
3430
3431 if (dmar_domain->use_first_level &&
3432 (!sm_supported(iommu) || !ecap_flts(iommu->ecap)))
3433 return -EINVAL;
3434
3435 /* check if this iommu agaw is sufficient for max mapped address */
3436 addr_width = agaw_to_width(iommu->agaw);
3437 if (addr_width > cap_mgaw(iommu->cap))
3438 addr_width = cap_mgaw(iommu->cap);
3439
3440 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3441 return -EINVAL;
3442
3443 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3444 context_copied(iommu, info->bus, info->devfn))
3445 return intel_pasid_setup_sm_context(dev);
3446
3447 return 0;
3448}
3449
3450static int intel_iommu_attach_device(struct iommu_domain *domain,
3451 struct device *dev)
3452{
3453 int ret;
3454
3455 device_block_translation(dev);
3456
3457 ret = paging_domain_compatible(domain, dev);
3458 if (ret)
3459 return ret;
3460
3461 return dmar_domain_attach_device(to_dmar_domain(domain), dev);
3462}
3463
3464static int intel_iommu_map(struct iommu_domain *domain,
3465 unsigned long iova, phys_addr_t hpa,
3466 size_t size, int iommu_prot, gfp_t gfp)
3467{
3468 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3469 u64 max_addr;
3470 int prot = 0;
3471
3472 if (iommu_prot & IOMMU_READ)
3473 prot |= DMA_PTE_READ;
3474 if (iommu_prot & IOMMU_WRITE)
3475 prot |= DMA_PTE_WRITE;
3476 if (dmar_domain->set_pte_snp)
3477 prot |= DMA_PTE_SNP;
3478
3479 max_addr = iova + size;
3480 if (dmar_domain->max_addr < max_addr) {
3481 u64 end;
3482
3483 /* check if minimum agaw is sufficient for mapped address */
3484 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3485 if (end < max_addr) {
 3486			pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
 3487			       __func__, dmar_domain->gaw,
 3488			       max_addr);
3489 return -EFAULT;
3490 }
3491 dmar_domain->max_addr = max_addr;
3492 }
3493 /* Round up size to next multiple of PAGE_SIZE, if it and
3494 the low bits of hpa would take us onto the next page */
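	/*
	 * E.g. hpa 0x1080 with size 0x1000 straddles a page boundary and is
	 * therefore rounded up to two 4KiB VT-d pages (values illustrative).
	 */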
3495 size = aligned_nrpages(hpa, size);
3496 return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
3497 hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
3498}
3499
3500static int intel_iommu_map_pages(struct iommu_domain *domain,
3501 unsigned long iova, phys_addr_t paddr,
3502 size_t pgsize, size_t pgcount,
3503 int prot, gfp_t gfp, size_t *mapped)
3504{
3505 unsigned long pgshift = __ffs(pgsize);
3506 size_t size = pgcount << pgshift;
3507 int ret;
3508
3509 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3510 return -EINVAL;
3511
3512 if (!IS_ALIGNED(iova | paddr, pgsize))
3513 return -EINVAL;
3514
3515 ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
3516 if (!ret && mapped)
3517 *mapped = size;
3518
3519 return ret;
3520}
3521
3522static size_t intel_iommu_unmap(struct iommu_domain *domain,
3523 unsigned long iova, size_t size,
3524 struct iommu_iotlb_gather *gather)
3525{
3526 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3527 unsigned long start_pfn, last_pfn;
3528 int level = 0;
3529
3530 /* Cope with horrid API which requires us to unmap more than the
3531 size argument if it happens to be a large-page mapping. */
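	/*
	 * E.g. a 4KiB unmap request that lands inside a 2MiB superpage PTE
	 * ends up tearing down (and reporting back) the whole 2MiB mapping.
	 */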
3532 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3533 &level, GFP_ATOMIC)))
3534 return 0;
3535
3536 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3537 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3538
3539 start_pfn = iova >> VTD_PAGE_SHIFT;
3540 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3541
3542 domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
3543
3544 if (dmar_domain->max_addr == iova + size)
3545 dmar_domain->max_addr = iova;
3546
3547 /*
 3548	 * We do not use page-selective IOTLB invalidation in the flush-queue
 3549	 * path, so there is no need to track pages and sync the IOTLB here.
3550 */
3551 if (!iommu_iotlb_gather_queued(gather))
3552 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3553
3554 return size;
3555}
3556
3557static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3558 unsigned long iova,
3559 size_t pgsize, size_t pgcount,
3560 struct iommu_iotlb_gather *gather)
3561{
3562 unsigned long pgshift = __ffs(pgsize);
3563 size_t size = pgcount << pgshift;
3564
3565 return intel_iommu_unmap(domain, iova, size, gather);
3566}
3567
3568static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3569 struct iommu_iotlb_gather *gather)
3570{
3571 cache_tag_flush_range(to_dmar_domain(domain), gather->start,
3572 gather->end, list_empty(&gather->freelist));
3573 iommu_put_pages_list(&gather->freelist);
3574}
3575
3576static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3577 dma_addr_t iova)
3578{
3579 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3580 struct dma_pte *pte;
3581 int level = 0;
3582 u64 phys = 0;
3583
3584 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
3585 GFP_ATOMIC);
3586 if (pte && dma_pte_present(pte))
3587 phys = dma_pte_addr(pte) +
3588 (iova & (BIT_MASK(level_to_offset_bits(level) +
3589 VTD_PAGE_SHIFT) - 1));
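	/*
	 * The low bits of the IOVA are folded back in according to the size
	 * of the page at which the PTE was found, e.g. the offset mask is
	 * 0xfff for a 4KiB page and 0x1fffff for a 2MiB superpage.
	 */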
3590
3591 return phys;
3592}
3593
3594static bool domain_support_force_snooping(struct dmar_domain *domain)
3595{
3596 struct device_domain_info *info;
3597 bool support = true;
3598
3599 assert_spin_locked(&domain->lock);
3600 list_for_each_entry(info, &domain->devices, link) {
3601 if (!ecap_sc_support(info->iommu->ecap)) {
3602 support = false;
3603 break;
3604 }
3605 }
3606
3607 return support;
3608}
3609
3610static void domain_set_force_snooping(struct dmar_domain *domain)
3611{
3612 struct device_domain_info *info;
3613
3614 assert_spin_locked(&domain->lock);
3615 /*
3616 * Second level page table supports per-PTE snoop control. The
 3617	 * iommu_map() interface will handle this by setting the SNP bit.
3618 */
3619 if (!domain->use_first_level) {
3620 domain->set_pte_snp = true;
3621 return;
3622 }
3623
3624 list_for_each_entry(info, &domain->devices, link)
3625 intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
3626 IOMMU_NO_PASID);
3627}
3628
3629static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
3630{
3631 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
3632 unsigned long flags;
3633
3634 if (dmar_domain->force_snooping)
3635 return true;
3636
3637 spin_lock_irqsave(&dmar_domain->lock, flags);
3638 if (!domain_support_force_snooping(dmar_domain) ||
3639 (!dmar_domain->use_first_level && dmar_domain->has_mappings)) {
3640 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3641 return false;
3642 }
3643
3644 domain_set_force_snooping(dmar_domain);
3645 dmar_domain->force_snooping = true;
3646 spin_unlock_irqrestore(&dmar_domain->lock, flags);
3647
3648 return true;
3649}
3650
3651static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3652{
3653 struct device_domain_info *info = dev_iommu_priv_get(dev);
3654
3655 switch (cap) {
3656 case IOMMU_CAP_CACHE_COHERENCY:
3657 case IOMMU_CAP_DEFERRED_FLUSH:
3658 return true;
3659 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3660 return dmar_platform_optin();
3661 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3662 return ecap_sc_support(info->iommu->ecap);
3663 case IOMMU_CAP_DIRTY_TRACKING:
3664 return ssads_supported(info->iommu);
3665 default:
3666 return false;
3667 }
3668}
3669
3670static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3671{
3672 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3673 struct device_domain_info *info;
3674 struct intel_iommu *iommu;
3675 u8 bus, devfn;
3676 int ret;
3677
3678 iommu = device_lookup_iommu(dev, &bus, &devfn);
3679 if (!iommu || !iommu->iommu.ops)
3680 return ERR_PTR(-ENODEV);
3681
3682 info = kzalloc(sizeof(*info), GFP_KERNEL);
3683 if (!info)
3684 return ERR_PTR(-ENOMEM);
3685
3686 if (dev_is_real_dma_subdevice(dev)) {
3687 info->bus = pdev->bus->number;
3688 info->devfn = pdev->devfn;
3689 info->segment = pci_domain_nr(pdev->bus);
3690 } else {
3691 info->bus = bus;
3692 info->devfn = devfn;
3693 info->segment = iommu->segment;
3694 }
3695
3696 info->dev = dev;
3697 info->iommu = iommu;
3698 if (dev_is_pci(dev)) {
3699 if (ecap_dev_iotlb_support(iommu->ecap) &&
3700 pci_ats_supported(pdev) &&
3701 dmar_ats_supported(pdev, iommu)) {
3702 info->ats_supported = 1;
3703 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3704
3705 /*
 3706			 * For an IOMMU that supports device IOTLB throttling
 3707			 * (DIT), we assign the PFSID to the invalidation desc
 3708			 * of a VF so that the IOMMU HW can gauge queue depth
 3709			 * at the PF level. If DIT is not set, PFSID will be
3710 * treated as reserved, which should be set to 0.
3711 */
3712 if (ecap_dit(iommu->ecap))
3713 info->pfsid = pci_dev_id(pci_physfn(pdev));
3714 info->ats_qdep = pci_ats_queue_depth(pdev);
3715 }
3716 if (sm_supported(iommu)) {
3717 if (pasid_supported(iommu)) {
3718 int features = pci_pasid_features(pdev);
3719
3720 if (features >= 0)
3721 info->pasid_supported = features | 1;
3722 }
3723
3724 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3725 pci_pri_supported(pdev))
3726 info->pri_supported = 1;
3727 }
3728 }
3729
3730 dev_iommu_priv_set(dev, info);
3731 if (pdev && pci_ats_supported(pdev)) {
3732 pci_prepare_ats(pdev, VTD_PAGE_SHIFT);
3733 ret = device_rbtree_insert(iommu, info);
3734 if (ret)
3735 goto free;
3736 }
3737
3738 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3739 ret = intel_pasid_alloc_table(dev);
3740 if (ret) {
3741 dev_err(dev, "PASID table allocation failed\n");
3742 goto clear_rbtree;
3743 }
3744
3745 if (!context_copied(iommu, info->bus, info->devfn)) {
3746 ret = intel_pasid_setup_sm_context(dev);
3747 if (ret)
3748 goto free_table;
3749 }
3750 }
3751
3752 intel_iommu_debugfs_create_dev(info);
3753
3754 /*
3755 * The PCIe spec, in its wisdom, declares that the behaviour of the
3756 * device is undefined if you enable PASID support after ATS support.
3757 * So always enable PASID support on devices which have it, even if
3758 * we can't yet know if we're ever going to use it.
3759 */
3760 if (info->pasid_supported &&
3761 !pci_enable_pasid(pdev, info->pasid_supported & ~1))
3762 info->pasid_enabled = 1;
3763
3764 return &iommu->iommu;
3765free_table:
3766 intel_pasid_free_table(dev);
3767clear_rbtree:
3768 device_rbtree_remove(info);
3769free:
3770 kfree(info);
3771
3772 return ERR_PTR(ret);
3773}
3774
3775static void intel_iommu_release_device(struct device *dev)
3776{
3777 struct device_domain_info *info = dev_iommu_priv_get(dev);
3778 struct intel_iommu *iommu = info->iommu;
3779
3780 if (info->pasid_enabled) {
3781 pci_disable_pasid(to_pci_dev(dev));
3782 info->pasid_enabled = 0;
3783 }
3784
3785 mutex_lock(&iommu->iopf_lock);
3786 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3787 device_rbtree_remove(info);
3788 mutex_unlock(&iommu->iopf_lock);
3789
3790 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3791 !context_copied(iommu, info->bus, info->devfn))
3792 intel_pasid_teardown_sm_context(dev);
3793
3794 intel_pasid_free_table(dev);
3795 intel_iommu_debugfs_remove_dev(info);
3796 kfree(info);
3797 set_dma_ops(dev, NULL);
3798}
3799
3800static void intel_iommu_get_resv_regions(struct device *device,
3801 struct list_head *head)
3802{
3803 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3804 struct iommu_resv_region *reg;
3805 struct dmar_rmrr_unit *rmrr;
3806 struct device *i_dev;
3807 int i;
3808
3809 rcu_read_lock();
3810 for_each_rmrr_units(rmrr) {
3811 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3812 i, i_dev) {
3813 struct iommu_resv_region *resv;
3814 enum iommu_resv_type type;
3815 size_t length;
3816
3817 if (i_dev != device &&
3818 !is_downstream_to_pci_bridge(device, i_dev))
3819 continue;
3820
3821 length = rmrr->end_address - rmrr->base_address + 1;
3822
3823 type = device_rmrr_is_relaxable(device) ?
3824 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3825
3826 resv = iommu_alloc_resv_region(rmrr->base_address,
3827 length, prot, type,
3828 GFP_ATOMIC);
3829 if (!resv)
3830 break;
3831
3832 list_add_tail(&resv->list, head);
3833 }
3834 }
3835 rcu_read_unlock();
3836
3837#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3838 if (dev_is_pci(device)) {
3839 struct pci_dev *pdev = to_pci_dev(device);
3840
3841 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3842 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3843 IOMMU_RESV_DIRECT_RELAXABLE,
3844 GFP_KERNEL);
3845 if (reg)
 3846				list_add_tail(&reg->list, head);
3847 }
3848 }
3849#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3850
3851 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3852 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3853 0, IOMMU_RESV_MSI, GFP_KERNEL);
3854 if (!reg)
3855 return;
 3856	list_add_tail(&reg->list, head);
3857}
3858
3859static struct iommu_group *intel_iommu_device_group(struct device *dev)
3860{
3861 if (dev_is_pci(dev))
3862 return pci_device_group(dev);
3863 return generic_device_group(dev);
3864}
3865
3866static int intel_iommu_enable_sva(struct device *dev)
3867{
3868 struct device_domain_info *info = dev_iommu_priv_get(dev);
3869 struct intel_iommu *iommu;
3870
3871 if (!info || dmar_disabled)
3872 return -EINVAL;
3873
3874 iommu = info->iommu;
3875 if (!iommu)
3876 return -EINVAL;
3877
3878 if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
3879 return -ENODEV;
3880
3881 if (!info->pasid_enabled || !info->ats_enabled)
3882 return -EINVAL;
3883
3884 /*
3885 * Devices having device-specific I/O fault handling should not
3886 * support PCI/PRI. The IOMMU side has no means to check the
 3887	 * capability of device-specific IOPF. Therefore, the IOMMU can only
 3888	 * assume that if a device driver enables SVA on a non-PRI
 3889	 * device, the driver will handle IOPF in its own way.
3890 */
3891 if (!info->pri_supported)
3892 return 0;
3893
3894 /* Devices supporting PRI should have it enabled. */
3895 if (!info->pri_enabled)
3896 return -EINVAL;
3897
3898 return 0;
3899}
3900
3901static int context_flip_pri(struct device_domain_info *info, bool enable)
3902{
3903 struct intel_iommu *iommu = info->iommu;
3904 u8 bus = info->bus, devfn = info->devfn;
3905 struct context_entry *context;
3906 u16 did;
3907
3908 spin_lock(&iommu->lock);
3909 if (context_copied(iommu, bus, devfn)) {
3910 spin_unlock(&iommu->lock);
3911 return -EINVAL;
3912 }
3913
3914 context = iommu_context_addr(iommu, bus, devfn, false);
3915 if (!context || !context_present(context)) {
3916 spin_unlock(&iommu->lock);
3917 return -ENODEV;
3918 }
3919 did = context_domain_id(context);
3920
3921 if (enable)
3922 context_set_sm_pre(context);
3923 else
3924 context_clear_sm_pre(context);
3925
3926 if (!ecap_coherent(iommu->ecap))
3927 clflush_cache_range(context, sizeof(*context));
3928 intel_context_flush_present(info, context, did, true);
3929 spin_unlock(&iommu->lock);
3930
3931 return 0;
3932}
3933
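/*
 * Enable I/O page fault handling for @dev. The ordering below matters:
 * PRI is reset first, the device is registered with the IOMMU's IOPF
 * queue, the PRE bit is set in the (scalable-mode) context entry via
 * context_flip_pri(), and only then is PRI enabled on the device itself.
 * intel_iommu_disable_iopf() tears this down in the reverse order.
 */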
3934static int intel_iommu_enable_iopf(struct device *dev)
3935{
3936 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3937 struct device_domain_info *info = dev_iommu_priv_get(dev);
3938 struct intel_iommu *iommu;
3939 int ret;
3940
3941 if (!pdev || !info || !info->ats_enabled || !info->pri_supported)
3942 return -ENODEV;
3943
3944 if (info->pri_enabled)
3945 return -EBUSY;
3946
3947 iommu = info->iommu;
3948 if (!iommu)
3949 return -EINVAL;
3950
3951 /* PASID is required in PRG Response Message. */
3952 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
3953 return -EINVAL;
3954
3955 ret = pci_reset_pri(pdev);
3956 if (ret)
3957 return ret;
3958
3959 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3960 if (ret)
3961 return ret;
3962
3963 ret = context_flip_pri(info, true);
3964 if (ret)
3965 goto err_remove_device;
3966
3967 ret = pci_enable_pri(pdev, PRQ_DEPTH);
3968 if (ret)
3969 goto err_clear_pri;
3970
3971 info->pri_enabled = 1;
3972
3973 return 0;
3974err_clear_pri:
3975 context_flip_pri(info, false);
3976err_remove_device:
3977 iopf_queue_remove_device(iommu->iopf_queue, dev);
3978
3979 return ret;
3980}
3981
3982static int intel_iommu_disable_iopf(struct device *dev)
3983{
3984 struct device_domain_info *info = dev_iommu_priv_get(dev);
3985 struct intel_iommu *iommu = info->iommu;
3986
3987 if (!info->pri_enabled)
3988 return -EINVAL;
3989
3990 /* Disable new PRI reception: */
3991 context_flip_pri(info, false);
3992
3993 /*
3994 * Remove device from fault queue and acknowledge all outstanding
3995 * PRQs to the device:
3996 */
3997 iopf_queue_remove_device(iommu->iopf_queue, dev);
3998
3999 /*
4000	 * The PCIe spec states that after the PRI enable bit is cleared, the
4001	 * Page Request Interface will not issue new page requests, but may
4002	 * still have outstanding page requests that have been transmitted or
4003	 * are queued for transmission. This is supposed to be called after
4004	 * the device driver has stopped DMA, all PASIDs have been unbound
4005	 * and the outstanding PRQs have been drained.
4006 */
4007 pci_disable_pri(to_pci_dev(dev));
4008 info->pri_enabled = 0;
4009
4010 return 0;
4011}
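
/*
 * Illustrative sketch (not part of this driver): the teardown ordering that
 * the comment above expects from a device driver. The quiescing step is
 * device specific and only hinted at here; the function name is made up.
 */
static void example_driver_stop_sva(struct device *dev, struct iommu_sva *handle)
{
	/* 1. Quiesce the device so it generates no new page requests. */

	/* 2. Unbind the PASID, draining faults pending for it. */
	iommu_sva_unbind_device(handle);

	/* 3. Only now turn the features off; PRI is disabled last. */
	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_SVA);
	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_IOPF);
}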
4012
4013static int
4014intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4015{
4016 switch (feat) {
4017 case IOMMU_DEV_FEAT_IOPF:
4018 return intel_iommu_enable_iopf(dev);
4019
4020 case IOMMU_DEV_FEAT_SVA:
4021 return intel_iommu_enable_sva(dev);
4022
4023 default:
4024 return -ENODEV;
4025 }
4026}
4027
4028static int
4029intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4030{
4031 switch (feat) {
4032 case IOMMU_DEV_FEAT_IOPF:
4033 return intel_iommu_disable_iopf(dev);
4034
4035 case IOMMU_DEV_FEAT_SVA:
4036 return 0;
4037
4038 default:
4039 return -ENODEV;
4040 }
4041}
4042
4043static bool intel_iommu_is_attach_deferred(struct device *dev)
4044{
4045 struct device_domain_info *info = dev_iommu_priv_get(dev);
4046
4047 return translation_pre_enabled(info->iommu) && !info->domain;
4048}
4049
4050/*
4051 * Check that the device does not live on an external-facing PCI port that
4052 * is marked as untrusted. Quirks must not be applied to such devices, so an
4053 * untrusted device cannot use them to bypass the IOMMU restrictions.
4054 */
4055static bool risky_device(struct pci_dev *pdev)
4056{
4057 if (pdev->untrusted) {
4058 pci_info(pdev,
4059 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4060 pdev->vendor, pdev->device);
4061 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4062 return true;
4063 }
4064 return false;
4065}
4066
4067static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4068 unsigned long iova, size_t size)
4069{
4070 cache_tag_flush_range_np(to_dmar_domain(domain), iova, iova + size - 1);
4071
4072 return 0;
4073}
4074
4075void domain_remove_dev_pasid(struct iommu_domain *domain,
4076 struct device *dev, ioasid_t pasid)
4077{
4078 struct device_domain_info *info = dev_iommu_priv_get(dev);
4079 struct dev_pasid_info *curr, *dev_pasid = NULL;
4080 struct intel_iommu *iommu = info->iommu;
4081 struct dmar_domain *dmar_domain;
4082 unsigned long flags;
4083
4084 if (!domain)
4085 return;
4086
4087	/* The identity domain carries no metadata for PASIDs. */
4088 if (domain->type == IOMMU_DOMAIN_IDENTITY)
4089 return;
4090
4091 dmar_domain = to_dmar_domain(domain);
4092 spin_lock_irqsave(&dmar_domain->lock, flags);
4093 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4094 if (curr->dev == dev && curr->pasid == pasid) {
4095 list_del(&curr->link_domain);
4096 dev_pasid = curr;
4097 break;
4098 }
4099 }
4100 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4101
4102 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4103 domain_detach_iommu(dmar_domain, iommu);
4104 if (!WARN_ON_ONCE(!dev_pasid)) {
4105 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4106 kfree(dev_pasid);
4107 }
4108}
4109
4110static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid,
4111 struct iommu_domain *domain)
4112{
4113 struct device_domain_info *info = dev_iommu_priv_get(dev);
4114
4115 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4116 domain_remove_dev_pasid(domain, dev, pasid);
4117}
4118
4119struct dev_pasid_info *
4120domain_add_dev_pasid(struct iommu_domain *domain,
4121 struct device *dev, ioasid_t pasid)
4122{
4123 struct device_domain_info *info = dev_iommu_priv_get(dev);
4124 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4125 struct intel_iommu *iommu = info->iommu;
4126 struct dev_pasid_info *dev_pasid;
4127 unsigned long flags;
4128 int ret;
4129
4130 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4131 if (!dev_pasid)
4132 return ERR_PTR(-ENOMEM);
4133
4134 ret = domain_attach_iommu(dmar_domain, iommu);
4135 if (ret)
4136 goto out_free;
4137
4138 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4139 if (ret)
4140 goto out_detach_iommu;
4141
4142 dev_pasid->dev = dev;
4143 dev_pasid->pasid = pasid;
4144 spin_lock_irqsave(&dmar_domain->lock, flags);
4145 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4146 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4147
4148 return dev_pasid;
4149out_detach_iommu:
4150 domain_detach_iommu(dmar_domain, iommu);
4151out_free:
4152 kfree(dev_pasid);
4153 return ERR_PTR(ret);
4154}
4155
4156static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4157 struct device *dev, ioasid_t pasid,
4158 struct iommu_domain *old)
4159{
4160 struct device_domain_info *info = dev_iommu_priv_get(dev);
4161 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4162 struct intel_iommu *iommu = info->iommu;
4163 struct dev_pasid_info *dev_pasid;
4164 int ret;
4165
4166 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4167 return -EINVAL;
4168
4169 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4170 return -EOPNOTSUPP;
4171
4172 if (domain->dirty_ops)
4173 return -EINVAL;
4174
4175 if (context_copied(iommu, info->bus, info->devfn))
4176 return -EBUSY;
4177
4178 ret = paging_domain_compatible(domain, dev);
4179 if (ret)
4180 return ret;
4181
4182 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4183 if (IS_ERR(dev_pasid))
4184 return PTR_ERR(dev_pasid);
4185
4186 if (dmar_domain->use_first_level)
4187 ret = domain_setup_first_level(iommu, dmar_domain,
4188 dev, pasid, old);
4189 else
4190 ret = domain_setup_second_level(iommu, dmar_domain,
4191 dev, pasid, old);
4192 if (ret)
4193 goto out_remove_dev_pasid;
4194
4195 domain_remove_dev_pasid(old, dev, pasid);
4196
4197 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4198
4199 return 0;
4200
4201out_remove_dev_pasid:
4202 domain_remove_dev_pasid(domain, dev, pasid);
4203 return ret;
4204}
4205
4206static void *intel_iommu_hw_info(struct device *dev, u32 *length, u32 *type)
4207{
4208 struct device_domain_info *info = dev_iommu_priv_get(dev);
4209 struct intel_iommu *iommu = info->iommu;
4210 struct iommu_hw_info_vtd *vtd;
4211
4212 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4213 if (!vtd)
4214 return ERR_PTR(-ENOMEM);
4215
4216 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4217 vtd->cap_reg = iommu->cap;
4218 vtd->ecap_reg = iommu->ecap;
4219 *length = sizeof(*vtd);
4220 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4221 return vtd;
4222}
4223
4224/*
4225 * Set dirty tracking for the device list of a domain. The caller must
4226 * hold the domain->lock when calling it.
4227 */
4228static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4229{
4230 struct device_domain_info *info;
4231 int ret = 0;
4232
4233 list_for_each_entry(info, devices, link) {
4234 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4235 IOMMU_NO_PASID, enable);
4236 if (ret)
4237 break;
4238 }
4239
4240 return ret;
4241}
4242
4243static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4244 bool enable)
4245{
4246 struct dmar_domain *s1_domain;
4247 unsigned long flags;
4248 int ret;
4249
4250 spin_lock(&domain->s1_lock);
4251 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4252 spin_lock_irqsave(&s1_domain->lock, flags);
4253 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4254 spin_unlock_irqrestore(&s1_domain->lock, flags);
4255 if (ret)
4256 goto err_unwind;
4257 }
4258 spin_unlock(&domain->s1_lock);
4259 return 0;
4260
4261err_unwind:
4262 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4263 spin_lock_irqsave(&s1_domain->lock, flags);
4264 device_set_dirty_tracking(&s1_domain->devices,
4265 domain->dirty_tracking);
4266 spin_unlock_irqrestore(&s1_domain->lock, flags);
4267 }
4268 spin_unlock(&domain->s1_lock);
4269 return ret;
4270}
4271
4272static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4273 bool enable)
4274{
4275 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4276 int ret;
4277
4278 spin_lock(&dmar_domain->lock);
4279 if (dmar_domain->dirty_tracking == enable)
4280 goto out_unlock;
4281
4282 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4283 if (ret)
4284 goto err_unwind;
4285
4286 if (dmar_domain->nested_parent) {
4287 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4288 if (ret)
4289 goto err_unwind;
4290 }
4291
4292 dmar_domain->dirty_tracking = enable;
4293out_unlock:
4294 spin_unlock(&dmar_domain->lock);
4295
4296 return 0;
4297
4298err_unwind:
4299 device_set_dirty_tracking(&dmar_domain->devices,
4300 dmar_domain->dirty_tracking);
4301 spin_unlock(&dmar_domain->lock);
4302 return ret;
4303}
4304
4305static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4306 unsigned long iova, size_t size,
4307 unsigned long flags,
4308 struct iommu_dirty_bitmap *dirty)
4309{
4310 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311 unsigned long end = iova + size - 1;
4312 unsigned long pgsize;
4313
4314 /*
4315	 * The IOMMUFD core calls into a domain with dirty tracking disabled and
4316	 * no IOVA bitmap set in order to clear any dirty bits that may have been
4317	 * set in PTEs around the time dirty tracking was stopped. This ensures
4318	 * that we never inherit dirtied bits from a previous cycle.
4319 */
4320 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4321 return -EINVAL;
4322
4323 do {
4324 struct dma_pte *pte;
4325 int lvl = 0;
4326
4327 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4328 GFP_ATOMIC);
4329 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4330 if (!pte || !dma_pte_present(pte)) {
4331 iova += pgsize;
4332 continue;
4333 }
4334
4335 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4336 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4337 iova += pgsize;
4338 } while (iova < end);
4339
4340 return 0;
4341}
4342
4343static const struct iommu_dirty_ops intel_dirty_ops = {
4344 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4345 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4346};
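
/*
 * Illustrative sketch (not part of this driver): how the dirty ops above are
 * typically driven. Real users go through iommufd; the ops are invoked via
 * domain->dirty_ops here only to show the sequence. @dirty wraps the
 * caller's IOVA bitmap; a final pass with a zeroed (bitmap-less)
 * iommu_dirty_bitmap after tracking is stopped scrubs residual dirty bits,
 * as intel_iommu_read_and_clear_dirty() documents.
 */
static int example_dirty_cycle(struct iommu_domain *domain,
			       unsigned long iova, size_t size,
			       struct iommu_dirty_bitmap *dirty)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_dirty_bitmap clear = {};	/* no bitmap: clear only */
	int ret;

	ret = ops->set_dirty_tracking(domain, true);
	if (ret)
		return ret;

	/* ... let DMA run, then harvest the dirtied ranges ... */
	ret = ops->read_and_clear_dirty(domain, iova, size, 0, dirty);

	ops->set_dirty_tracking(domain, false);
	/* Final pass without a bitmap so stale dirty bits are not inherited. */
	ops->read_and_clear_dirty(domain, iova, size, 0, &clear);

	return ret;
}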
4347
4348static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4349{
4350 struct device_domain_info *info = dev_iommu_priv_get(dev);
4351 struct intel_iommu *iommu = info->iommu;
4352 struct context_entry *context;
4353
4354 spin_lock(&iommu->lock);
4355 context = iommu_context_addr(iommu, bus, devfn, 1);
4356 if (!context) {
4357 spin_unlock(&iommu->lock);
4358 return -ENOMEM;
4359 }
4360
4361 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4362 spin_unlock(&iommu->lock);
4363 return 0;
4364 }
4365
4366 copied_context_tear_down(iommu, context, bus, devfn);
4367 context_clear_entry(context);
4368 context_set_domain_id(context, FLPT_DEFAULT_DID);
4369
4370 /*
4371	 * In pass-through mode, AW must be programmed to indicate the largest
4372	 * AGAW value supported by hardware; the ASR field is ignored by hardware.
4373 */
4374 context_set_address_width(context, iommu->msagaw);
4375 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4376 context_set_fault_enable(context);
4377 context_set_present(context);
4378 if (!ecap_coherent(iommu->ecap))
4379 clflush_cache_range(context, sizeof(*context));
4380 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4381 spin_unlock(&iommu->lock);
4382
4383 return 0;
4384}
4385
4386static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4387{
4388 struct device *dev = data;
4389
4390 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4391}
4392
4393static int device_setup_pass_through(struct device *dev)
4394{
4395 struct device_domain_info *info = dev_iommu_priv_get(dev);
4396
4397 if (!dev_is_pci(dev))
4398 return context_setup_pass_through(dev, info->bus, info->devfn);
4399
4400 return pci_for_each_dma_alias(to_pci_dev(dev),
4401 context_setup_pass_through_cb, dev);
4402}
4403
4404static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4405{
4406 struct device_domain_info *info = dev_iommu_priv_get(dev);
4407 struct intel_iommu *iommu = info->iommu;
4408 int ret;
4409
4410 device_block_translation(dev);
4411
4412 if (dev_is_real_dma_subdevice(dev))
4413 return 0;
4414
4415 if (sm_supported(iommu)) {
4416 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4417 if (!ret)
4418 iommu_enable_pci_caps(info);
4419 } else {
4420 ret = device_setup_pass_through(dev);
4421 }
4422
4423 return ret;
4424}
4425
4426static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4427 struct device *dev, ioasid_t pasid,
4428 struct iommu_domain *old)
4429{
4430 struct device_domain_info *info = dev_iommu_priv_get(dev);
4431 struct intel_iommu *iommu = info->iommu;
4432 int ret;
4433
4434 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4435 return -EOPNOTSUPP;
4436
4437 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4438 if (ret)
4439 return ret;
4440
4441 domain_remove_dev_pasid(old, dev, pasid);
4442 return 0;
4443}
4444
4445static struct iommu_domain identity_domain = {
4446 .type = IOMMU_DOMAIN_IDENTITY,
4447 .ops = &(const struct iommu_domain_ops) {
4448 .attach_dev = identity_domain_attach_dev,
4449 .set_dev_pasid = identity_domain_set_dev_pasid,
4450 },
4451};
4452
4453static struct iommu_domain *intel_iommu_domain_alloc_paging(struct device *dev)
4454{
4455 struct device_domain_info *info = dev_iommu_priv_get(dev);
4456 struct intel_iommu *iommu = info->iommu;
4457 struct dmar_domain *dmar_domain;
4458 bool first_stage;
4459
4460 first_stage = first_level_by_default(iommu);
4461 dmar_domain = paging_domain_alloc(dev, first_stage);
4462 if (IS_ERR(dmar_domain))
4463 return ERR_CAST(dmar_domain);
4464
4465 return &dmar_domain->domain;
4466}
4467
4468const struct iommu_ops intel_iommu_ops = {
4469 .blocked_domain = &blocking_domain,
4470 .release_domain = &blocking_domain,
4471 .identity_domain = &identity_domain,
4472 .capable = intel_iommu_capable,
4473 .hw_info = intel_iommu_hw_info,
4474 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4475 .domain_alloc_sva = intel_svm_domain_alloc,
4476 .domain_alloc_paging = intel_iommu_domain_alloc_paging,
4477 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4478 .probe_device = intel_iommu_probe_device,
4479 .release_device = intel_iommu_release_device,
4480 .get_resv_regions = intel_iommu_get_resv_regions,
4481 .device_group = intel_iommu_device_group,
4482 .dev_enable_feat = intel_iommu_dev_enable_feat,
4483 .dev_disable_feat = intel_iommu_dev_disable_feat,
4484 .is_attach_deferred = intel_iommu_is_attach_deferred,
4485 .def_domain_type = device_def_domain_type,
4486 .remove_dev_pasid = intel_iommu_remove_dev_pasid,
4487 .pgsize_bitmap = SZ_4K,
4488 .page_response = intel_iommu_page_response,
4489 .default_domain_ops = &(const struct iommu_domain_ops) {
4490 .attach_dev = intel_iommu_attach_device,
4491 .set_dev_pasid = intel_iommu_set_dev_pasid,
4492 .map_pages = intel_iommu_map_pages,
4493 .unmap_pages = intel_iommu_unmap_pages,
4494 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4495 .flush_iotlb_all = intel_flush_iotlb_all,
4496 .iotlb_sync = intel_iommu_tlb_sync,
4497 .iova_to_phys = intel_iommu_iova_to_phys,
4498 .free = intel_iommu_domain_free,
4499 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4500 }
4501};
4502
4503static void quirk_iommu_igfx(struct pci_dev *dev)
4504{
4505 if (risky_device(dev))
4506 return;
4507
4508 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4509 disable_igfx_iommu = 1;
4510}
4511
4512/* G4x/GM45 integrated gfx dmar support is totally busted. */
4513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4514DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4515DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4516DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4517DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4518DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4519DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4520
4521/* Broadwell igfx malfunctions with dmar */
4522DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4523DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4524DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4525DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4526DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4527DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4528DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4529DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4530DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4531DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4532DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4533DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4534DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4535DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4536DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4537DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4538DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4539DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4540DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4541DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4542DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4543DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4544DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4545DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4546
4547static void quirk_iommu_rwbf(struct pci_dev *dev)
4548{
4549 if (risky_device(dev))
4550 return;
4551
4552 /*
4553 * Mobile 4 Series Chipset neglects to set RWBF capability,
4554 * but needs it. Same seems to hold for the desktop versions.
4555 */
4556 pci_info(dev, "Forcing write-buffer flush capability\n");
4557 rwbf_quirk = 1;
4558}
4559
4560DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4561DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4562DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4563DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4564DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4565DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4566DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4567
4568#define GGC 0x52
4569#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4570#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4571#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4572#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4573#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4574#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4575#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4576#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4577
4578static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4579{
4580 unsigned short ggc;
4581
4582 if (risky_device(dev))
4583 return;
4584
4585 if (pci_read_config_word(dev, GGC, &ggc))
4586 return;
4587
4588 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4589 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4590 disable_igfx_iommu = 1;
4591 } else if (!disable_igfx_iommu) {
4592 /* we have to ensure the gfx device is idle before we flush */
4593 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4594 iommu_set_dma_strict();
4595 }
4596}
4597DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4598DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4599DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4600DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
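
/*
 * Illustrative sketch (not part of this driver): decoding the GGC bits
 * defined above on one of the affected devices. It only reports whether the
 * BIOS left VT-d enabled for graphics and the raw stolen-GTT size field;
 * the function name is made up for the example.
 */
static void example_decode_ggc(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	pci_info(dev, "GGC: graphics VT-d %s, size field %#x\n",
		 (ggc & GGC_MEMORY_VT_ENABLED) ? "enabled" : "disabled",
		 ggc & GGC_MEMORY_SIZE_MASK);
}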
4601
4602static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4603{
4604 unsigned short ver;
4605
4606 if (!IS_GFX_DEVICE(dev))
4607 return;
4608
4609 ver = (dev->device >> 8) & 0xff;
4610 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4611 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4612 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4613 return;
4614
4615 if (risky_device(dev))
4616 return;
4617
4618 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4619 iommu_skip_te_disable = 1;
4620}
4621DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4622
4623/* On Tylersburg chipsets, some BIOSes have been known to enable the
4624 ISOCH DMAR unit for the Azalia sound device, but not give it any
4625 TLB entries, which causes it to deadlock. Check for that. We do
4626 this in a function called from init_dmars(), instead of in a PCI
4627 quirk, because we don't want to print the obnoxious "BIOS broken"
4628 message if VT-d is actually disabled.
4629*/
4630static void __init check_tylersburg_isoch(void)
4631{
4632 struct pci_dev *pdev;
4633 uint32_t vtisochctrl;
4634
4635 /* If there's no Azalia in the system anyway, forget it. */
4636 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4637 if (!pdev)
4638 return;
4639
4640 if (risky_device(pdev)) {
4641 pci_dev_put(pdev);
4642 return;
4643 }
4644
4645 pci_dev_put(pdev);
4646
4647 /* System Management Registers. Might be hidden, in which case
4648 we can't do the sanity check. But that's OK, because the
4649 known-broken BIOSes _don't_ actually hide it, so far. */
4650 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4651 if (!pdev)
4652 return;
4653
4654 if (risky_device(pdev)) {
4655 pci_dev_put(pdev);
4656 return;
4657 }
4658
4659 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4660 pci_dev_put(pdev);
4661 return;
4662 }
4663
4664 pci_dev_put(pdev);
4665
4666 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4667 if (vtisochctrl & 1)
4668 return;
4669
4670 /* Drop all bits other than the number of TLB entries */
4671 vtisochctrl &= 0x1c;
4672
4673 /* If we have the recommended number of TLB entries (16), fine. */
4674 if (vtisochctrl == 0x10)
4675 return;
4676
4677	/* Zero TLB entries? That BIOS is broken beyond redemption. */
4678 if (!vtisochctrl) {
4679 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4680 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4681 dmi_get_system_info(DMI_BIOS_VENDOR),
4682 dmi_get_system_info(DMI_BIOS_VERSION),
4683 dmi_get_system_info(DMI_PRODUCT_VERSION));
4684 iommu_identity_mapping |= IDENTMAP_AZALIA;
4685 return;
4686 }
4687
4688 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4689 vtisochctrl);
4690}
4691
4692/*
4693 * Here we deal with a device TLB defect where a device may inadvertently
4694 * issue an ATS invalidation completion ahead of posted writes that were
4695 * initiated with a translated address and used translations matching the
4696 * invalidation address range, violating the invalidation completion ordering.
4697 * Therefore, any use case that cannot guarantee DMA is stopped before unmap
4698 * is vulnerable to this defect. In other words, any dTLB invalidation initiated not
4699 * under the control of the trusted/privileged host device driver must use this
4700 * quirk.
4701 * Device TLBs are invalidated under the following six conditions:
4702 * 1. Device driver does DMA API unmap IOVA
4703 * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4704 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4705 * exit_mmap() due to crash
4706 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4707 * VM has to free pages that were unmapped
4708 * 5. Userspace driver unmaps a DMA buffer
4709 * 6. Cache invalidation in vSVA usage (upcoming)
4710 *
4711 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4712 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier callback
4713 * to invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4714 * The dTLB invalidation after PASID cache flush does not need this quirk.
4715 *
4716 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4717 */
4718void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4719 unsigned long address, unsigned long mask,
4720 u32 pasid, u16 qdep)
4721{
4722 u16 sid;
4723
4724 if (likely(!info->dtlb_extra_inval))
4725 return;
4726
4727 sid = PCI_DEVID(info->bus, info->devfn);
4728 if (pasid == IOMMU_NO_PASID) {
4729 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4730 qdep, address, mask);
4731 } else {
4732 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4733 pasid, qdep, address, mask);
4734 }
4735}
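
/*
 * Illustrative sketch (not part of this driver): a typical call site for the
 * quirk above. @pages is the number of 4 KiB pages being invalidated and is
 * assumed to be a power of two with @addr aligned to it, matching how the
 * regular dev-IOTLB flush helpers encode the range; the mask argument is the
 * page-count order. The function name and values are only for the example.
 */
static void example_extra_flush(struct device_domain_info *info,
				unsigned long addr, unsigned long pages)
{
	unsigned long mask = order_base_2(pages);

	quirk_extra_dev_tlb_flush(info, addr, mask, IOMMU_NO_PASID,
				  info->ats_qdep);
}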
4736
4737#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4738
4739/*
4740 * Function to submit a command to the enhanced command interface. The
4741 * valid enhanced command descriptions are defined in Table 47 of the
4742 * VT-d spec. The VT-d hardware implementation may support some but not
4743 * all commands, which can be determined by checking the Enhanced
4744 * Command Capability Register.
4745 *
4746 * Return values:
4747 * - 0: Command successful without any error;
4748 * - Negative: software error value;
4749 * - Nonzero positive: failure status code defined in Table 48.
4750 */
4751int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4752{
4753 unsigned long flags;
4754 u64 res;
4755 int ret;
4756
4757 if (!cap_ecmds(iommu->cap))
4758 return -ENODEV;
4759
4760 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4761
4762 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4763 if (res & DMA_ECMD_ECRSP_IP) {
4764 ret = -EBUSY;
4765 goto err;
4766 }
4767
4768 /*
4769 * Unconditionally write the operand B, because
4770 * - There is no side effect if an ecmd doesn't require an
4771 * operand B, but we set the register to some value.
4772 * - It's not invoked in any critical path. The extra MMIO
4773 * write doesn't bring any performance concerns.
4774 */
4775 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4776 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4777
4778 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4779 !(res & DMA_ECMD_ECRSP_IP), res);
4780
4781 if (res & DMA_ECMD_ECRSP_IP) {
4782 ret = -ETIMEDOUT;
4783 goto err;
4784 }
4785
4786 ret = ecmd_get_status_code(res);
4787err:
4788 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4789
4790 return ret;
4791}
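
/*
 * Illustrative sketch (not part of this driver): submitting an enhanced
 * command and interpreting the three classes of return values documented
 * above. DMA_ECMD_FREEZE is used purely as an example command (the perfmon
 * code issues freeze/unfreeze); a real caller would pick a command the
 * hardware advertises in the Enhanced Command Capability Register.
 */
static int example_ecmd(struct intel_iommu *iommu)
{
	int ret = ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);

	if (ret < 0)		/* software error, e.g. -EBUSY or -ETIMEDOUT */
		return ret;
	if (ret > 0) {		/* failure status code from Table 48 */
		pr_err("ecmd failed with status 0x%x\n", ret);
		return -EIO;
	}

	return 0;		/* command completed successfully */
}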