1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware supports
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
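/*
 * Editorial sketch (not part of the driver): bit k in this bitmap means a
 * page size of 2^k bytes is advertised, so ~0xFFFUL sets bits 12 and up,
 * i.e. every power-of-two size from 4KiB upwards.  A hypothetical check
 * (the helper name is made up) could look like:
 *
 *	static bool example_pgsize_ok(unsigned long size)
 *	{
 *		return is_power_of_2(size) && (size & INTEL_IOMMU_PGSIZES);
 *	}
 */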
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
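/*
 * Worked example (editorial note): for a 48-bit address width,
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2 and
 * agaw_to_width(2) = min(30 + 2 * 9, 64) = 48.  Each level resolves
 * LEVEL_STRIDE = 9 bits on top of the 12-bit page offset, so a leaf PTE
 * at level 1 maps 4KiB, level 2 maps lvl_to_nr_pages(2) = 512 pages
 * (2MiB), and level 3 maps 1GiB.
 */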
151 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
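/*
 * Editorial note: on x86 PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the
 * conversions above are identities.  The shift only matters for a
 * hypothetically larger MM page size; e.g. with 2MiB MM pages
 * (PAGE_SHIFT = 21), mm_to_dma_pfn(1) would be 1 << 9 = 512 VT-d pages.
 */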
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic the kernel if VT-d can't be enabled successfully
179 * (used when the kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
291 * This domain is a static identity mapping domain.
292 * 1. This domain creates a static 1:1 mapping to all usable memory.
293 * 2. It maps to each iommu if successful.
294 * 3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid using the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which requires iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* number of registered intel_iommus; used to size and index g_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
378 #endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
419 spin_unlock_irqrestore(&device_domain_lock, flags);
423 spin_unlock_irqrestore(&device_domain_lock, flags);
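/*
 * Illustrative use of for_each_device_domain() (editorial sketch; the
 * callback name is made up).  A non-zero return from the callback stops
 * the walk and is propagated to the caller:
 *
 *	static int example_count_cb(struct device_domain_info *info,
 *				    void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int count = 0;
 *	for_each_device_domain(example_count_cb, &count);
 */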
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
460 if (!strncmp(str, "on", 2)) {
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
491 str += strcspn(str, ",");
497 __setup("intel_iommu=", intel_iommu_setup);
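/*
 * Usage example (editorial note): the options above are parsed as
 * comma-separated tokens on the kernel command line, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables the IOMMU, opts in to scalable mode and disables batched IOTLB
 * flushing; tokens matching none of the prefixes are skipped by the
 * strcspn() loop above.
 */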
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
507 domains = iommu->domains[idx];
511 return domains[did & 0xff];
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
529 domains[did & 0xff] = domain;
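/*
 * Editorial note on the layout: iommu->domains is a two-level array
 * indexed by domain-id, with (did >> 8) selecting the outer slot and
 * (did & 0xff) the entry within it, so e.g. did 0x1234 lives in
 * iommu->domains[0x12][0x34].  The inner pages of 256 pointers are only
 * allocated on demand (see set_iommu_domain() above).
 */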
532 void *alloc_pgtable_page(int node)
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
539 vaddr = page_address(page);
543 void free_pgtable_page(void *vaddr)
545 free_page((unsigned long)vaddr);
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
594 if (test_bit(agaw, &sagaw))
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * calculate agaw for each iommu.
611 * "SAGAW" may be different across iommus; use a default agaw, and
612 * fall back to a smaller supported agaw for iommus that don't support the default.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
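/*
 * Worked example (editorial note): with DEFAULT_DOMAIN_ADDRESS_WIDTH = 57,
 * __iommu_calculate_agaw() starts at width_to_agaw(57) = 3 and walks down
 * until it finds a bit set in cap_sagaw().  An IOMMU whose SAGAW only
 * advertises 4-level tables (bit 2) therefore yields agaw 2, i.e. a
 * 48-bit domain address width.
 */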
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
628 for_each_domain_iommu(iommu_id, domain)
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
634 return g_iommus[iommu_id];
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
644 domain->iommu_coherency = 1;
646 for_each_domain_iommu(i, domain) {
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
656 /* No hardware attached; use lowest common denominator */
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
674 for_each_active_iommu(iommu, drhd) {
676 if (!ecap_sc_support(iommu->ecap)) {
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
694 if (!intel_iommu_superpage) {
698 /* set iommu_superpage to the smallest common denominator */
700 for_each_active_iommu(iommu, drhd) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
706 mask &= cap_super_page_val(iommu->cap);
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
734 if (sm_supported(iommu)) {
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
744 unsigned long phy_addr;
748 context = alloc_pgtable_page(iommu->node);
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
757 return &context[devfn];
760 static int iommu_dummy(struct device *dev)
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
766 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
767 * sub-hierarchy of a candidate PCI-PCI bridge
768 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
769 * @bridge: the candidate PCI-PCI bridge
771 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
774 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
776 struct pci_dev *pdev, *pbridge;
778 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
781 pdev = to_pci_dev(dev);
782 pbridge = to_pci_dev(bridge);
784 if (pbridge->subordinate &&
785 pbridge->subordinate->number <= pdev->bus->number &&
786 pbridge->subordinate->busn_res.end >= pdev->bus->number)
792 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
794 struct dmar_drhd_unit *drhd = NULL;
795 struct intel_iommu *iommu;
797 struct pci_dev *pdev = NULL;
801 if (iommu_dummy(dev))
804 if (dev_is_pci(dev)) {
805 struct pci_dev *pf_pdev;
807 pdev = to_pci_dev(dev);
810 /* VMD child devices currently cannot be handled individually */
811 if (is_vmd(pdev->bus))
815 /* VFs aren't listed in scope tables; we need to look up
816 * the PF instead to find the IOMMU. */
817 pf_pdev = pci_physfn(pdev);
819 segment = pci_domain_nr(pdev->bus);
820 } else if (has_acpi_companion(dev))
821 dev = &ACPI_COMPANION(dev)->dev;
824 for_each_active_iommu(iommu, drhd) {
825 if (pdev && segment != drhd->segment)
828 for_each_active_dev_scope(drhd->devices,
829 drhd->devices_cnt, i, tmp) {
831 /* For a VF use its original BDF# not that of the PF
832 * which we used for the IOMMU lookup. Strictly speaking
833 * we could do this for all PCI devices; we only need to
834 * get the BDF# from the scope table for ACPI matches. */
835 if (pdev && pdev->is_virtfn)
838 *bus = drhd->devices[i].bus;
839 *devfn = drhd->devices[i].devfn;
843 if (is_downstream_to_pci_bridge(dev, tmp))
847 if (pdev && drhd->include_all) {
849 *bus = pdev->bus->number;
850 *devfn = pdev->devfn;
861 static void domain_flush_cache(struct dmar_domain *domain,
862 void *addr, int size)
864 if (!domain->iommu_coherency)
865 clflush_cache_range(addr, size);
868 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
870 struct context_entry *context;
874 spin_lock_irqsave(&iommu->lock, flags);
875 context = iommu_context_addr(iommu, bus, devfn, 0);
877 ret = context_present(context);
878 spin_unlock_irqrestore(&iommu->lock, flags);
882 static void free_context_table(struct intel_iommu *iommu)
886 struct context_entry *context;
888 spin_lock_irqsave(&iommu->lock, flags);
889 if (!iommu->root_entry) {
892 for (i = 0; i < ROOT_ENTRY_NR; i++) {
893 context = iommu_context_addr(iommu, i, 0, 0);
895 free_pgtable_page(context);
897 if (!sm_supported(iommu))
900 context = iommu_context_addr(iommu, i, 0x80, 0);
902 free_pgtable_page(context);
905 free_pgtable_page(iommu->root_entry);
906 iommu->root_entry = NULL;
908 spin_unlock_irqrestore(&iommu->lock, flags);
911 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
912 unsigned long pfn, int *target_level)
914 struct dma_pte *parent, *pte;
915 int level = agaw_to_level(domain->agaw);
918 BUG_ON(!domain->pgd);
920 if (!domain_pfn_supported(domain, pfn))
921 /* Address beyond IOMMU's addressing capabilities. */
924 parent = domain->pgd;
929 offset = pfn_level_offset(pfn, level);
930 pte = &parent[offset];
931 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
933 if (level == *target_level)
936 if (!dma_pte_present(pte)) {
939 tmp_page = alloc_pgtable_page(domain->nid);
944 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
945 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
946 if (domain_use_first_level(domain))
947 pteval |= DMA_FL_PTE_XD;
948 if (cmpxchg64(&pte->val, 0ULL, pteval))
949 /* Someone else set it while we were thinking; use theirs. */
950 free_pgtable_page(tmp_page);
952 domain_flush_cache(domain, pte, sizeof(*pte));
957 parent = phys_to_virt(dma_pte_addr(pte));
962 *target_level = level;
967 /* return address's pte at specific level */
968 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
970 int level, int *large_page)
972 struct dma_pte *parent, *pte;
973 int total = agaw_to_level(domain->agaw);
976 parent = domain->pgd;
977 while (level <= total) {
978 offset = pfn_level_offset(pfn, total);
979 pte = &parent[offset];
983 if (!dma_pte_present(pte)) {
988 if (dma_pte_superpage(pte)) {
993 parent = phys_to_virt(dma_pte_addr(pte));
999 /* clear last level pte, a tlb flush should be followed */
1000 static void dma_pte_clear_range(struct dmar_domain *domain,
1001 unsigned long start_pfn,
1002 unsigned long last_pfn)
1004 unsigned int large_page;
1005 struct dma_pte *first_pte, *pte;
1007 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009 BUG_ON(start_pfn > last_pfn);
1011 /* we don't need lock here; nobody else touches the iova range */
1014 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1016 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1021 start_pfn += lvl_to_nr_pages(large_page);
1023 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1025 domain_flush_cache(domain, first_pte,
1026 (void *)pte - (void *)first_pte);
1028 } while (start_pfn && start_pfn <= last_pfn);
1031 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032 int retain_level, struct dma_pte *pte,
1033 unsigned long pfn, unsigned long start_pfn,
1034 unsigned long last_pfn)
1036 pfn = max(start_pfn, pfn);
1037 pte = &pte[pfn_level_offset(pfn, level)];
1040 unsigned long level_pfn;
1041 struct dma_pte *level_pte;
1043 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1046 level_pfn = pfn & level_mask(level);
1047 level_pte = phys_to_virt(dma_pte_addr(pte));
1050 dma_pte_free_level(domain, level - 1, retain_level,
1051 level_pte, level_pfn, start_pfn,
1056 * Free the page table if we're below the level we want to
1057 * retain and the range covers the entire table.
1059 if (level < retain_level && !(start_pfn > level_pfn ||
1060 last_pfn < level_pfn + level_size(level) - 1)) {
1062 domain_flush_cache(domain, pte, sizeof(*pte));
1063 free_pgtable_page(level_pte);
1066 pfn += level_size(level);
1067 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1071 * clear last level (leaf) ptes and free page table pages below the
1072 * level we wish to keep intact.
1074 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075 unsigned long start_pfn,
1076 unsigned long last_pfn,
1079 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1080 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1081 BUG_ON(start_pfn > last_pfn);
1083 dma_pte_clear_range(domain, start_pfn, last_pfn);
1085 /* We don't need lock here; nobody else touches the iova range */
1086 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087 domain->pgd, 0, start_pfn, last_pfn);
1090 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091 free_pgtable_page(domain->pgd);
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097 need to *modify* it at all. All we need to do is make a list of all the
1098 pages which can be freed just as soon as we've flushed the IOTLB and we
1099 know the hardware page-walk will no longer touch them.
1100 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1102 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1103 int level, struct dma_pte *pte,
1104 struct page *freelist)
1108 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109 pg->freelist = freelist;
1115 pte = page_address(pg);
1117 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1118 freelist = dma_pte_list_pagetables(domain, level - 1,
1121 } while (!first_pte_in_page(pte));
1126 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1127 struct dma_pte *pte, unsigned long pfn,
1128 unsigned long start_pfn,
1129 unsigned long last_pfn,
1130 struct page *freelist)
1132 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1134 pfn = max(start_pfn, pfn);
1135 pte = &pte[pfn_level_offset(pfn, level)];
1138 unsigned long level_pfn;
1140 if (!dma_pte_present(pte))
1143 level_pfn = pfn & level_mask(level);
1145 /* If range covers entire pagetable, free it */
1146 if (start_pfn <= level_pfn &&
1147 last_pfn >= level_pfn + level_size(level) - 1) {
1148 /* These subordinate page tables are going away entirely. Don't
1149 bother to clear them; we're just going to *free* them. */
1150 if (level > 1 && !dma_pte_superpage(pte))
1151 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1157 } else if (level > 1) {
1158 /* Recurse down into a level that isn't *entirely* obsolete */
1159 freelist = dma_pte_clear_level(domain, level - 1,
1160 phys_to_virt(dma_pte_addr(pte)),
1161 level_pfn, start_pfn, last_pfn,
1165 pfn += level_size(level);
1166 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1169 domain_flush_cache(domain, first_pte,
1170 (void *)++last_pte - (void *)first_pte);
1175 /* We can't just free the pages because the IOMMU may still be walking
1176 the page tables, and may have cached the intermediate levels. The
1177 pages can only be freed after the IOTLB flush has been done. */
1178 static struct page *domain_unmap(struct dmar_domain *domain,
1179 unsigned long start_pfn,
1180 unsigned long last_pfn)
1182 struct page *freelist;
1184 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1185 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1186 BUG_ON(start_pfn > last_pfn);
1188 /* we don't need lock here; nobody else touches the iova range */
1189 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1190 domain->pgd, 0, start_pfn, last_pfn, NULL);
1193 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1194 struct page *pgd_page = virt_to_page(domain->pgd);
1195 pgd_page->freelist = freelist;
1196 freelist = pgd_page;
1204 static void dma_free_pagelist(struct page *freelist)
1208 while ((pg = freelist)) {
1209 freelist = pg->freelist;
1210 free_pgtable_page(page_address(pg));
1214 static void iova_entry_free(unsigned long data)
1216 struct page *freelist = (struct page *)data;
1218 dma_free_pagelist(freelist);
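/*
 * Editorial sketch of the intended calling pattern (using functions
 * defined in this file):
 *
 *	struct page *freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	(flush the IOTLB for the range on every IOMMU in the domain)
 *	dma_free_pagelist(freelist);
 *
 * i.e. page-table pages on the freelist are only returned to the
 * allocator once no hardware walker can still reach them.
 */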
1221 /* iommu handling */
1222 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1224 struct root_entry *root;
1225 unsigned long flags;
1227 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1229 pr_err("Allocating root entry for %s failed\n",
1234 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1236 spin_lock_irqsave(&iommu->lock, flags);
1237 iommu->root_entry = root;
1238 spin_unlock_irqrestore(&iommu->lock, flags);
1243 static void iommu_set_root_entry(struct intel_iommu *iommu)
1249 addr = virt_to_phys(iommu->root_entry);
1250 if (sm_supported(iommu))
1251 addr |= DMA_RTADDR_SMT;
1253 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1256 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1258 /* Make sure hardware completes it */
1259 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260 readl, (sts & DMA_GSTS_RTPS), sts);
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1270 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1273 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1274 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1276 /* Make sure hardware completes it */
1277 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1278 readl, (!(val & DMA_GSTS_WBFS)), val);
1280 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283 /* return value determines if we need a write buffer flush */
1284 static void __iommu_flush_context(struct intel_iommu *iommu,
1285 u16 did, u16 source_id, u8 function_mask,
1292 case DMA_CCMD_GLOBAL_INVL:
1293 val = DMA_CCMD_GLOBAL_INVL;
1295 case DMA_CCMD_DOMAIN_INVL:
1296 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1298 case DMA_CCMD_DEVICE_INVL:
1299 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1300 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1305 val |= DMA_CCMD_ICC;
1307 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1308 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1310 /* Make sure hardware completes it */
1311 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1312 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1314 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1317 /* return value determines if we need a write buffer flush */
1318 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1319 u64 addr, unsigned int size_order, u64 type)
1321 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1322 u64 val = 0, val_iva = 0;
1326 case DMA_TLB_GLOBAL_FLUSH:
1327 /* global flush doesn't need to set IVA_REG */
1328 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1330 case DMA_TLB_DSI_FLUSH:
1331 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1333 case DMA_TLB_PSI_FLUSH:
1334 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1335 /* IH bit is passed in as part of address */
1336 val_iva = size_order | addr;
1341 /* Note: set drain read/write */
1344 * This is probably to be super secure. Looks like we can
1345 * ignore it without any impact.
1347 if (cap_read_drain(iommu->cap))
1348 val |= DMA_TLB_READ_DRAIN;
1350 if (cap_write_drain(iommu->cap))
1351 val |= DMA_TLB_WRITE_DRAIN;
1353 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354 /* Note: Only uses first TLB reg currently */
1356 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1357 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1359 /* Make sure hardware completes it */
1360 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1361 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1363 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1365 /* check IOTLB invalidation granularity */
1366 if (DMA_TLB_IAIG(val) == 0)
1367 pr_err("Flush IOTLB failed\n");
1368 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1369 pr_debug("TLB flush request %Lx, actual %Lx\n",
1370 (unsigned long long)DMA_TLB_IIRG(type),
1371 (unsigned long long)DMA_TLB_IAIG(val));
1374 static struct device_domain_info *
1375 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1378 struct device_domain_info *info;
1380 assert_spin_locked(&device_domain_lock);
1385 list_for_each_entry(info, &domain->devices, link)
1386 if (info->iommu == iommu && info->bus == bus &&
1387 info->devfn == devfn) {
1388 if (info->ats_supported && info->dev)
1396 static void domain_update_iotlb(struct dmar_domain *domain)
1398 struct device_domain_info *info;
1399 bool has_iotlb_device = false;
1401 assert_spin_locked(&device_domain_lock);
1403 list_for_each_entry(info, &domain->devices, link) {
1404 struct pci_dev *pdev;
1406 if (!info->dev || !dev_is_pci(info->dev))
1409 pdev = to_pci_dev(info->dev);
1410 if (pdev->ats_enabled) {
1411 has_iotlb_device = true;
1416 domain->has_iotlb_device = has_iotlb_device;
1419 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1421 struct pci_dev *pdev;
1423 assert_spin_locked(&device_domain_lock);
1425 if (!info || !dev_is_pci(info->dev))
1428 pdev = to_pci_dev(info->dev);
1429 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1430 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1431 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1432 * reserved, which should be set to 0.
1434 if (!ecap_dit(info->iommu->ecap))
1437 struct pci_dev *pf_pdev;
1439 /* pdev will be returned if device is not a vf */
1440 pf_pdev = pci_physfn(pdev);
1441 info->pfsid = pci_dev_id(pf_pdev);
1444 #ifdef CONFIG_INTEL_IOMMU_SVM
1445 /* The PCIe spec, in its wisdom, declares that the behaviour of
1446 the device if you enable PASID support after ATS support is
1447 undefined. So always enable PASID support on devices which
1448 have it, even if we can't yet know if we're ever going to
1450 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1451 info->pasid_enabled = 1;
1453 if (info->pri_supported &&
1454 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1455 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1456 info->pri_enabled = 1;
1458 if (!pdev->untrusted && info->ats_supported &&
1459 pci_ats_page_aligned(pdev) &&
1460 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1461 info->ats_enabled = 1;
1462 domain_update_iotlb(info->domain);
1463 info->ats_qdep = pci_ats_queue_depth(pdev);
1467 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1469 struct pci_dev *pdev;
1471 assert_spin_locked(&device_domain_lock);
1473 if (!dev_is_pci(info->dev))
1476 pdev = to_pci_dev(info->dev);
1478 if (info->ats_enabled) {
1479 pci_disable_ats(pdev);
1480 info->ats_enabled = 0;
1481 domain_update_iotlb(info->domain);
1483 #ifdef CONFIG_INTEL_IOMMU_SVM
1484 if (info->pri_enabled) {
1485 pci_disable_pri(pdev);
1486 info->pri_enabled = 0;
1488 if (info->pasid_enabled) {
1489 pci_disable_pasid(pdev);
1490 info->pasid_enabled = 0;
1495 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1496 u64 addr, unsigned mask)
1499 unsigned long flags;
1500 struct device_domain_info *info;
1502 if (!domain->has_iotlb_device)
1505 spin_lock_irqsave(&device_domain_lock, flags);
1506 list_for_each_entry(info, &domain->devices, link) {
1507 if (!info->ats_enabled)
1510 sid = info->bus << 8 | info->devfn;
1511 qdep = info->ats_qdep;
1512 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1515 spin_unlock_irqrestore(&device_domain_lock, flags);
1518 static void domain_flush_piotlb(struct intel_iommu *iommu,
1519 struct dmar_domain *domain,
1520 u64 addr, unsigned long npages, bool ih)
1522 u16 did = domain->iommu_did[iommu->seq_id];
1524 if (domain->default_pasid)
1525 qi_flush_piotlb(iommu, did, domain->default_pasid,
1528 if (!list_empty(&domain->devices))
1529 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533 struct dmar_domain *domain,
1534 unsigned long pfn, unsigned int pages,
1537 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539 u16 did = domain->iommu_did[iommu->seq_id];
1546 if (domain_use_first_level(domain)) {
1547 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1550 * Fall back to domain-selective flush if there is no PSI support or
1551 * the size is too big. PSI requires the page size to be 2 ^ x,
1552 * and the base address to be naturally aligned to the size.
1554 if (!cap_pgsel_inv(iommu->cap) ||
1555 mask > cap_max_amask_val(iommu->cap))
1556 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1559 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1564 * In caching mode, changes of pages from non-present to present require
1565 * a flush. However, device IOTLB doesn't need to be flushed in this case.
1567 if (!cap_caching_mode(iommu->cap) || !map)
1568 iommu_flush_dev_iotlb(domain, addr, mask);
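/*
 * Worked example (editorial note): flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, so the PSI covers
 * 2^3 = 8 pages and addr must be aligned to that size; if the mask
 * exceeds cap_max_amask_val(), the code falls back to the
 * domain-selective flush instead.
 */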
1571 /* Notification for newly created mappings */
1572 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1573 struct dmar_domain *domain,
1574 unsigned long pfn, unsigned int pages)
1577 * It's a non-present to present mapping. Only flush if caching mode
1580 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1581 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1583 iommu_flush_write_buffer(iommu);
1586 static void iommu_flush_iova(struct iova_domain *iovad)
1588 struct dmar_domain *domain;
1591 domain = container_of(iovad, struct dmar_domain, iovad);
1593 for_each_domain_iommu(idx, domain) {
1594 struct intel_iommu *iommu = g_iommus[idx];
1595 u16 did = domain->iommu_did[iommu->seq_id];
1597 if (domain_use_first_level(domain))
1598 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1600 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1603 if (!cap_caching_mode(iommu->cap))
1604 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1605 0, MAX_AGAW_PFN_WIDTH);
1609 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1612 unsigned long flags;
1614 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1617 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1618 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1619 pmen &= ~DMA_PMEN_EPM;
1620 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1622 /* wait for the protected region status bit to clear */
1623 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1624 readl, !(pmen & DMA_PMEN_PRS), pmen);
1626 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1629 static void iommu_enable_translation(struct intel_iommu *iommu)
1632 unsigned long flags;
1634 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1635 iommu->gcmd |= DMA_GCMD_TE;
1636 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1638 /* Make sure hardware completes it */
1639 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1640 readl, (sts & DMA_GSTS_TES), sts);
1642 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1645 static void iommu_disable_translation(struct intel_iommu *iommu)
1650 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1651 iommu->gcmd &= ~DMA_GCMD_TE;
1652 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1654 /* Make sure hardware completes it */
1655 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656 readl, (!(sts & DMA_GSTS_TES)), sts);
1658 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1661 static int iommu_init_domains(struct intel_iommu *iommu)
1663 u32 ndomains, nlongs;
1666 ndomains = cap_ndoms(iommu->cap);
1667 pr_debug("%s: Number of Domains supported <%d>\n",
1668 iommu->name, ndomains);
1669 nlongs = BITS_TO_LONGS(ndomains);
1671 spin_lock_init(&iommu->lock);
1673 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1674 if (!iommu->domain_ids) {
1675 pr_err("%s: Allocating domain id array failed\n",
1680 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1681 iommu->domains = kzalloc(size, GFP_KERNEL);
1683 if (iommu->domains) {
1684 size = 256 * sizeof(struct dmar_domain *);
1685 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1688 if (!iommu->domains || !iommu->domains[0]) {
1689 pr_err("%s: Allocating domain array failed\n",
1691 kfree(iommu->domain_ids);
1692 kfree(iommu->domains);
1693 iommu->domain_ids = NULL;
1694 iommu->domains = NULL;
1699 * If Caching mode is set, then invalid translations are tagged
1700 * with domain-id 0, hence we need to pre-allocate it. We also
1701 * use domain-id 0 as a marker for non-allocated domain-id, so
1702 * make sure it is not used for a real domain.
1704 set_bit(0, iommu->domain_ids);
1707 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1708 * entry for first-level or pass-through translation modes should
1709 * be programmed with a domain id different from those used for
1710 * second-level or nested translation. We reserve a domain id for
1713 if (sm_supported(iommu))
1714 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
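/*
 * Worked example (editorial note): an IOMMU advertising cap_ndoms() = 256
 * needs BITS_TO_LONGS(256) = 4 longs for domain_ids and a single outer
 * slot (ALIGN(256, 256) >> 8 = 1) in iommu->domains; one advertising the
 * maximum of 65536 domain-ids needs 256 outer slots, each lazily backed
 * by a 256-pointer page via set_iommu_domain().
 */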
1719 static void disable_dmar_iommu(struct intel_iommu *iommu)
1721 struct device_domain_info *info, *tmp;
1722 unsigned long flags;
1724 if (!iommu->domains || !iommu->domain_ids)
1727 spin_lock_irqsave(&device_domain_lock, flags);
1728 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1729 if (info->iommu != iommu)
1732 if (!info->dev || !info->domain)
1735 __dmar_remove_one_dev_info(info);
1737 spin_unlock_irqrestore(&device_domain_lock, flags);
1739 if (iommu->gcmd & DMA_GCMD_TE)
1740 iommu_disable_translation(iommu);
1743 static void free_dmar_iommu(struct intel_iommu *iommu)
1745 if ((iommu->domains) && (iommu->domain_ids)) {
1746 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1749 for (i = 0; i < elems; i++)
1750 kfree(iommu->domains[i]);
1751 kfree(iommu->domains);
1752 kfree(iommu->domain_ids);
1753 iommu->domains = NULL;
1754 iommu->domain_ids = NULL;
1757 g_iommus[iommu->seq_id] = NULL;
1759 /* free context mapping */
1760 free_context_table(iommu);
1762 #ifdef CONFIG_INTEL_IOMMU_SVM
1763 if (pasid_supported(iommu)) {
1764 if (ecap_prs(iommu->ecap))
1765 intel_svm_finish_prq(iommu);
1771 * Check and return whether first level is used by default for
1774 static bool first_level_by_default(void)
1776 struct dmar_drhd_unit *drhd;
1777 struct intel_iommu *iommu;
1778 static int first_level_support = -1;
1780 if (likely(first_level_support != -1))
1781 return first_level_support;
1783 first_level_support = 1;
1786 for_each_active_iommu(iommu, drhd) {
1787 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1788 first_level_support = 0;
1794 return first_level_support;
1797 static struct dmar_domain *alloc_domain(int flags)
1799 struct dmar_domain *domain;
1801 domain = alloc_domain_mem();
1805 memset(domain, 0, sizeof(*domain));
1806 domain->nid = NUMA_NO_NODE;
1807 domain->flags = flags;
1808 if (first_level_by_default())
1809 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1810 domain->has_iotlb_device = false;
1811 INIT_LIST_HEAD(&domain->devices);
1816 /* Must be called with iommu->lock */
1817 static int domain_attach_iommu(struct dmar_domain *domain,
1818 struct intel_iommu *iommu)
1820 unsigned long ndomains;
1823 assert_spin_locked(&device_domain_lock);
1824 assert_spin_locked(&iommu->lock);
1826 domain->iommu_refcnt[iommu->seq_id] += 1;
1827 domain->iommu_count += 1;
1828 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1829 ndomains = cap_ndoms(iommu->cap);
1830 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1832 if (num >= ndomains) {
1833 pr_err("%s: No free domain ids\n", iommu->name);
1834 domain->iommu_refcnt[iommu->seq_id] -= 1;
1835 domain->iommu_count -= 1;
1839 set_bit(num, iommu->domain_ids);
1840 set_iommu_domain(iommu, num, domain);
1842 domain->iommu_did[iommu->seq_id] = num;
1843 domain->nid = iommu->node;
1845 domain_update_iommu_cap(domain);
1851 static int domain_detach_iommu(struct dmar_domain *domain,
1852 struct intel_iommu *iommu)
1856 assert_spin_locked(&device_domain_lock);
1857 assert_spin_locked(&iommu->lock);
1859 domain->iommu_refcnt[iommu->seq_id] -= 1;
1860 count = --domain->iommu_count;
1861 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1862 num = domain->iommu_did[iommu->seq_id];
1863 clear_bit(num, iommu->domain_ids);
1864 set_iommu_domain(iommu, num, NULL);
1866 domain_update_iommu_cap(domain);
1867 domain->iommu_did[iommu->seq_id] = 0;
1873 static struct iova_domain reserved_iova_list;
1874 static struct lock_class_key reserved_rbtree_key;
1876 static int dmar_init_reserved_ranges(void)
1878 struct pci_dev *pdev = NULL;
1882 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1884 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1885 &reserved_rbtree_key);
1887 /* IOAPIC ranges shouldn't be accessed by DMA */
1888 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1889 IOVA_PFN(IOAPIC_RANGE_END));
1891 pr_err("Reserve IOAPIC range failed\n");
1895 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1896 for_each_pci_dev(pdev) {
1899 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1900 r = &pdev->resource[i];
1901 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1903 iova = reserve_iova(&reserved_iova_list,
1907 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1915 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1917 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
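/*
 * Worked example for the helper below (editorial note): the remainder r
 * is used to round the guest width up to the next multiple of the 9-bit
 * level stride above the 12-bit page offset.  For gaw = 48, r = 0 and the
 * width is kept; for gaw = 50, r = 2 and the width is rounded up to 57.
 */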
1920 static inline int guestwidth_to_adjustwidth(int gaw)
1923 int r = (gaw - 12) % 9;
1934 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1937 int adjust_width, agaw;
1938 unsigned long sagaw;
1941 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1943 if (!intel_iommu_strict) {
1944 ret = init_iova_flush_queue(&domain->iovad,
1945 iommu_flush_iova, iova_entry_free);
1947 pr_info("iova flush queue initialization failed\n");
1950 domain_reserve_special_ranges(domain);
1952 /* calculate AGAW */
1953 if (guest_width > cap_mgaw(iommu->cap))
1954 guest_width = cap_mgaw(iommu->cap);
1955 domain->gaw = guest_width;
1956 adjust_width = guestwidth_to_adjustwidth(guest_width);
1957 agaw = width_to_agaw(adjust_width);
1958 sagaw = cap_sagaw(iommu->cap);
1959 if (!test_bit(agaw, &sagaw)) {
1960 /* hardware doesn't support it, choose a bigger one */
1961 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1962 agaw = find_next_bit(&sagaw, 5, agaw);
1966 domain->agaw = agaw;
1968 if (ecap_coherent(iommu->ecap))
1969 domain->iommu_coherency = 1;
1971 domain->iommu_coherency = 0;
1973 if (ecap_sc_support(iommu->ecap))
1974 domain->iommu_snooping = 1;
1976 domain->iommu_snooping = 0;
1978 if (intel_iommu_superpage)
1979 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1981 domain->iommu_superpage = 0;
1983 domain->nid = iommu->node;
1985 /* always allocate the top pgd */
1986 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1989 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1993 static void domain_exit(struct dmar_domain *domain)
1996 /* Remove associated devices and clear attached or cached domains */
1997 domain_remove_dev_info(domain);
2000 put_iova_domain(&domain->iovad);
2003 struct page *freelist;
2005 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006 dma_free_pagelist(freelist);
2009 free_domain_mem(domain);
2013 * Get the PASID directory size for scalable mode context entry.
2014 * Value of X in the PDTS field of a scalable mode context entry
2015 * indicates PASID directory with 2^(X + 7) entries.
2017 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2021 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2022 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
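/*
 * Worked example (editorial note): a PASID table covering 1 << 20 PASIDs
 * gives max_pde = 1 << 14 (PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per
 * directory entry), so find_first_bit() returns 14 and, per the
 * 2^(X + 7) encoding described above, the PDTS value is 7, meaning a
 * directory of 16384 entries.
 */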
2030 * Set the RID_PASID field of a scalable mode context entry. The
2031 * IOMMU hardware will use the PASID value set in this field for
2032 * DMA translations of DMA requests without PASID.
2035 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2037 context->hi |= pasid & ((1 << 20) - 1);
2038 context->hi |= (1 << 20);
2042 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2045 static inline void context_set_sm_dte(struct context_entry *context)
2047 context->lo |= (1 << 2);
2051 * Set the PRE(Page Request Enable) field of a scalable mode context
2054 static inline void context_set_sm_pre(struct context_entry *context)
2056 context->lo |= (1 << 4);
2059 /* Convert value to context PASID directory size field coding. */
2060 #define context_pdts(pds) (((pds) & 0x7) << 9)
2062 static int domain_context_mapping_one(struct dmar_domain *domain,
2063 struct intel_iommu *iommu,
2064 struct pasid_table *table,
2067 u16 did = domain->iommu_did[iommu->seq_id];
2068 int translation = CONTEXT_TT_MULTI_LEVEL;
2069 struct device_domain_info *info = NULL;
2070 struct context_entry *context;
2071 unsigned long flags;
2076 if (hw_pass_through && domain_type_is_si(domain))
2077 translation = CONTEXT_TT_PASS_THROUGH;
2079 pr_debug("Set context mapping for %02x:%02x.%d\n",
2080 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2082 BUG_ON(!domain->pgd);
2084 spin_lock_irqsave(&device_domain_lock, flags);
2085 spin_lock(&iommu->lock);
2088 context = iommu_context_addr(iommu, bus, devfn, 1);
2093 if (context_present(context))
2097 * For kdump cases, old valid entries may be cached due to the
2098 * in-flight DMA and copied pgtable, but there is no unmapping
2099 * behaviour for them, thus we need an explicit cache flush for
2100 * the newly-mapped device. For kdump, at this point, the device
2101 * is supposed to finish reset at its driver probe stage, so no
2102 * in-flight DMA will exist, and we don't need to worry anymore
2105 if (context_copied(context)) {
2106 u16 did_old = context_domain_id(context);
2108 if (did_old < cap_ndoms(iommu->cap)) {
2109 iommu->flush.flush_context(iommu, did_old,
2110 (((u16)bus) << 8) | devfn,
2111 DMA_CCMD_MASK_NOBIT,
2112 DMA_CCMD_DEVICE_INVL);
2113 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2118 context_clear_entry(context);
2120 if (sm_supported(iommu)) {
2125 /* Setup the PASID DIR pointer: */
2126 pds = context_get_sm_pds(table);
2127 context->lo = (u64)virt_to_phys(table->table) |
2130 /* Setup the RID_PASID field: */
2131 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2134 * Setup the Device-TLB enable bit and Page request
2137 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2138 if (info && info->ats_supported)
2139 context_set_sm_dte(context);
2140 if (info && info->pri_supported)
2141 context_set_sm_pre(context);
2143 struct dma_pte *pgd = domain->pgd;
2146 context_set_domain_id(context, did);
2148 if (translation != CONTEXT_TT_PASS_THROUGH) {
2150 * Skip top levels of page tables for an iommu which has
2151 * a smaller agaw than the default. Unnecessary for PT mode.
2153 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2155 pgd = phys_to_virt(dma_pte_addr(pgd));
2156 if (!dma_pte_present(pgd))
2160 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2161 if (info && info->ats_supported)
2162 translation = CONTEXT_TT_DEV_IOTLB;
2164 translation = CONTEXT_TT_MULTI_LEVEL;
2166 context_set_address_root(context, virt_to_phys(pgd));
2167 context_set_address_width(context, agaw);
2170 * In pass through mode, AW must be programmed to
2171 * indicate the largest AGAW value supported by
2172 * hardware. And ASR is ignored by hardware.
2174 context_set_address_width(context, iommu->msagaw);
2177 context_set_translation_type(context, translation);
2180 context_set_fault_enable(context);
2181 context_set_present(context);
2182 domain_flush_cache(domain, context, sizeof(*context));
2185 * It's a non-present to present mapping. If hardware doesn't cache
2186 * non-present entries we only need to flush the write-buffer. If the hardware
2187 * _does_ cache non-present entries, then it does so in the special
2188 * domain #0, which we have to flush:
2190 if (cap_caching_mode(iommu->cap)) {
2191 iommu->flush.flush_context(iommu, 0,
2192 (((u16)bus) << 8) | devfn,
2193 DMA_CCMD_MASK_NOBIT,
2194 DMA_CCMD_DEVICE_INVL);
2195 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2197 iommu_flush_write_buffer(iommu);
2199 iommu_enable_dev_iotlb(info);
2204 spin_unlock(&iommu->lock);
2205 spin_unlock_irqrestore(&device_domain_lock, flags);
2210 struct domain_context_mapping_data {
2211 struct dmar_domain *domain;
2212 struct intel_iommu *iommu;
2213 struct pasid_table *table;
2216 static int domain_context_mapping_cb(struct pci_dev *pdev,
2217 u16 alias, void *opaque)
2219 struct domain_context_mapping_data *data = opaque;
2221 return domain_context_mapping_one(data->domain, data->iommu,
2222 data->table, PCI_BUS_NUM(alias),
2227 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2229 struct domain_context_mapping_data data;
2230 struct pasid_table *table;
2231 struct intel_iommu *iommu;
2234 iommu = device_to_iommu(dev, &bus, &devfn);
2238 table = intel_pasid_get_table(dev);
2240 if (!dev_is_pci(dev))
2241 return domain_context_mapping_one(domain, iommu, table,
2244 data.domain = domain;
2248 return pci_for_each_dma_alias(to_pci_dev(dev),
2249 &domain_context_mapping_cb, &data);
2252 static int domain_context_mapped_cb(struct pci_dev *pdev,
2253 u16 alias, void *opaque)
2255 struct intel_iommu *iommu = opaque;
2257 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2260 static int domain_context_mapped(struct device *dev)
2262 struct intel_iommu *iommu;
2265 iommu = device_to_iommu(dev, &bus, &devfn);
2269 if (!dev_is_pci(dev))
2270 return device_context_mapped(iommu, bus, devfn);
2272 return !pci_for_each_dma_alias(to_pci_dev(dev),
2273 domain_context_mapped_cb, iommu);
2276 /* Returns a number of VTD pages, but aligned to MM page size */
2277 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2280 host_addr &= ~PAGE_MASK;
2281 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
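/*
 * Worked example (editorial note): mapping size = 0x100 bytes at
 * host_addr = 0x1ff0 leaves a page offset of 0xff0, and
 * PAGE_ALIGN(0xff0 + 0x100) >> VTD_PAGE_SHIFT == 2, i.e. the buffer
 * straddles two 4KiB VT-d pages even though it is only 256 bytes long.
 */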
2284 /* Return largest possible superpage level for a given mapping */
2285 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2286 unsigned long iov_pfn,
2287 unsigned long phy_pfn,
2288 unsigned long pages)
2290 int support, level = 1;
2291 unsigned long pfnmerge;
2293 support = domain->iommu_superpage;
2295 /* To use a large page, the virtual *and* physical addresses
2296 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2297 of them will mean we have to use smaller pages. So just
2298 merge them and check both at once. */
2299 pfnmerge = iov_pfn | phy_pfn;
2301 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2302 pages >>= VTD_STRIDE_SHIFT;
2305 pfnmerge >>= VTD_STRIDE_SHIFT;
2312 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2313 struct scatterlist *sg, unsigned long phys_pfn,
2314 unsigned long nr_pages, int prot)
2316 struct dma_pte *first_pte = NULL, *pte = NULL;
2317 phys_addr_t uninitialized_var(pteval);
2318 unsigned long sg_res = 0;
2319 unsigned int largepage_lvl = 0;
2320 unsigned long lvl_pages = 0;
2323 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2325 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2328 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2329 if (domain_use_first_level(domain))
2330 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2334 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2337 while (nr_pages > 0) {
2341 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2343 sg_res = aligned_nrpages(sg->offset, sg->length);
2344 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2345 sg->dma_length = sg->length;
2346 pteval = (sg_phys(sg) - pgoff) | attr;
2347 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2351 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2353 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2356 /* It is a large page */
2357 if (largepage_lvl > 1) {
2358 unsigned long nr_superpages, end_pfn;
2360 pteval |= DMA_PTE_LARGE_PAGE;
2361 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2363 nr_superpages = sg_res / lvl_pages;
2364 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2367 * Ensure that old small page tables are
2368 * removed to make room for superpage(s).
2369 * We're adding new large pages, so make sure
2370 * we don't remove their parent tables.
2372 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2375 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2379 /* We don't need a lock here; nobody else
2380 * touches the iova range. */
2382 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2384 static int dumps = 5;
2385 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2386 iov_pfn, tmp, (unsigned long long)pteval);
2389 debug_dma_dump_mappings(NULL);
2394 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2396 BUG_ON(nr_pages < lvl_pages);
2397 BUG_ON(sg_res < lvl_pages);
2399 nr_pages -= lvl_pages;
2400 iov_pfn += lvl_pages;
2401 phys_pfn += lvl_pages;
2402 pteval += lvl_pages * VTD_PAGE_SIZE;
2403 sg_res -= lvl_pages;
2405 /* If the next PTE would be the first in a new page, then we
2406 need to flush the cache on the entries we've just written.
2407 And then we'll need to recalculate 'pte', so clear it and
2408 let it get set again in the if (!pte) block above.
2410 If we're done (!nr_pages) we need to flush the cache too.
2412 Also if we've been setting superpages, we may need to
2413 recalculate 'pte' and switch back to smaller pages for the
2414 end of the mapping, if the trailing size is not enough to
2415 use another superpage (i.e. sg_res < lvl_pages). */
2417 if (!nr_pages || first_pte_in_page(pte) ||
2418 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2419 domain_flush_cache(domain, first_pte,
2420 (void *)pte - (void *)first_pte);
2424 if (!sg_res && nr_pages)
2430 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2431 struct scatterlist *sg, unsigned long phys_pfn,
2432 unsigned long nr_pages, int prot)
2435 struct intel_iommu *iommu;
2437 /* Do the real mapping first */
2438 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2442 for_each_domain_iommu(iommu_id, domain) {
2443 iommu = g_iommus[iommu_id];
2444 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2450 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2451 struct scatterlist *sg, unsigned long nr_pages,
2454 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2457 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2458 unsigned long phys_pfn, unsigned long nr_pages,
2461 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
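/*
 * Clear the context entry for one bus/devfn and invalidate the context
 * cache and IOTLB entries that referenced it.
 */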
2464 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2466 unsigned long flags;
2467 struct context_entry *context;
2473 spin_lock_irqsave(&iommu->lock, flags);
2474 context = iommu_context_addr(iommu, bus, devfn, 0);
2476 spin_unlock_irqrestore(&iommu->lock, flags);
2479 did_old = context_domain_id(context);
2480 context_clear_entry(context);
2481 __iommu_flush_cache(iommu, context, sizeof(*context));
2482 spin_unlock_irqrestore(&iommu->lock, flags);
2483 iommu->flush.flush_context(iommu,
2485 (((u16)bus) << 8) | devfn,
2486 DMA_CCMD_MASK_NOBIT,
2487 DMA_CCMD_DEVICE_INVL);
2488 iommu->flush.flush_iotlb(iommu,
2495 static inline void unlink_domain_info(struct device_domain_info *info)
2497 assert_spin_locked(&device_domain_lock);
2498 list_del(&info->link);
2499 list_del(&info->global);
2501 info->dev->archdata.iommu = NULL;
2504 static void domain_remove_dev_info(struct dmar_domain *domain)
2506 struct device_domain_info *info, *tmp;
2507 unsigned long flags;
2509 spin_lock_irqsave(&device_domain_lock, flags);
2510 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2511 __dmar_remove_one_dev_info(info);
2512 spin_unlock_irqrestore(&device_domain_lock, flags);
2515 struct dmar_domain *find_domain(struct device *dev)
2517 struct device_domain_info *info;
2519 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2520 dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2523 /* No lock here, assumes no domain exit in normal case */
2524 info = dev->archdata.iommu;
2526 return info->domain;
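/*
 * If attachment was deferred at probe time, attach the device to its
 * default iommu_domain now, then return whatever domain the device ends
 * up with.
 */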
2531 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2533 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2534 struct iommu_domain *domain;
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2539 intel_iommu_attach_device(domain, dev);
2542 return find_domain(dev);
2545 static inline struct device_domain_info *
2546 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2548 struct device_domain_info *info;
2550 list_for_each_entry(info, &device_domain_list, global)
2551 if (info->iommu->segment == segment && info->bus == bus &&
2552 info->devfn == devfn)
2558 static int domain_setup_first_level(struct intel_iommu *iommu,
2559 struct dmar_domain *domain,
2563 int flags = PASID_FLAG_SUPERVISOR_MODE;
2564 struct dma_pte *pgd = domain->pgd;
2568 * Skip top levels of page tables for an iommu which has
2569 * less agaw than the default. Unnecessary for PT mode.
2571 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2572 pgd = phys_to_virt(dma_pte_addr(pgd));
2573 if (!dma_pte_present(pgd))
2577 level = agaw_to_level(agaw);
2578 if (level != 4 && level != 5)
2581 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2583 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2584 domain->iommu_did[iommu->seq_id],
2588 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2591 struct dmar_domain *domain)
2593 struct dmar_domain *found = NULL;
2594 struct device_domain_info *info;
2595 unsigned long flags;
2598 info = alloc_devinfo_mem();
2603 info->devfn = devfn;
2604 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2605 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2608 info->domain = domain;
2609 info->iommu = iommu;
2610 info->pasid_table = NULL;
2611 info->auxd_enabled = 0;
2612 INIT_LIST_HEAD(&info->auxiliary_domains);
2614 if (dev && dev_is_pci(dev)) {
2615 struct pci_dev *pdev = to_pci_dev(info->dev);
2617 if (!pdev->untrusted &&
2618 !pci_ats_disabled() &&
2619 ecap_dev_iotlb_support(iommu->ecap) &&
2620 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2621 dmar_find_matched_atsr_unit(pdev))
2622 info->ats_supported = 1;
2624 if (sm_supported(iommu)) {
2625 if (pasid_supported(iommu)) {
2626 int features = pci_pasid_features(pdev);
2628 info->pasid_supported = features | 1;
2631 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2632 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2633 info->pri_supported = 1;
2637 spin_lock_irqsave(&device_domain_lock, flags);
2639 found = find_domain(dev);
2642 struct device_domain_info *info2;
2643 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2645 found = info2->domain;
2651 spin_unlock_irqrestore(&device_domain_lock, flags);
2652 free_devinfo_mem(info);
2653 /* Caller must free the original domain */
2657 spin_lock(&iommu->lock);
2658 ret = domain_attach_iommu(domain, iommu);
2659 spin_unlock(&iommu->lock);
2662 spin_unlock_irqrestore(&device_domain_lock, flags);
2663 free_devinfo_mem(info);
2667 list_add(&info->link, &domain->devices);
2668 list_add(&info->global, &device_domain_list);
2670 dev->archdata.iommu = info;
2671 spin_unlock_irqrestore(&device_domain_lock, flags);
2673 /* PASID table is mandatory for a PCI device in scalable mode. */
2674 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2675 ret = intel_pasid_alloc_table(dev);
2677 dev_err(dev, "PASID table allocation failed\n");
2678 dmar_remove_one_dev_info(dev);
2682 /* Setup the PASID entry for requests without PASID: */
2683 spin_lock(&iommu->lock);
2684 if (hw_pass_through && domain_type_is_si(domain))
2685 ret = intel_pasid_setup_pass_through(iommu, domain,
2686 dev, PASID_RID2PASID);
2687 else if (domain_use_first_level(domain))
2688 ret = domain_setup_first_level(iommu, domain, dev,
2691 ret = intel_pasid_setup_second_level(iommu, domain,
2692 dev, PASID_RID2PASID);
2693 spin_unlock(&iommu->lock);
2695 dev_err(dev, "Setup RID2PASID failed\n");
2696 dmar_remove_one_dev_info(dev);
2701 if (dev && domain_context_mapping(domain, dev)) {
2702 dev_err(dev, "Domain context map failed\n");
2703 dmar_remove_one_dev_info(dev);
2710 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2712 *(u16 *)opaque = alias;
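/*
 * Find an existing domain through the device's last DMA alias, or allocate
 * and initialize a new one with the requested guest address width.
 */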
2716 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2718 struct device_domain_info *info;
2719 struct dmar_domain *domain = NULL;
2720 struct intel_iommu *iommu;
2722 unsigned long flags;
2725 iommu = device_to_iommu(dev, &bus, &devfn);
2729 if (dev_is_pci(dev)) {
2730 struct pci_dev *pdev = to_pci_dev(dev);
2732 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2734 spin_lock_irqsave(&device_domain_lock, flags);
2735 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2736 PCI_BUS_NUM(dma_alias),
2739 iommu = info->iommu;
2740 domain = info->domain;
2742 spin_unlock_irqrestore(&device_domain_lock, flags);
2744 /* DMA alias already has a domain, use it */
2749 /* Allocate and initialize new domain for the device */
2750 domain = alloc_domain(0);
2753 if (domain_init(domain, iommu, gaw)) {
2754 domain_exit(domain);
2762 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2763 struct dmar_domain *domain)
2765 struct intel_iommu *iommu;
2766 struct dmar_domain *tmp;
2767 u16 req_id, dma_alias;
2770 iommu = device_to_iommu(dev, &bus, &devfn);
2774 req_id = ((u16)bus << 8) | devfn;
2776 if (dev_is_pci(dev)) {
2777 struct pci_dev *pdev = to_pci_dev(dev);
2779 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2781 /* register PCI DMA alias device */
2782 if (req_id != dma_alias) {
2783 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2784 dma_alias & 0xff, NULL, domain);
2786 if (!tmp || tmp != domain)
2791 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2792 if (!tmp || tmp != domain)
2798 static int iommu_domain_identity_map(struct dmar_domain *domain,
2799 unsigned long long start,
2800 unsigned long long end)
2802 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2803 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2805 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2806 dma_to_mm_pfn(last_vpfn))) {
2807 pr_err("Reserving iova failed\n");
2811 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2813 * RMRR range might have overlap with physical memory range, clear it first
2816 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2818 return __domain_mapping(domain, first_vpfn, NULL,
2819 first_vpfn, last_vpfn - first_vpfn + 1,
2820 DMA_PTE_READ|DMA_PTE_WRITE);
2823 static int domain_prepare_identity_map(struct device *dev,
2824 struct dmar_domain *domain,
2825 unsigned long long start,
2826 unsigned long long end)
2828 /* For _hardware_ passthrough, don't bother. But for software
2829 passthrough, we do it anyway -- it may indicate a memory
2830 range which is reserved in E820 and so didn't get set
2831 up to start with in si_domain */
2832 if (domain == si_domain && hw_pass_through) {
2833 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2838 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2841 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2842 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2843 dmi_get_system_info(DMI_BIOS_VENDOR),
2844 dmi_get_system_info(DMI_BIOS_VERSION),
2845 dmi_get_system_info(DMI_PRODUCT_VERSION));
2849 if (end >> agaw_to_width(domain->agaw)) {
2850 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2851 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2852 agaw_to_width(domain->agaw),
2853 dmi_get_system_info(DMI_BIOS_VENDOR),
2854 dmi_get_system_info(DMI_BIOS_VERSION),
2855 dmi_get_system_info(DMI_PRODUCT_VERSION));
2859 return iommu_domain_identity_map(domain, start, end);
2862 static int md_domain_init(struct dmar_domain *domain, int guest_width);
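/*
 * Build the static identity domain: 1:1 map every usable physical memory
 * range and every RMRR region so that devices placed in passthrough mode
 * keep working.
 */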
2864 static int __init si_domain_init(int hw)
2866 struct dmar_rmrr_unit *rmrr;
2870 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2874 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2875 domain_exit(si_domain);
2882 for_each_online_node(nid) {
2883 unsigned long start_pfn, end_pfn;
2886 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2887 ret = iommu_domain_identity_map(si_domain,
2888 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2895 * Identity map the RMRRs so that devices with RMRRs could also use the si_domain.
2898 for_each_rmrr_units(rmrr) {
2899 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2901 unsigned long long start = rmrr->base_address;
2902 unsigned long long end = rmrr->end_address;
2904 if (WARN_ON(end < start ||
2905 end >> agaw_to_width(si_domain->agaw)))
2908 ret = iommu_domain_identity_map(si_domain, start, end);
2917 static int identity_mapping(struct device *dev)
2919 struct device_domain_info *info;
2921 info = dev->archdata.iommu;
2922 if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2923 return (info->domain == si_domain);
2928 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2930 struct dmar_domain *ndomain;
2931 struct intel_iommu *iommu;
2934 iommu = device_to_iommu(dev, &bus, &devfn);
2938 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2939 if (ndomain != domain)
2945 static bool device_has_rmrr(struct device *dev)
2947 struct dmar_rmrr_unit *rmrr;
2952 for_each_rmrr_units(rmrr) {
2954 * Return TRUE if this RMRR contains the device that is passed in.
2957 for_each_active_dev_scope(rmrr->devices,
2958 rmrr->devices_cnt, i, tmp)
2960 is_downstream_to_pci_bridge(dev, tmp)) {
2970 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2971 * is relaxable (i.e. is allowed to be not enforced under some conditions)
2972 * @dev: device handle
2974 * We assume that PCI USB devices with RMRRs have them largely
2975 * for historical reasons and that the RMRR space is not actively used post
2976 * boot. This exclusion may change if vendors begin to abuse it.
2978 * The same exception is made for graphics devices, with the requirement that
2979 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2982 * Return: true if the RMRR is relaxable, false otherwise
2984 static bool device_rmrr_is_relaxable(struct device *dev)
2986 struct pci_dev *pdev;
2988 if (!dev_is_pci(dev))
2991 pdev = to_pci_dev(dev);
2992 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2999 * There are a couple cases where we need to restrict the functionality of
3000 * devices associated with RMRRs. The first is when evaluating a device for
3001 * identity mapping because problems exist when devices are moved in and out
3002 * of domains and their respective RMRR information is lost. This means that
3003 * a device with associated RMRRs will never be in a "passthrough" domain.
3004 * The second is use of the device through the IOMMU API. This interface
3005 * expects to have full control of the IOVA space for the device. We cannot
3006 * satisfy both the requirement that RMRR access is maintained and have an
3007 * unencumbered IOVA space. We also have no ability to quiesce the device's
3008 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3009 * We therefore prevent devices associated with an RMRR from participating in
3010 * the IOMMU API, which eliminates them from device assignment.
3012 * In both cases, devices which have relaxable RMRRs are not concerned by this
3013 * restriction. See device_rmrr_is_relaxable comment.
3015 static bool device_is_rmrr_locked(struct device *dev)
3017 if (!device_has_rmrr(dev))
3020 if (device_rmrr_is_relaxable(dev))
3027 * Return the required default domain type for a specific device.
3029 * @dev: the device in query
3030 * @startup: true if this is during early boot
3033 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3034 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3035 * - 0: both identity and dynamic domains work for this device
3037 static int device_def_domain_type(struct device *dev)
3039 if (dev_is_pci(dev)) {
3040 struct pci_dev *pdev = to_pci_dev(dev);
3043 * Prevent any device marked as untrusted from getting
3044 * placed into the statically identity mapping domain.
3046 if (pdev->untrusted)
3047 return IOMMU_DOMAIN_DMA;
3049 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3052 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3053 return IOMMU_DOMAIN_IDENTITY;
3056 * We want to start off with all devices in the 1:1 domain, and
3057 * take them out later if we find they can't access all of memory.
3059 * However, we can't do this for PCI devices behind bridges,
3060 * because all PCI devices behind the same bridge will end up
3061 * with the same source-id on their transactions.
3063 * Practically speaking, we can't change things around for these
3064 * devices at run-time, because we can't be sure there'll be no
3065 * DMA transactions in flight for any of their siblings.
3067 * So PCI devices (unless they're on the root bus) as well as
3068 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3069 * the 1:1 domain, just in _case_ one of their siblings turns out
3070 * not to be able to map all of memory.
3072 if (!pci_is_pcie(pdev)) {
3073 if (!pci_is_root_bus(pdev->bus))
3074 return IOMMU_DOMAIN_DMA;
3075 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3076 return IOMMU_DOMAIN_DMA;
3077 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3078 return IOMMU_DOMAIN_DMA;
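/*
 * Select the invalidation backend for @iommu: queued invalidation when it
 * can be enabled, register-based invalidation otherwise.
 */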
3084 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3087 * Start from the sane iommu hardware state.
3088 * If the queued invalidation is already initialized by us
3089 * (for example, while enabling interrupt-remapping) then
3090 * we already have things rolling from a sane state.
3094 * Clear any previous faults.
3096 dmar_fault(-1, iommu);
3098 * Disable queued invalidation if supported and already enabled
3099 * before OS handover.
3101 dmar_disable_qi(iommu);
3104 if (dmar_enable_qi(iommu)) {
3106 * Queued Invalidate not enabled, use Register Based Invalidate
3108 iommu->flush.flush_context = __iommu_flush_context;
3109 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3110 pr_info("%s: Using Register based invalidation\n",
3113 iommu->flush.flush_context = qi_flush_context;
3114 iommu->flush.flush_iotlb = qi_flush_iotlb;
3115 pr_info("%s: Using Queued invalidation\n", iommu->name);
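/*
 * Copy one bus worth of context entries from the previous kernel's tables
 * (kdump case), marking each copied entry and reserving its domain ID.
 */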
3119 static int copy_context_table(struct intel_iommu *iommu,
3120 struct root_entry *old_re,
3121 struct context_entry **tbl,
3124 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3125 struct context_entry *new_ce = NULL, ce;
3126 struct context_entry *old_ce = NULL;
3127 struct root_entry re;
3128 phys_addr_t old_ce_phys;
3130 tbl_idx = ext ? bus * 2 : bus;
3131 memcpy(&re, old_re, sizeof(re));
3133 for (devfn = 0; devfn < 256; devfn++) {
3134 /* First calculate the correct index */
3135 idx = (ext ? devfn * 2 : devfn) % 256;
3138 /* First save what we may have and clean up */
3140 tbl[tbl_idx] = new_ce;
3141 __iommu_flush_cache(iommu, new_ce,
3151 old_ce_phys = root_entry_lctp(&re);
3153 old_ce_phys = root_entry_uctp(&re);
3156 if (ext && devfn == 0) {
3157 /* No LCTP, try UCTP */
3166 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3171 new_ce = alloc_pgtable_page(iommu->node);
3178 /* Now copy the context entry */
3179 memcpy(&ce, old_ce + idx, sizeof(ce));
3181 if (!__context_present(&ce))
3184 did = context_domain_id(&ce);
3185 if (did >= 0 && did < cap_ndoms(iommu->cap))
3186 set_bit(did, iommu->domain_ids);
3189 * We need a marker for copied context entries. This
3190 * marker needs to work for the old format as well as
3191 * for extended context entries.
3193 * Bit 67 of the context entry is used. In the old
3194 * format this bit is available to software, in the
3195 * extended format it is the PGE bit, but PGE is ignored
3196 * by HW if PASIDs are disabled (and thus still available).
3199 * So disable PASIDs first and then mark the entry
3200 * copied. This means that we don't copy PASID
3201 * translations from the old kernel, but this is fine as
3202 * faults there are not fatal.
3204 context_clear_pasid_enable(&ce);
3205 context_set_copied(&ce);
3210 tbl[tbl_idx + pos] = new_ce;
3212 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
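/*
 * Copy the root table and all context tables left over from the previous
 * kernel and point the new root entries at the copies. Bail out if the
 * extended-root-table setting would have to change, since that can only be
 * done with translation disabled.
 */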
3221 static int copy_translation_tables(struct intel_iommu *iommu)
3223 struct context_entry **ctxt_tbls;
3224 struct root_entry *old_rt;
3225 phys_addr_t old_rt_phys;
3226 int ctxt_table_entries;
3227 unsigned long flags;
3232 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3233 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3234 new_ext = !!ecap_ecs(iommu->ecap);
3237 * The RTT bit can only be changed when translation is disabled,
3238 * but disabling translation means opening a window for data
3239 * corruption. So bail out and don't copy anything if we would
3240 * have to change the bit.
3245 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3249 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3253 /* This is too big for the stack - allocate it from slab */
3254 ctxt_table_entries = ext ? 512 : 256;
3256 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3260 for (bus = 0; bus < 256; bus++) {
3261 ret = copy_context_table(iommu, &old_rt[bus],
3262 ctxt_tbls, bus, ext);
3264 pr_err("%s: Failed to copy context table for bus %d\n",
3270 spin_lock_irqsave(&iommu->lock, flags);
3272 /* Context tables are copied, now write them to the root_entry table */
3273 for (bus = 0; bus < 256; bus++) {
3274 int idx = ext ? bus * 2 : bus;
3277 if (ctxt_tbls[idx]) {
3278 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3279 iommu->root_entry[bus].lo = val;
3282 if (!ext || !ctxt_tbls[idx + 1])
3285 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3286 iommu->root_entry[bus].hi = val;
3289 spin_unlock_irqrestore(&iommu->lock, flags);
3293 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
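/*
 * Boot-time initialization: allocate the global IOMMU array, set up queued
 * invalidation, root and context tables (copying them from a previous
 * kernel when possible), the si_domain, and the fault/page-request
 * interrupts for every DMAR unit.
 */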
3303 static int __init init_dmars(void)
3305 struct dmar_drhd_unit *drhd;
3306 struct intel_iommu *iommu;
3312 * initialize and program root entry to not present
3315 for_each_drhd_unit(drhd) {
3317 * lock not needed as this is only incremented in the single
3318 * threaded kernel __init code path; all other accesses are read-only
3321 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3325 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3328 /* Preallocate enough resources for IOMMU hot-addition */
3329 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3330 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3332 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3335 pr_err("Allocating global iommu array failed\n");
3340 for_each_iommu(iommu, drhd) {
3341 if (drhd->ignored) {
3342 iommu_disable_translation(iommu);
3347 * Find the max pasid size of all IOMMU's in the system.
3348 * We need to ensure the system pasid table is no bigger
3349 * than the smallest supported.
3351 if (pasid_supported(iommu)) {
3352 u32 temp = 2 << ecap_pss(iommu->ecap);
3354 intel_pasid_max_id = min_t(u32, temp,
3355 intel_pasid_max_id);
3358 g_iommus[iommu->seq_id] = iommu;
3360 intel_iommu_init_qi(iommu);
3362 ret = iommu_init_domains(iommu);
3366 init_translation_status(iommu);
3368 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3369 iommu_disable_translation(iommu);
3370 clear_translation_pre_enabled(iommu);
3371 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3377 * we could share the same root & context tables
3378 * among all IOMMUs. Need to split it later.
3380 ret = iommu_alloc_root_entry(iommu);
3384 if (translation_pre_enabled(iommu)) {
3385 pr_info("Translation already enabled - trying to copy translation structures\n");
3387 ret = copy_translation_tables(iommu);
3390 * We found the IOMMU with translation
3391 * enabled - but failed to copy over the
3392 * old root-entry table. Try to proceed
3393 * by disabling translation now and
3394 * allocating a clean root-entry table.
3395 * This might cause DMAR faults, but
3396 * probably the dump will still succeed.
3398 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3400 iommu_disable_translation(iommu);
3401 clear_translation_pre_enabled(iommu);
3403 pr_info("Copied translation tables from previous kernel for %s\n",
3408 if (!ecap_pass_through(iommu->ecap))
3409 hw_pass_through = 0;
3410 intel_svm_check(iommu);
3414 * Now that qi is enabled on all iommus, set the root entry and flush
3415 * caches. This is required on some Intel X58 chipsets, otherwise the
3416 * flush_context function will loop forever and the boot hangs.
3418 for_each_active_iommu(iommu, drhd) {
3419 iommu_flush_write_buffer(iommu);
3420 iommu_set_root_entry(iommu);
3421 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3422 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3425 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3430 iommu_identity_mapping |= IDENTMAP_GFX;
3432 check_tylersburg_isoch();
3434 ret = si_domain_init(hw_pass_through);
3441 * global invalidate context cache
3442 * global invalidate iotlb
3443 * enable translation
3445 for_each_iommu(iommu, drhd) {
3446 if (drhd->ignored) {
3448 * we always have to disable PMRs or DMA may fail on this device
3452 iommu_disable_protect_mem_regions(iommu);
3456 iommu_flush_write_buffer(iommu);
3458 #ifdef CONFIG_INTEL_IOMMU_SVM
3459 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3461 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3462 * could cause a possible lock race condition.
3464 up_write(&dmar_global_lock);
3465 ret = intel_svm_enable_prq(iommu);
3466 down_write(&dmar_global_lock);
3471 ret = dmar_set_interrupt(iommu);
3479 for_each_active_iommu(iommu, drhd) {
3480 disable_dmar_iommu(iommu);
3481 free_dmar_iommu(iommu);
3490 /* This takes a number of _MM_ pages, not VTD pages */
3491 static unsigned long intel_alloc_iova(struct device *dev,
3492 struct dmar_domain *domain,
3493 unsigned long nrpages, uint64_t dma_mask)
3495 unsigned long iova_pfn;
3498 * Restrict dma_mask to the width that the iommu can handle.
3499 * First-level translation restricts the input-address to a
3500 * canonical address (i.e., address bits 63:N have the same
3501 * value as address bit [N-1], where N is 48-bits with 4-level
3502 * paging and 57-bits with 5-level paging). Hence, skip bit [N-1].
3505 if (domain_use_first_level(domain))
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3509 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3512 /* Ensure we reserve the whole size-aligned region */
3513 nrpages = __roundup_pow_of_two(nrpages);
3515 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3517 * First try to allocate an io virtual address in
3518 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range
3521 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3522 IOVA_PFN(DMA_BIT_MASK(32)), false);
3526 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3527 IOVA_PFN(dma_mask), true);
3528 if (unlikely(!iova_pfn)) {
3529 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3537 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3539 struct dmar_domain *domain, *tmp;
3540 struct dmar_rmrr_unit *rmrr;
3541 struct device *i_dev;
3544 /* The device shouldn't be attached to any domain yet. */
3545 domain = find_domain(dev);
3549 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3553 /* We have a new domain - setup possible RMRRs for the device */
3555 for_each_rmrr_units(rmrr) {
3556 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3561 ret = domain_prepare_identity_map(dev, domain,
3565 dev_err(dev, "Mapping reserved region failed\n");
3570 tmp = set_domain_for_dev(dev, domain);
3571 if (!tmp || domain != tmp) {
3572 domain_exit(domain);
3578 dev_err(dev, "Allocating domain failed\n");
3580 domain->domain.type = IOMMU_DOMAIN_DMA;
3585 /* Check if the dev needs to go through the non-identity map and unmap process. */
3586 static bool iommu_need_mapping(struct device *dev)
3590 if (iommu_dummy(dev))
3593 ret = identity_mapping(dev);
3595 u64 dma_mask = *dev->dma_mask;
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3604 * 32 bit DMA is removed from si_domain and falls back to
3605 * non-identity mapping.
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3613 domain = iommu_get_domain_for_dev(dev);
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
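/*
 * Allocate an IOVA covering @paddr/@size, map it with the read/write
 * permissions implied by @dir, and return the resulting DMA address (or
 * DMA_MAPPING_ERROR on failure).
 */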
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3639 BUG_ON(dir == DMA_NONE);
3641 domain = deferred_attach_domain(dev);
3643 return DMA_MAPPING_ERROR;
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3653 * Check if DMAR supports zero-length reads on write-only mappings
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3662 * paddr - (paddr + size) might be partial page, we should map the whole
3663 * page. Note: if two parts of one page are separately mapped, we
3664 * might have two guest_addr mappings to the same host paddr, but this
3665 * is not a big problem
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
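/*
 * dma_map_ops .map_page callback: use the IOMMU path only when the device
 * needs a non-identity mapping, otherwise fall back to dma-direct.
 */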
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
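/*
 * Unmap a DMA range: collect the freed page-table pages, flush the IOTLB
 * synchronously (strict mode, untrusted devices, or no flush queue) or via
 * the deferred IOVA queue, and release the IOVA.
 */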
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3718 domain = find_domain(dev);
3721 iommu = domain_get_iommu(domain);
3723 iova_pfn = IOVA_PFN(dev_addr);
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3744 * queue up the release of the unmap to save the 1/6th of the
3745 * cpu used up by the iotlb flush operation...
3749 trace_unmap_single(dev, dev_addr, size);
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
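/*
 * Coherent allocation: grab pages (from CMA when blocking is allowed),
 * zero them, and map them through __intel_map_single() against the
 * device's coherent DMA mask.
 */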
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3773 struct page *page = NULL;
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
3790 page = alloc_pages(flags, order);
3793 memset(page_address(page), 0, size);
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3810 struct page *page = virt_to_page(vaddr);
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
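/*
 * Map a scatterlist: size one IOVA allocation from the total number of
 * aligned pages, then install PTEs for every segment in a single
 * domain_sg_mapping() pass.
 */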
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3848 struct dmar_domain *domain;
3851 unsigned long iova_pfn;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3861 domain = deferred_attach_domain(dev);
3865 iommu = domain_get_iommu(domain);
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3873 sglist->dma_length = 0;
3878 * Check if DMAR supports zero-length reads on write-only mappings
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
3904 static u64 intel_get_required_mask(struct device *dev)
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
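/*
 * Map a buffer for an untrusted device. If the buffer is not VTD-page
 * aligned it is bounced through swiotlb first, so the device can only
 * reach whole pages that belong to it; the (possibly bounced) physical
 * range is then mapped as usual.
 */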
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3956 domain = deferred_attach_domain(dev);
3957 if (WARN_ON(dir == DMA_NONE || !domain))
3958 return DMA_MAPPING_ERROR;
3960 iommu = domain_get_iommu(domain);
3961 if (WARN_ON(!iommu))
3962 return DMA_MAPPING_ERROR;
3964 nrpages = aligned_nrpages(0, size);
3965 iova_pfn = intel_alloc_iova(dev, domain,
3966 dma_to_mm_pfn(nrpages), dma_mask);
3968 return DMA_MAPPING_ERROR;
3971 * Check if DMAR supports zero-length reads on write-only mappings
3974 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3975 !cap_zlr(iommu->cap))
3976 prot |= DMA_PTE_READ;
3977 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3978 prot |= DMA_PTE_WRITE;
3981 * If both the physical buffer start address and size are
3982 * page aligned, we don't need to use a bounce page.
3984 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3985 tlb_addr = swiotlb_tbl_map_single(dev,
3986 __phys_to_dma(dev, io_tlb_start),
3987 paddr, size, aligned_size, dir, attrs);
3988 if (tlb_addr == DMA_MAPPING_ERROR) {
3991 /* Cleanup the padding area. */
3992 void *padding_start = phys_to_virt(tlb_addr);
3993 size_t padding_size = aligned_size;
3995 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3996 (dir == DMA_TO_DEVICE ||
3997 dir == DMA_BIDIRECTIONAL)) {
3998 padding_start += size;
3999 padding_size -= size;
4002 memset(padding_start, 0, padding_size);
4008 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4009 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4013 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4015 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4018 if (is_swiotlb_buffer(tlb_addr))
4019 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4020 aligned_size, dir, attrs);
4022 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4023 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4024 size, (unsigned long long)paddr, dir);
4026 return DMA_MAPPING_ERROR;
4030 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4031 enum dma_data_direction dir, unsigned long attrs)
4033 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4034 struct dmar_domain *domain;
4035 phys_addr_t tlb_addr;
4037 domain = find_domain(dev);
4038 if (WARN_ON(!domain))
4041 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4042 if (WARN_ON(!tlb_addr))
4045 intel_unmap(dev, dev_addr, size);
4046 if (is_swiotlb_buffer(tlb_addr))
4047 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4048 aligned_size, dir, attrs);
4050 trace_bounce_unmap_single(dev, dev_addr, size);
4054 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4055 size_t size, enum dma_data_direction dir, unsigned long attrs)
4057 return bounce_map_single(dev, page_to_phys(page) + offset,
4058 size, dir, attrs, *dev->dma_mask);
4062 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4063 enum dma_data_direction dir, unsigned long attrs)
4065 return bounce_map_single(dev, phys_addr, size,
4066 dir, attrs, *dev->dma_mask);
4070 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4071 enum dma_data_direction dir, unsigned long attrs)
4073 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4077 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4078 enum dma_data_direction dir, unsigned long attrs)
4080 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4084 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4085 enum dma_data_direction dir, unsigned long attrs)
4087 struct scatterlist *sg;
4090 for_each_sg(sglist, sg, nelems, i)
4091 bounce_unmap_page(dev, sg->dma_address,
4092 sg_dma_len(sg), dir, attrs);
4096 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4097 enum dma_data_direction dir, unsigned long attrs)
4100 struct scatterlist *sg;
4102 for_each_sg(sglist, sg, nelems, i) {
4103 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4104 sg->offset, sg->length,
4106 if (sg->dma_address == DMA_MAPPING_ERROR)
4108 sg_dma_len(sg) = sg->length;
4111 for_each_sg(sglist, sg, nelems, i)
4112 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4117 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4122 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4123 size_t size, enum dma_data_direction dir)
4125 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4129 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4130 size_t size, enum dma_data_direction dir)
4132 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4136 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4137 int nelems, enum dma_data_direction dir)
4139 struct scatterlist *sg;
4142 for_each_sg(sglist, sg, nelems, i)
4143 bounce_sync_single(dev, sg_dma_address(sg),
4144 sg_dma_len(sg), dir, SYNC_FOR_CPU);
4148 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4149 int nelems, enum dma_data_direction dir)
4151 struct scatterlist *sg;
4154 for_each_sg(sglist, sg, nelems, i)
4155 bounce_sync_single(dev, sg_dma_address(sg),
4156 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4159 static const struct dma_map_ops bounce_dma_ops = {
4160 .alloc = intel_alloc_coherent,
4161 .free = intel_free_coherent,
4162 .map_sg = bounce_map_sg,
4163 .unmap_sg = bounce_unmap_sg,
4164 .map_page = bounce_map_page,
4165 .unmap_page = bounce_unmap_page,
4166 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4167 .sync_single_for_device = bounce_sync_single_for_device,
4168 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4169 .sync_sg_for_device = bounce_sync_sg_for_device,
4170 .map_resource = bounce_map_resource,
4171 .unmap_resource = bounce_unmap_resource,
4172 .dma_supported = dma_direct_supported,
4175 static inline int iommu_domain_cache_init(void)
4179 iommu_domain_cache = kmem_cache_create("iommu_domain",
4180 sizeof(struct dmar_domain),
4185 if (!iommu_domain_cache) {
4186 pr_err("Couldn't create iommu_domain cache\n");
4193 static inline int iommu_devinfo_cache_init(void)
4197 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4198 sizeof(struct device_domain_info),
4202 if (!iommu_devinfo_cache) {
4203 pr_err("Couldn't create devinfo cache\n");
4210 static int __init iommu_init_mempool(void)
4213 ret = iova_cache_get();
4217 ret = iommu_domain_cache_init();
4221 ret = iommu_devinfo_cache_init();
4225 kmem_cache_destroy(iommu_domain_cache);
4232 static void __init iommu_exit_mempool(void)
4234 kmem_cache_destroy(iommu_devinfo_cache);
4235 kmem_cache_destroy(iommu_domain_cache);
4239 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4241 struct dmar_drhd_unit *drhd;
4245 /* We know that this device on this chipset has its own IOMMU.
4246 * If we find it under a different IOMMU, then the BIOS is lying
4247 * to us. Hope that the IOMMU for this device is actually
4248 * disabled, and it needs no translation...
4250 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4252 /* "can't" happen */
4253 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4256 vtbar &= 0xffff0000;
4258 /* we know that this iommu should be at offset 0xa000 from vtbar */
4259 drhd = dmar_find_matched_drhd_unit(pdev);
4260 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4261 TAINT_FIRMWARE_WORKAROUND,
4262 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4263 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4265 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4267 static void __init init_no_remapping_devices(void)
4269 struct dmar_drhd_unit *drhd;
4273 for_each_drhd_unit(drhd) {
4274 if (!drhd->include_all) {
4275 for_each_active_dev_scope(drhd->devices,
4276 drhd->devices_cnt, i, dev)
4278 /* ignore DMAR unit if no devices exist */
4279 if (i == drhd->devices_cnt)
4284 for_each_active_drhd_unit(drhd) {
4285 if (drhd->include_all)
4288 for_each_active_dev_scope(drhd->devices,
4289 drhd->devices_cnt, i, dev)
4290 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4292 if (i < drhd->devices_cnt)
4295 /* This IOMMU has *only* gfx devices. Either bypass it or
4296 set the gfx_mapped flag, as appropriate */
4297 if (!dmar_map_gfx) {
4299 for_each_active_dev_scope(drhd->devices,
4300 drhd->devices_cnt, i, dev)
4301 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4306 #ifdef CONFIG_SUSPEND
4307 static int init_iommu_hw(void)
4309 struct dmar_drhd_unit *drhd;
4310 struct intel_iommu *iommu = NULL;
4312 for_each_active_iommu(iommu, drhd)
4314 dmar_reenable_qi(iommu);
4316 for_each_iommu(iommu, drhd) {
4317 if (drhd->ignored) {
4319 * we always have to disable PMRs or DMA may fail on this device
4323 iommu_disable_protect_mem_regions(iommu);
4327 iommu_flush_write_buffer(iommu);
4329 iommu_set_root_entry(iommu);
4331 iommu->flush.flush_context(iommu, 0, 0, 0,
4332 DMA_CCMD_GLOBAL_INVL);
4333 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4334 iommu_enable_translation(iommu);
4335 iommu_disable_protect_mem_regions(iommu);
4341 static void iommu_flush_all(void)
4343 struct dmar_drhd_unit *drhd;
4344 struct intel_iommu *iommu;
4346 for_each_active_iommu(iommu, drhd) {
4347 iommu->flush.flush_context(iommu, 0, 0, 0,
4348 DMA_CCMD_GLOBAL_INVL);
4349 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4350 DMA_TLB_GLOBAL_FLUSH);
4354 static int iommu_suspend(void)
4356 struct dmar_drhd_unit *drhd;
4357 struct intel_iommu *iommu = NULL;
4360 for_each_active_iommu(iommu, drhd) {
4361 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4363 if (!iommu->iommu_state)
4369 for_each_active_iommu(iommu, drhd) {
4370 iommu_disable_translation(iommu);
4372 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4374 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4375 readl(iommu->reg + DMAR_FECTL_REG);
4376 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4377 readl(iommu->reg + DMAR_FEDATA_REG);
4378 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4379 readl(iommu->reg + DMAR_FEADDR_REG);
4380 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4381 readl(iommu->reg + DMAR_FEUADDR_REG);
4383 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4388 for_each_active_iommu(iommu, drhd)
4389 kfree(iommu->iommu_state);
4394 static void iommu_resume(void)
4396 struct dmar_drhd_unit *drhd;
4397 struct intel_iommu *iommu = NULL;
4400 if (init_iommu_hw()) {
4402 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4404 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4408 for_each_active_iommu(iommu, drhd) {
4410 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4412 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4413 iommu->reg + DMAR_FECTL_REG);
4414 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4415 iommu->reg + DMAR_FEDATA_REG);
4416 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4417 iommu->reg + DMAR_FEADDR_REG);
4418 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4419 iommu->reg + DMAR_FEUADDR_REG);
4421 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4424 for_each_active_iommu(iommu, drhd)
4425 kfree(iommu->iommu_state);
4428 static struct syscore_ops iommu_syscore_ops = {
4429 .resume = iommu_resume,
4430 .suspend = iommu_suspend,
4433 static void __init init_iommu_pm_ops(void)
4435 register_syscore_ops(&iommu_syscore_ops);
4439 static inline void init_iommu_pm_ops(void) {}
4440 #endif /* CONFIG_PM */
4442 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4444 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4445 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4446 rmrr->end_address <= rmrr->base_address ||
4447 arch_rmrr_sanity_check(rmrr))
4453 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4455 struct acpi_dmar_reserved_memory *rmrr;
4456 struct dmar_rmrr_unit *rmrru;
4458 rmrr = (struct acpi_dmar_reserved_memory *)header;
4459 if (rmrr_sanity_check(rmrr))
4460 WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
4461 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4462 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4463 rmrr->base_address, rmrr->end_address,
4464 dmi_get_system_info(DMI_BIOS_VENDOR),
4465 dmi_get_system_info(DMI_BIOS_VERSION),
4466 dmi_get_system_info(DMI_PRODUCT_VERSION));
4468 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4472 rmrru->hdr = header;
4474 rmrru->base_address = rmrr->base_address;
4475 rmrru->end_address = rmrr->end_address;
4477 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4478 ((void *)rmrr) + rmrr->header.length,
4479 &rmrru->devices_cnt);
4480 if (rmrru->devices_cnt && rmrru->devices == NULL)
4483 list_add(&rmrru->list, &dmar_rmrr_units);
4492 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4494 struct dmar_atsr_unit *atsru;
4495 struct acpi_dmar_atsr *tmp;
4497 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4498 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4499 if (atsr->segment != tmp->segment)
4501 if (atsr->header.length != tmp->header.length)
4503 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4510 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4512 struct acpi_dmar_atsr *atsr;
4513 struct dmar_atsr_unit *atsru;
4515 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4518 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4519 atsru = dmar_find_atsr(atsr);
4523 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4528 * If memory is allocated from slab by ACPI _DSM method, we need to
4529 * copy the memory content because the memory buffer will be freed on exit.
4532 atsru->hdr = (void *)(atsru + 1);
4533 memcpy(atsru->hdr, hdr, hdr->length);
4534 atsru->include_all = atsr->flags & 0x1;
4535 if (!atsru->include_all) {
4536 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4537 (void *)atsr + atsr->header.length,
4538 &atsru->devices_cnt);
4539 if (atsru->devices_cnt && atsru->devices == NULL) {
4545 list_add_rcu(&atsru->list, &dmar_atsr_units);
4550 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4552 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4556 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4558 struct acpi_dmar_atsr *atsr;
4559 struct dmar_atsr_unit *atsru;
4561 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4562 atsru = dmar_find_atsr(atsr);
4564 list_del_rcu(&atsru->list);
4566 intel_iommu_free_atsr(atsru);
4572 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4576 struct acpi_dmar_atsr *atsr;
4577 struct dmar_atsr_unit *atsru;
4579 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4580 atsru = dmar_find_atsr(atsr);
4584 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4585 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4593 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4596 struct intel_iommu *iommu = dmaru->iommu;
4598 if (g_iommus[iommu->seq_id])
4601 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4602 pr_warn("%s: Doesn't support hardware pass through.\n",
4606 if (!ecap_sc_support(iommu->ecap) &&
4607 domain_update_iommu_snooping(iommu)) {
4608 pr_warn("%s: Doesn't support snooping.\n",
4612 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4613 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4614 pr_warn("%s: Doesn't support large page.\n",
4620 * Disable translation if already enabled prior to OS handover.
4622 if (iommu->gcmd & DMA_GCMD_TE)
4623 iommu_disable_translation(iommu);
4625 g_iommus[iommu->seq_id] = iommu;
4626 ret = iommu_init_domains(iommu);
4628 ret = iommu_alloc_root_entry(iommu);
4632 intel_svm_check(iommu);
4634 if (dmaru->ignored) {
4636 * we always have to disable PMRs or DMA may fail on this device
4639 iommu_disable_protect_mem_regions(iommu);
4643 intel_iommu_init_qi(iommu);
4644 iommu_flush_write_buffer(iommu);
4646 #ifdef CONFIG_INTEL_IOMMU_SVM
4647 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4648 ret = intel_svm_enable_prq(iommu);
4653 ret = dmar_set_interrupt(iommu);
4657 iommu_set_root_entry(iommu);
4658 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4659 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4660 iommu_enable_translation(iommu);
4662 iommu_disable_protect_mem_regions(iommu);
4666 disable_dmar_iommu(iommu);
4668 free_dmar_iommu(iommu);
4672 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4675 struct intel_iommu *iommu = dmaru->iommu;
4677 if (!intel_iommu_enabled)
4683 ret = intel_iommu_add(dmaru);
4685 disable_dmar_iommu(iommu);
4686 free_dmar_iommu(iommu);
4692 static void intel_iommu_free_dmars(void)
4694 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4695 struct dmar_atsr_unit *atsru, *atsr_n;
4697 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4698 list_del(&rmrru->list);
4699 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4703 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4704 list_del(&atsru->list);
4705 intel_iommu_free_atsr(atsru);
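/*
 * Decide whether ATS may be used for @dev by walking up to its root port
 * and checking that port against the ATSR units of the PCI segment.
 */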
4709 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4712 struct pci_bus *bus;
4713 struct pci_dev *bridge = NULL;
4715 struct acpi_dmar_atsr *atsr;
4716 struct dmar_atsr_unit *atsru;
4718 dev = pci_physfn(dev);
4719 for (bus = dev->bus; bus; bus = bus->parent) {
4721 /* If it's an integrated device, allow ATS */
4724 /* Connected via non-PCIe: no ATS */
4725 if (!pci_is_pcie(bridge) ||
4726 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4728 /* If we found the root port, look it up in the ATSR */
4729 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4734 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4735 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4736 if (atsr->segment != pci_domain_nr(dev->bus))
4739 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4740 if (tmp == &bridge->dev)
4743 if (atsru->include_all)
4753 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4756 struct dmar_rmrr_unit *rmrru;
4757 struct dmar_atsr_unit *atsru;
4758 struct acpi_dmar_atsr *atsr;
4759 struct acpi_dmar_reserved_memory *rmrr;
4761 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4764 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4765 rmrr = container_of(rmrru->hdr,
4766 struct acpi_dmar_reserved_memory, header);
4767 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4768 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4769 ((void *)rmrr) + rmrr->header.length,
4770 rmrr->segment, rmrru->devices,
4771 rmrru->devices_cnt);
4774 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4775 dmar_remove_dev_scope(info, rmrr->segment,
4776 rmrru->devices, rmrru->devices_cnt);
4780 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4781 if (atsru->include_all)
4784 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4785 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4786 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4787 (void *)atsr + atsr->header.length,
4788 atsr->segment, atsru->devices,
4789 atsru->devices_cnt);
4794 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4795 if (dmar_remove_dev_scope(info, atsr->segment,
4796 atsru->devices, atsru->devices_cnt))
4804 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4805 unsigned long val, void *v)
4807 struct memory_notify *mhp = v;
4808 unsigned long long start, end;
4809 unsigned long start_vpfn, last_vpfn;
4812 case MEM_GOING_ONLINE:
4813 start = mhp->start_pfn << PAGE_SHIFT;
4814 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4815 if (iommu_domain_identity_map(si_domain, start, end)) {
4816 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4823 case MEM_CANCEL_ONLINE:
4824 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4825 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4826 while (start_vpfn <= last_vpfn) {
4828 struct dmar_drhd_unit *drhd;
4829 struct intel_iommu *iommu;
4830 struct page *freelist;
4832 iova = find_iova(&si_domain->iovad, start_vpfn);
4834 pr_debug("Failed to get IOVA for PFN %lx\n",
4839 iova = split_and_remove_iova(&si_domain->iovad, iova,
4840 start_vpfn, last_vpfn);
4842 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4843 start_vpfn, last_vpfn);
4847 freelist = domain_unmap(si_domain, iova->pfn_lo,
4851 for_each_active_iommu(iommu, drhd)
4852 iommu_flush_iotlb_psi(iommu, si_domain,
4853 iova->pfn_lo, iova_size(iova),
4856 dma_free_pagelist(freelist);
4858 start_vpfn = iova->pfn_hi + 1;
4859 free_iova_mem(iova);
4867 static struct notifier_block intel_iommu_memory_nb = {
4868 .notifier_call = intel_iommu_memory_notifier,
4872 static void free_all_cpu_cached_iovas(unsigned int cpu)
4876 for (i = 0; i < g_num_of_iommus; i++) {
4877 struct intel_iommu *iommu = g_iommus[i];
4878 struct dmar_domain *domain;
4884 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4885 domain = get_iommu_domain(iommu, (u16)did);
4889 free_cpu_cached_iovas(cpu, &domain->iovad);
4894 static int intel_iommu_cpu_dead(unsigned int cpu)
4896 free_all_cpu_cached_iovas(cpu);
4900 static void intel_disable_iommus(void)
4902 struct intel_iommu *iommu = NULL;
4903 struct dmar_drhd_unit *drhd;
4905 for_each_iommu(iommu, drhd)
4906 iommu_disable_translation(iommu);
4909 void intel_iommu_shutdown(void)
4911 struct dmar_drhd_unit *drhd;
4912 struct intel_iommu *iommu = NULL;
4914 if (no_iommu || dmar_disabled)
4917 down_write(&dmar_global_lock);
4919 /* Disable PMRs explicitly here. */
4920 for_each_iommu(iommu, drhd)
4921 iommu_disable_protect_mem_regions(iommu);
4923 /* Make sure the IOMMUs are switched off */
4924 intel_disable_iommus();
4926 up_write(&dmar_global_lock);
4929 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4931 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4933 return container_of(iommu_dev, struct intel_iommu, iommu);
4936 static ssize_t intel_iommu_show_version(struct device *dev,
4937 struct device_attribute *attr,
4940 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4941 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4942 return sprintf(buf, "%d:%d\n",
4943 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4945 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4947 static ssize_t intel_iommu_show_address(struct device *dev,
4948 struct device_attribute *attr,
4951 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4952 return sprintf(buf, "%llx\n", iommu->reg_phys);
4954 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4956 static ssize_t intel_iommu_show_cap(struct device *dev,
4957 struct device_attribute *attr,
4960 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4961 return sprintf(buf, "%llx\n", iommu->cap);
4963 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4965 static ssize_t intel_iommu_show_ecap(struct device *dev,
4966 struct device_attribute *attr,
4969 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4970 return sprintf(buf, "%llx\n", iommu->ecap);
4972 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4974 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4975 struct device_attribute *attr,
4978 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4979 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4981 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4983 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4984 struct device_attribute *attr,
4987 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4988 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4989 cap_ndoms(iommu->cap)));
4991 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
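/*
 * The attributes above are exported per IOMMU unit via iommu_device_sysfs_add()
 * in intel_iommu_init(), so (assuming the default sysfs layout) they appear as
 * read-only files such as /sys/class/iommu/dmar0/intel-iommu/version, .../cap,
 * .../ecap, .../domains_supported and .../domains_used.
 */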
4993 static struct attribute *intel_iommu_attrs[] = {
4994 &dev_attr_version.attr,
4995 &dev_attr_address.attr,
4997 &dev_attr_ecap.attr,
4998 &dev_attr_domains_supported.attr,
4999 &dev_attr_domains_used.attr,
5003 static struct attribute_group intel_iommu_group = {
5004 .name = "intel-iommu",
5005 .attrs = intel_iommu_attrs,
5008 const struct attribute_group *intel_iommu_groups[] = {
5013 static inline bool has_untrusted_dev(void)
5015 struct pci_dev *pdev = NULL;
5017 for_each_pci_dev(pdev)
5018 if (pdev->untrusted)
5024 static int __init platform_optin_force_iommu(void)
5026 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5029 if (no_iommu || dmar_disabled)
5030 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5033 * If Intel-IOMMU is disabled by default, we will apply identity
5034 * map for all devices except those marked as being untrusted.
5037 iommu_set_default_passthrough(false);
5045 static int __init probe_acpi_namespace_devices(void)
5047 struct dmar_drhd_unit *drhd;
5048 /* To avoid a -Wunused-but-set-variable warning. */
5049 struct intel_iommu *iommu __maybe_unused;
5053 for_each_active_iommu(iommu, drhd) {
5054 for_each_active_dev_scope(drhd->devices,
5055 drhd->devices_cnt, i, dev) {
5056 struct acpi_device_physical_node *pn;
5057 struct iommu_group *group;
5058 struct acpi_device *adev;
5060 if (dev->bus != &acpi_bus_type)
5063 adev = to_acpi_device(dev);
5064 mutex_lock(&adev->physical_node_lock);
5065 list_for_each_entry(pn,
5066 &adev->physical_node_list, node) {
5067 group = iommu_group_get(pn->dev);
5069 iommu_group_put(group);
5073 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5074 ret = iommu_probe_device(pn->dev);
5078 mutex_unlock(&adev->physical_node_lock);
5088 int __init intel_iommu_init(void)
5091 struct dmar_drhd_unit *drhd;
5092 struct intel_iommu *iommu;
5095 * Intel IOMMU is required for a TXT/tboot launch or platform
5096 * opt in, so enforce that.
5098 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5100 if (iommu_init_mempool()) {
5102 panic("tboot: Failed to initialize iommu memory\n");
5106 down_write(&dmar_global_lock);
5107 if (dmar_table_init()) {
5109 panic("tboot: Failed to initialize DMAR table\n");
5113 if (dmar_dev_scope_init() < 0) {
5115 panic("tboot: Failed to initialize DMAR device scope\n");
5119 up_write(&dmar_global_lock);
5122 * The bus notifier takes the dmar_global_lock, so lockdep will
5123 * complain later when we register it under the lock.
5125 dmar_register_bus_notifier();
5127 down_write(&dmar_global_lock);
5129 if (no_iommu || dmar_disabled) {
5131 * We exit the function here to ensure the IOMMU's remapping and
5132 * mempool aren't set up, which means that the IOMMU's PMRs
5133 * won't be disabled via the call to init_dmars(). So disable
5134 * them explicitly here. The PMRs were set up by tboot prior to
5135 * calling SENTER, but the kernel is expected to reset/tear them down.
5138 if (intel_iommu_tboot_noforce) {
5139 for_each_iommu(iommu, drhd)
5140 iommu_disable_protect_mem_regions(iommu);
5144 * Make sure the IOMMUs are switched off, even when we
5145 * boot into a kexec kernel and the previous kernel left them enabled.
5148 intel_disable_iommus();
5152 if (list_empty(&dmar_rmrr_units))
5153 pr_info("No RMRR found\n");
5155 if (list_empty(&dmar_atsr_units))
5156 pr_info("No ATSR found\n");
5158 if (dmar_init_reserved_ranges()) {
5160 panic("tboot: Failed to reserve iommu ranges\n");
5161 goto out_free_reserved_range;
5165 intel_iommu_gfx_mapped = 1;
5167 init_no_remapping_devices();
5172 panic("tboot: Failed to initialize DMARs\n");
5173 pr_err("Initialization failed\n");
5174 goto out_free_reserved_range;
5176 up_write(&dmar_global_lock);
5178 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5180 * If the system has no untrusted device or the user has decided
5181 * to disable the bounce page mechanisms, we don't need swiotlb.
5182 * Mark this, and the pre-allocated bounce pages will be released later.
5185 if (!has_untrusted_dev() || intel_no_bounce)
5188 dma_ops = &intel_dma_ops;
5190 init_iommu_pm_ops();
5192 for_each_active_iommu(iommu, drhd) {
5193 iommu_device_sysfs_add(&iommu->iommu, NULL,
5196 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5197 iommu_device_register(&iommu->iommu);
5200 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5201 if (si_domain && !hw_pass_through)
5202 register_memory_notifier(&intel_iommu_memory_nb);
5203 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5204 intel_iommu_cpu_dead);
5206 down_read(&dmar_global_lock);
5207 if (probe_acpi_namespace_devices())
5208 pr_warn("ACPI namespace devices didn't probe correctly\n");
5209 up_read(&dmar_global_lock);
5211 /* Finally, we enable the DMA remapping hardware. */
5212 for_each_iommu(iommu, drhd) {
5213 if (!drhd->ignored && !translation_pre_enabled(iommu))
5214 iommu_enable_translation(iommu);
5216 iommu_disable_protect_mem_regions(iommu);
5218 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5220 intel_iommu_enabled = 1;
5221 intel_iommu_debugfs_init();
5225 out_free_reserved_range:
5226 put_iova_domain(&reserved_iova_list);
5228 intel_iommu_free_dmars();
5229 up_write(&dmar_global_lock);
5230 iommu_exit_mempool();
5234 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5236 struct intel_iommu *iommu = opaque;
5238 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5243 * NB - intel-iommu lacks any sort of reference counting for the users of
5244 * dependent devices. If multiple endpoints have intersecting dependent
5245 * devices, unbinding the driver from any one of them will possibly leave
5246 * the others unable to operate.
5248 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5250 if (!iommu || !dev || !dev_is_pci(dev))
5253 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5256 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5258 struct dmar_domain *domain;
5259 struct intel_iommu *iommu;
5260 unsigned long flags;
5262 assert_spin_locked(&device_domain_lock);
5267 iommu = info->iommu;
5268 domain = info->domain;
5271 if (dev_is_pci(info->dev) && sm_supported(iommu))
5272 intel_pasid_tear_down_entry(iommu, info->dev,
5275 iommu_disable_dev_iotlb(info);
5276 domain_context_clear(iommu, info->dev);
5277 intel_pasid_free_table(info->dev);
5280 unlink_domain_info(info);
5282 spin_lock_irqsave(&iommu->lock, flags);
5283 domain_detach_iommu(domain, iommu);
5284 spin_unlock_irqrestore(&iommu->lock, flags);
5286 /* free the private domain */
5287 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5288 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5289 list_empty(&domain->devices))
5290 domain_exit(info->domain);
5292 free_devinfo_mem(info);
5295 static void dmar_remove_one_dev_info(struct device *dev)
5297 struct device_domain_info *info;
5298 unsigned long flags;
5300 spin_lock_irqsave(&device_domain_lock, flags);
5301 info = dev->archdata.iommu;
5302 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5303 && info != DUMMY_DEVICE_DOMAIN_INFO)
5304 __dmar_remove_one_dev_info(info);
5305 spin_unlock_irqrestore(&device_domain_lock, flags);
5308 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5312 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5313 domain_reserve_special_ranges(domain);
5315 /* calculate AGAW */
5316 domain->gaw = guest_width;
5317 adjust_width = guestwidth_to_adjustwidth(guest_width);
5318 domain->agaw = width_to_agaw(adjust_width);
5320 domain->iommu_coherency = 0;
5321 domain->iommu_snooping = 0;
5322 domain->iommu_superpage = 0;
5323 domain->max_addr = 0;
5325 /* always allocate the top pgd */
5326 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5329 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5333 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5335 struct dmar_domain *dmar_domain;
5336 struct iommu_domain *domain;
5340 case IOMMU_DOMAIN_DMA:
5342 case IOMMU_DOMAIN_UNMANAGED:
5343 dmar_domain = alloc_domain(0);
5345 pr_err("Can't allocate dmar_domain\n");
5348 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5349 pr_err("Domain initialization failed\n");
5350 domain_exit(dmar_domain);
5354 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5355 ret = init_iova_flush_queue(&dmar_domain->iovad,
5359 pr_info("iova flush queue initialization failed\n");
5362 domain_update_iommu_cap(dmar_domain);
5364 domain = &dmar_domain->domain;
5365 domain->geometry.aperture_start = 0;
5366 domain->geometry.aperture_end =
5367 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5368 domain->geometry.force_aperture = true;
5371 case IOMMU_DOMAIN_IDENTITY:
5372 return &si_domain->domain;
5380 static void intel_iommu_domain_free(struct iommu_domain *domain)
5382 if (domain != &si_domain->domain)
5383 domain_exit(to_dmar_domain(domain));
5387 * Check whether a @domain could be attached to the @dev through the
5388 * aux-domain attach/detach APIs.
5391 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5393 struct device_domain_info *info = dev->archdata.iommu;
5395 return info && info->auxd_enabled &&
5396 domain->type == IOMMU_DOMAIN_UNMANAGED;
5399 static void auxiliary_link_device(struct dmar_domain *domain,
5402 struct device_domain_info *info = dev->archdata.iommu;
5404 assert_spin_locked(&device_domain_lock);
5408 domain->auxd_refcnt++;
5409 list_add(&domain->auxd, &info->auxiliary_domains);
5412 static void auxiliary_unlink_device(struct dmar_domain *domain,
5415 struct device_domain_info *info = dev->archdata.iommu;
5417 assert_spin_locked(&device_domain_lock);
5421 list_del(&domain->auxd);
5422 domain->auxd_refcnt--;
5424 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5425 ioasid_free(domain->default_pasid);
5428 static int aux_domain_add_dev(struct dmar_domain *domain,
5433 unsigned long flags;
5434 struct intel_iommu *iommu;
5436 iommu = device_to_iommu(dev, &bus, &devfn);
5440 if (domain->default_pasid <= 0) {
5443 /* No private data needed for the default pasid */
5444 pasid = ioasid_alloc(NULL, PASID_MIN,
5445 pci_max_pasids(to_pci_dev(dev)) - 1,
5447 if (pasid == INVALID_IOASID) {
5448 pr_err("Can't allocate default pasid\n");
5451 domain->default_pasid = pasid;
5454 spin_lock_irqsave(&device_domain_lock, flags);
5456 * iommu->lock must be held to attach domain to iommu and setup the
5457 * pasid entry for second level translation.
5459 spin_lock(&iommu->lock);
5460 ret = domain_attach_iommu(domain, iommu);
5464 /* Set up the PASID entry for mediated devices: */
5465 if (domain_use_first_level(domain))
5466 ret = domain_setup_first_level(iommu, domain, dev,
5467 domain->default_pasid);
5469 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5470 domain->default_pasid);
5473 spin_unlock(&iommu->lock);
5475 auxiliary_link_device(domain, dev);
5477 spin_unlock_irqrestore(&device_domain_lock, flags);
5482 domain_detach_iommu(domain, iommu);
5484 spin_unlock(&iommu->lock);
5485 spin_unlock_irqrestore(&device_domain_lock, flags);
5486 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5487 ioasid_free(domain->default_pasid);
5492 static void aux_domain_remove_dev(struct dmar_domain *domain,
5495 struct device_domain_info *info;
5496 struct intel_iommu *iommu;
5497 unsigned long flags;
5499 if (!is_aux_domain(dev, &domain->domain))
5502 spin_lock_irqsave(&device_domain_lock, flags);
5503 info = dev->archdata.iommu;
5504 iommu = info->iommu;
5506 auxiliary_unlink_device(domain, dev);
5508 spin_lock(&iommu->lock);
5509 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5510 domain_detach_iommu(domain, iommu);
5511 spin_unlock(&iommu->lock);
5513 spin_unlock_irqrestore(&device_domain_lock, flags);
5516 static int prepare_domain_attach_device(struct iommu_domain *domain,
5519 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5520 struct intel_iommu *iommu;
5524 iommu = device_to_iommu(dev, &bus, &devfn);
5528 /* check if this iommu agaw is sufficient for max mapped address */
5529 addr_width = agaw_to_width(iommu->agaw);
5530 if (addr_width > cap_mgaw(iommu->cap))
5531 addr_width = cap_mgaw(iommu->cap);
5533 if (dmar_domain->max_addr > (1LL << addr_width)) {
5534 dev_err(dev, "%s: iommu width (%d) is not "
5535 "sufficient for the mapped address (%llx)\n",
5536 __func__, addr_width, dmar_domain->max_addr);
5539 dmar_domain->gaw = addr_width;
5542 * Knock out extra levels of page tables if necessary
5544 while (iommu->agaw < dmar_domain->agaw) {
5545 struct dma_pte *pte;
5547 pte = dmar_domain->pgd;
5548 if (dma_pte_present(pte)) {
5549 dmar_domain->pgd = (struct dma_pte *)
5550 phys_to_virt(dma_pte_addr(pte));
5551 free_pgtable_page(pte);
5553 dmar_domain->agaw--;
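/*
 * A standalone sketch (not driver code) of the agaw/width arithmetic behind
 * the level-stripping loop above, assuming the usual VT-d encoding of nine
 * IOVA bits per page-table level on top of a 4KiB page:
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int agaw;
 *
 *		for (agaw = 1; agaw <= 3; agaw++) {
 *			int levels = agaw + 2;      // agaw 2 -> 4-level tables
 *			int width  = 30 + agaw * 9; // agaw 2 -> 48-bit addresses
 *
 *			printf("agaw %d: %d levels, %d-bit\n", agaw, levels, width);
 *		}
 *		return 0;
 *	}
 *
 * Each dmar_domain->agaw-- above therefore trades one page-table level for
 * nine bits of addressable IOVA space.
 */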
5559 static int intel_iommu_attach_device(struct iommu_domain *domain,
5564 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5565 device_is_rmrr_locked(dev)) {
5566 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5570 if (is_aux_domain(dev, domain))
5573 /* normally dev is not mapped */
5574 if (unlikely(domain_context_mapped(dev))) {
5575 struct dmar_domain *old_domain;
5577 old_domain = find_domain(dev);
5579 dmar_remove_one_dev_info(dev);
5582 ret = prepare_domain_attach_device(domain, dev);
5586 return domain_add_dev_info(to_dmar_domain(domain), dev);
5589 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5594 if (!is_aux_domain(dev, domain))
5597 ret = prepare_domain_attach_device(domain, dev);
5601 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5604 static void intel_iommu_detach_device(struct iommu_domain *domain,
5607 dmar_remove_one_dev_info(dev);
5610 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5613 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5616 static int intel_iommu_map(struct iommu_domain *domain,
5617 unsigned long iova, phys_addr_t hpa,
5618 size_t size, int iommu_prot, gfp_t gfp)
5620 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5625 if (iommu_prot & IOMMU_READ)
5626 prot |= DMA_PTE_READ;
5627 if (iommu_prot & IOMMU_WRITE)
5628 prot |= DMA_PTE_WRITE;
5629 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5630 prot |= DMA_PTE_SNP;
5632 max_addr = iova + size;
5633 if (dmar_domain->max_addr < max_addr) {
5636 /* check if minimum agaw is sufficient for mapped address */
5637 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5638 if (end < max_addr) {
5639 pr_err("%s: iommu width (%d) is not "
5640 "sufficient for the mapped address (%llx)\n",
5641 __func__, dmar_domain->gaw, max_addr);
5644 dmar_domain->max_addr = max_addr;
5646 /* Round up size to next multiple of PAGE_SIZE, if it and
5647 the low bits of hpa would take us onto the next page */
5648 size = aligned_nrpages(hpa, size);
5649 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5650 hpa >> VTD_PAGE_SHIFT, size, prot);
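/*
 * A worked example of the round-up above (standalone sketch, assuming 4KiB
 * pages): a mapping whose size fits within one page can still need two
 * page-table entries if the low bits of hpa make it straddle a page boundary.
 *
 *	#include <stdio.h>
 *
 *	#define PAGE_SZ 4096UL
 *
 *	static unsigned long nrpages(unsigned long hpa, unsigned long size)
 *	{
 *		unsigned long off = hpa & (PAGE_SZ - 1);	// low bits of hpa
 *
 *		return (off + size + PAGE_SZ - 1) / PAGE_SZ;
 *	}
 *
 *	int main(void)
 *	{
 *		printf("%lu\n", nrpages(0x12ffe0, 0x40));	// prints 2
 *		return 0;
 *	}
 */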
5654 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5655 unsigned long iova, size_t size,
5656 struct iommu_iotlb_gather *gather)
5658 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5659 struct page *freelist = NULL;
5660 unsigned long start_pfn, last_pfn;
5661 unsigned int npages;
5662 int iommu_id, level = 0;
5664 /* Cope with horrid API which requires us to unmap more than the
5665 size argument if it happens to be a large-page mapping. */
5666 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5668 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5669 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5671 start_pfn = iova >> VTD_PAGE_SHIFT;
5672 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5674 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5676 npages = last_pfn - start_pfn + 1;
5678 for_each_domain_iommu(iommu_id, dmar_domain)
5679 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5680 start_pfn, npages, !freelist, 0);
5682 dma_free_pagelist(freelist);
5684 if (dmar_domain->max_addr == iova + size)
5685 dmar_domain->max_addr = iova;
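/*
 * Illustration of the size round-up at the top of intel_iommu_unmap()
 * (standalone sketch): if the PTE backing the IOVA sits at level 2 (a 2MiB
 * superpage) or level 3 (1GiB), the whole superpage is unmapped even when the
 * caller asked for less, assuming nine IOVA bits per level over 4KiB pages.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		int level;
 *
 *		for (level = 1; level <= 3; level++) {
 *			unsigned long bytes = 4096UL << ((level - 1) * 9);
 *
 *			printf("level %d unmaps %lu bytes\n", level, bytes);
 *		}
 *		return 0;	// 4096, 2097152, 1073741824 bytes
 *	}
 */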
5690 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5693 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5694 struct dma_pte *pte;
5698 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5700 phys = dma_pte_addr(pte);
5705 static inline bool scalable_mode_support(void)
5707 struct dmar_drhd_unit *drhd;
5708 struct intel_iommu *iommu;
5712 for_each_active_iommu(iommu, drhd) {
5713 if (!sm_supported(iommu)) {
5723 static inline bool iommu_pasid_support(void)
5725 struct dmar_drhd_unit *drhd;
5726 struct intel_iommu *iommu;
5730 for_each_active_iommu(iommu, drhd) {
5731 if (!pasid_supported(iommu)) {
5741 static inline bool nested_mode_support(void)
5743 struct dmar_drhd_unit *drhd;
5744 struct intel_iommu *iommu;
5748 for_each_active_iommu(iommu, drhd) {
5749 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5759 static bool intel_iommu_capable(enum iommu_cap cap)
5761 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5762 return domain_update_iommu_snooping(NULL) == 1;
5763 if (cap == IOMMU_CAP_INTR_REMAP)
5764 return irq_remapping_enabled == 1;
5769 static int intel_iommu_add_device(struct device *dev)
5771 struct dmar_domain *dmar_domain;
5772 struct iommu_domain *domain;
5773 struct intel_iommu *iommu;
5774 struct iommu_group *group;
5778 iommu = device_to_iommu(dev, &bus, &devfn);
5782 iommu_device_link(&iommu->iommu, dev);
5784 if (translation_pre_enabled(iommu))
5785 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5787 group = iommu_group_get_for_dev(dev);
5789 if (IS_ERR(group)) {
5790 ret = PTR_ERR(group);
5794 iommu_group_put(group);
5796 domain = iommu_get_domain_for_dev(dev);
5797 dmar_domain = to_dmar_domain(domain);
5798 if (domain->type == IOMMU_DOMAIN_DMA) {
5799 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5800 ret = iommu_request_dm_for_dev(dev);
5802 dmar_remove_one_dev_info(dev);
5803 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5804 domain_add_dev_info(si_domain, dev);
5806 "Device uses a private identity domain.\n");
5810 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5811 ret = iommu_request_dma_domain_for_dev(dev);
5813 dmar_remove_one_dev_info(dev);
5814 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5815 if (!get_private_domain_for_dev(dev)) {
5817 "Failed to get a private domain.\n");
5823 "Device uses a private dma domain.\n");
5828 if (device_needs_bounce(dev)) {
5829 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5830 set_dma_ops(dev, &bounce_dma_ops);
5836 iommu_device_unlink(&iommu->iommu, dev);
5840 static void intel_iommu_remove_device(struct device *dev)
5842 struct intel_iommu *iommu;
5845 iommu = device_to_iommu(dev, &bus, &devfn);
5849 dmar_remove_one_dev_info(dev);
5851 iommu_group_remove_device(dev);
5853 iommu_device_unlink(&iommu->iommu, dev);
5855 if (device_needs_bounce(dev))
5856 set_dma_ops(dev, NULL);
5859 static void intel_iommu_get_resv_regions(struct device *device,
5860 struct list_head *head)
5862 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5863 struct iommu_resv_region *reg;
5864 struct dmar_rmrr_unit *rmrr;
5865 struct device *i_dev;
5868 down_read(&dmar_global_lock);
5869 for_each_rmrr_units(rmrr) {
5870 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5872 struct iommu_resv_region *resv;
5873 enum iommu_resv_type type;
5876 if (i_dev != device &&
5877 !is_downstream_to_pci_bridge(device, i_dev))
5880 length = rmrr->end_address - rmrr->base_address + 1;
5882 type = device_rmrr_is_relaxable(device) ?
5883 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5885 resv = iommu_alloc_resv_region(rmrr->base_address,
5886 length, prot, type);
5890 list_add_tail(&resv->list, head);
5893 up_read(&dmar_global_lock);
5895 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5896 if (dev_is_pci(device)) {
5897 struct pci_dev *pdev = to_pci_dev(device);
5899 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5900 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5901 IOMMU_RESV_DIRECT_RELAXABLE);
5903 list_add_tail(&reg->list, head);
5906 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5908 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5909 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5913 list_add_tail(&reg->list, head);
5916 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5918 struct device_domain_info *info;
5919 struct context_entry *context;
5920 struct dmar_domain *domain;
5921 unsigned long flags;
5925 domain = find_domain(dev);
5929 spin_lock_irqsave(&device_domain_lock, flags);
5930 spin_lock(&iommu->lock);
5933 info = dev->archdata.iommu;
5934 if (!info || !info->pasid_supported)
5937 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5938 if (WARN_ON(!context))
5941 ctx_lo = context[0].lo;
5943 if (!(ctx_lo & CONTEXT_PASIDE)) {
5944 ctx_lo |= CONTEXT_PASIDE;
5945 context[0].lo = ctx_lo;
5947 iommu->flush.flush_context(iommu,
5948 domain->iommu_did[iommu->seq_id],
5949 PCI_DEVID(info->bus, info->devfn),
5950 DMA_CCMD_MASK_NOBIT,
5951 DMA_CCMD_DEVICE_INVL);
5954 /* Enable PASID support in the device, if it wasn't already */
5955 if (!info->pasid_enabled)
5956 iommu_enable_dev_iotlb(info);
5961 spin_unlock(&iommu->lock);
5962 spin_unlock_irqrestore(&device_domain_lock, flags);
5967 static void intel_iommu_apply_resv_region(struct device *dev,
5968 struct iommu_domain *domain,
5969 struct iommu_resv_region *region)
5971 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5972 unsigned long start, end;
5974 start = IOVA_PFN(region->start);
5975 end = IOVA_PFN(region->start + region->length - 1);
5977 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5980 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5982 if (dev_is_pci(dev))
5983 return pci_device_group(dev);
5984 return generic_device_group(dev);
5987 #ifdef CONFIG_INTEL_IOMMU_SVM
5988 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5990 struct intel_iommu *iommu;
5993 if (iommu_dummy(dev)) {
5995 "No IOMMU translation for device; cannot enable SVM\n");
5999 iommu = device_to_iommu(dev, &bus, &devfn);
6001 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6007 #endif /* CONFIG_INTEL_IOMMU_SVM */
6009 static int intel_iommu_enable_auxd(struct device *dev)
6011 struct device_domain_info *info;
6012 struct intel_iommu *iommu;
6013 unsigned long flags;
6017 iommu = device_to_iommu(dev, &bus, &devfn);
6018 if (!iommu || dmar_disabled)
6021 if (!sm_supported(iommu) || !pasid_supported(iommu))
6024 ret = intel_iommu_enable_pasid(iommu, dev);
6028 spin_lock_irqsave(&device_domain_lock, flags);
6029 info = dev->archdata.iommu;
6030 info->auxd_enabled = 1;
6031 spin_unlock_irqrestore(&device_domain_lock, flags);
6036 static int intel_iommu_disable_auxd(struct device *dev)
6038 struct device_domain_info *info;
6039 unsigned long flags;
6041 spin_lock_irqsave(&device_domain_lock, flags);
6042 info = dev->archdata.iommu;
6043 if (!WARN_ON(!info))
6044 info->auxd_enabled = 0;
6045 spin_unlock_irqrestore(&device_domain_lock, flags);
6051 * A PCI Express Designated Vendor-Specific Extended Capability is defined
6052 * in section 3.7 of the Intel Scalable I/O Virtualization technical spec,
6053 * so that system software and tools can detect endpoint devices supporting
6054 * Intel Scalable I/O Virtualization without a host driver dependency.
6056 * Returns the offset of the matching extended capability structure within
6057 * the device's PCI configuration space, or 0 if the device does not support it.
6060 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6065 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6067 pci_read_config_word(pdev, pos + 4, &vendor);
6068 pci_read_config_word(pdev, pos + 8, &id);
6069 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6072 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
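/*
 * A standalone user-space sketch of the same DVSEC walk (illustrative only;
 * the config path and helper below are hypothetical, while the 0x23
 * capability id, the Intel vendor id and the SIOV DVSEC id 5 mirror the logic
 * above). It scans the PCIe extended capability list in a device's sysfs
 * "config" file; note that unprivileged reads of that file are typically
 * truncated before the extended space, so this needs root.
 *
 *	#include <stdio.h>
 *	#include <stdint.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static uint32_t cfg_read32(int fd, int pos)
 *	{
 *		uint32_t v = 0;
 *
 *		pread(fd, &v, sizeof(v), pos);
 *		return v;
 *	}
 *
 *	int main(int argc, char **argv)
 *	{
 *		// e.g. /sys/bus/pci/devices/0000:00:02.0/config
 *		int fd = argc > 1 ? open(argv[1], O_RDONLY) : -1;
 *		int pos = 0x100;	// extended capabilities start here
 *
 *		while (fd >= 0 && pos) {
 *			uint32_t hdr = cfg_read32(fd, pos);
 *
 *			if ((hdr & 0xffff) == 0x23) {	// DVSEC capability
 *				uint16_t vendor = cfg_read32(fd, pos + 4) & 0xffff;
 *				uint16_t id = cfg_read32(fd, pos + 8) & 0xffff;
 *
 *				if (vendor == 0x8086 && id == 5) {
 *					printf("SIOV DVSEC at 0x%x\n", pos);
 *					break;
 *				}
 *			}
 *			pos = hdr >> 20;	// next capability offset
 *		}
 *		return 0;
 *	}
 */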
6079 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6081 if (feat == IOMMU_DEV_FEAT_AUX) {
6084 if (!dev_is_pci(dev) || dmar_disabled ||
6085 !scalable_mode_support() || !iommu_pasid_support())
6088 ret = pci_pasid_features(to_pci_dev(dev));
6092 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6099 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6101 if (feat == IOMMU_DEV_FEAT_AUX)
6102 return intel_iommu_enable_auxd(dev);
6108 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6110 if (feat == IOMMU_DEV_FEAT_AUX)
6111 return intel_iommu_disable_auxd(dev);
6117 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6119 struct device_domain_info *info = dev->archdata.iommu;
6121 if (feat == IOMMU_DEV_FEAT_AUX)
6122 return scalable_mode_support() && info && info->auxd_enabled;
6128 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6130 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6132 return dmar_domain->default_pasid > 0 ?
6133 dmar_domain->default_pasid : -EINVAL;
6136 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6139 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
6143 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6144 enum iommu_attr attr, void *data)
6146 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6147 unsigned long flags;
6150 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6154 case DOMAIN_ATTR_NESTING:
6155 spin_lock_irqsave(&device_domain_lock, flags);
6156 if (nested_mode_support() &&
6157 list_empty(&dmar_domain->devices)) {
6158 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6159 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6163 spin_unlock_irqrestore(&device_domain_lock, flags);
6173 const struct iommu_ops intel_iommu_ops = {
6174 .capable = intel_iommu_capable,
6175 .domain_alloc = intel_iommu_domain_alloc,
6176 .domain_free = intel_iommu_domain_free,
6177 .domain_set_attr = intel_iommu_domain_set_attr,
6178 .attach_dev = intel_iommu_attach_device,
6179 .detach_dev = intel_iommu_detach_device,
6180 .aux_attach_dev = intel_iommu_aux_attach_device,
6181 .aux_detach_dev = intel_iommu_aux_detach_device,
6182 .aux_get_pasid = intel_iommu_aux_get_pasid,
6183 .map = intel_iommu_map,
6184 .unmap = intel_iommu_unmap,
6185 .iova_to_phys = intel_iommu_iova_to_phys,
6186 .add_device = intel_iommu_add_device,
6187 .remove_device = intel_iommu_remove_device,
6188 .get_resv_regions = intel_iommu_get_resv_regions,
6189 .put_resv_regions = generic_iommu_put_resv_regions,
6190 .apply_resv_region = intel_iommu_apply_resv_region,
6191 .device_group = intel_iommu_device_group,
6192 .dev_has_feat = intel_iommu_dev_has_feat,
6193 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6194 .dev_enable_feat = intel_iommu_dev_enable_feat,
6195 .dev_disable_feat = intel_iommu_dev_disable_feat,
6196 .is_attach_deferred = intel_iommu_is_attach_deferred,
6197 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
6200 static void quirk_iommu_igfx(struct pci_dev *dev)
6202 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6206 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6208 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6209 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6210 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6215 /* Broadwell igfx malfunctions with dmar */
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6218 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6241 static void quirk_iommu_rwbf(struct pci_dev *dev)
6244 * Mobile 4 Series Chipset neglects to set RWBF capability,
6245 * but needs it. Same seems to hold for the desktop versions.
6247 pci_info(dev, "Forcing write-buffer flush capability\n");
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6260 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6261 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6262 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6263 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6264 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6265 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6266 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6267 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
6269 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6273 if (pci_read_config_word(dev, GGC, &ggc))
6276 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6277 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6279 } else if (dmar_map_gfx) {
6280 /* we have to ensure the gfx device is idle before we flush */
6281 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6282 intel_iommu_strict = 1;
6285 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6286 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
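/*
 * Quick decode of the GGC check above (standalone sketch, hypothetical
 * register value): bit 11 of GGC (GGC_MEMORY_VT_ENABLED) is what the quirk
 * uses to decide whether the BIOS reserved stolen memory for a shadow GTT
 * usable with VT-d.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		unsigned short ggc = 0x0b08;	// hypothetical GGC value
 *
 *		if (ggc & (0x8 << 8))
 *			printf("shadow GTT enabled, size field 0x%x\n",
 *			       (ggc >> 8) & 0xf);
 *		else
 *			printf("no shadow GTT reserved\n");
 *		return 0;
 *	}
 */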
6290 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6291 ISOCH DMAR unit for the Azalia sound device, but not give it any
6292 TLB entries, which causes it to deadlock. Check for that. We do
6293 this in a function called from init_dmars(), instead of in a PCI
6294 quirk, because we don't want to print the obnoxious "BIOS broken"
6295 message if VT-d is actually disabled.
6297 static void __init check_tylersburg_isoch(void)
6299 struct pci_dev *pdev;
6300 uint32_t vtisochctrl;
6302 /* If there's no Azalia in the system anyway, forget it. */
6303 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6308 /* System Management Registers. Might be hidden, in which case
6309 we can't do the sanity check. But that's OK, because the
6310 known-broken BIOSes _don't_ actually hide it, so far. */
6311 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6315 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6322 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6323 if (vtisochctrl & 1)
6326 /* Drop all bits other than the number of TLB entries */
6327 vtisochctrl &= 0x1c;
6329 /* If we have the recommended number of TLB entries (16), fine. */
6330 if (vtisochctrl == 0x10)
6333 /* Zero TLB entries? You get to ride the short bus to school. */
6335 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6336 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6337 dmi_get_system_info(DMI_BIOS_VENDOR),
6338 dmi_get_system_info(DMI_BIOS_VERSION),
6339 dmi_get_system_info(DMI_PRODUCT_VERSION));
6340 iommu_identity_mapping |= IDENTMAP_AZALIA;
6344 pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",