1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
53 #define ROOT_SIZE VTD_PAGE_SIZE
54 #define CONTEXT_SIZE VTD_PAGE_SIZE
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61 #define IOAPIC_RANGE_START (0xfee00000)
62 #define IOAPIC_RANGE_END (0xfeefffff)
63 #define IOVA_START_ADDR (0x1000)
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
76 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN (1)
82 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
84 /* page table handling */
85 #define LEVEL_STRIDE (9)
86 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
89 * This bitmap is used to advertise the page sizes our hardware support
90 * to the IOMMU core, which will then use this information to split
91 * physically contiguous memory regions it is mapping into page sizes
94 * Traditionally the IOMMU core just handed us the mappings directly,
95 * after making sure the size is an order of a 4KiB page and that the
96 * mapping has natural alignment.
98 * To retain this behavior, we currently advertise that we support
99 * all page sizes that are an order of 4KiB.
101 * If at some point we'd like to utilize the IOMMU core's new behavior,
102 * we could change this to advertise the real page sizes we support.
104 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
106 static inline int agaw_to_level(int agaw)
111 static inline int agaw_to_width(int agaw)
113 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
116 static inline int width_to_agaw(int width)
118 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
121 static inline unsigned int level_to_offset_bits(int level)
123 return (level - 1) * LEVEL_STRIDE;
126 static inline int pfn_level_offset(unsigned long pfn, int level)
128 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
131 static inline unsigned long level_mask(int level)
133 return -1UL << level_to_offset_bits(level);
136 static inline unsigned long level_size(int level)
138 return 1UL << level_to_offset_bits(level);
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
143 return (pfn + level_size(level) - 1) & level_mask(level);
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
148 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152 are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
155 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
160 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
164 return mm_to_dma_pfn(page_to_pfn(pg));
166 static inline unsigned long virt_to_dma_pfn(void *p)
168 return page_to_dma_pfn(virt_to_page(p));
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
178 * set to 1 to panic kernel if can't successfully enable VT-d
179 * (used when kernel is launched w/ TXT)
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
188 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
196 return re->lo & VTD_PAGE_MASK;
200 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
208 return re->hi & VTD_PAGE_MASK;
211 static inline void context_clear_pasid_enable(struct context_entry *context)
213 context->lo &= ~(1ULL << 11);
216 static inline bool context_pasid_enabled(struct context_entry *context)
218 return !!(context->lo & (1ULL << 11));
221 static inline void context_set_copied(struct context_entry *context)
223 context->hi |= (1ull << 3);
226 static inline bool context_copied(struct context_entry *context)
228 return !!(context->hi & (1ULL << 3));
231 static inline bool __context_present(struct context_entry *context)
233 return (context->lo & 1);
236 bool context_present(struct context_entry *context)
238 return context_pasid_enabled(context) ?
239 __context_present(context) :
240 __context_present(context) && !context_copied(context);
243 static inline void context_set_present(struct context_entry *context)
248 static inline void context_set_fault_enable(struct context_entry *context)
250 context->lo &= (((u64)-1) << 2) | 1;
253 static inline void context_set_translation_type(struct context_entry *context,
256 context->lo &= (((u64)-1) << 4) | 3;
257 context->lo |= (value & 3) << 2;
260 static inline void context_set_address_root(struct context_entry *context,
263 context->lo &= ~VTD_PAGE_MASK;
264 context->lo |= value & VTD_PAGE_MASK;
267 static inline void context_set_address_width(struct context_entry *context,
270 context->hi |= value & 7;
273 static inline void context_set_domain_id(struct context_entry *context,
276 context->hi |= (value & ((1 << 16) - 1)) << 8;
279 static inline int context_domain_id(struct context_entry *c)
281 return((c->hi >> 8) & 0xffff);
284 static inline void context_clear_entry(struct context_entry *context)
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
/* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
303 * This is a DMA domain allocated through the iommu domain allocation
304 * interface. But one or more devices belonging to this domain have
305 * been chosen to use a private domain. We should avoid to use the
306 * map/unmap/iova_to_phys APIs on it.
308 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
311 * When VT-d works in the scalable mode, it allows DMA translation to
312 * happen through either first level or second level page table. This
313 * bit marks that the DMA translation for the domain goes through the
314 * first level page table, otherwise, it goes through the second level.
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL BIT(2)
319 * Domain represents a virtual machine which demands iommu nested
320 * translation mode support.
322 #define DOMAIN_FLAG_NESTING_MODE BIT(3)
324 #define for_each_domain_iommu(idx, domain) \
325 for (idx = 0; idx < g_num_of_iommus; idx++) \
326 if (domain->iommu_refcnt[idx])
328 struct dmar_rmrr_unit {
329 struct list_head list; /* list of rmrr units */
330 struct acpi_dmar_header *hdr; /* ACPI header */
331 u64 base_address; /* reserved base address*/
332 u64 end_address; /* reserved end address */
333 struct dmar_dev_scope *devices; /* target devices */
334 int devices_cnt; /* target device count */
337 struct dmar_atsr_unit {
338 struct list_head list; /* list of ATSR units */
339 struct acpi_dmar_header *hdr; /* ACPI header */
340 struct dmar_dev_scope *devices; /* target devices */
341 int devices_cnt; /* target device count */
342 u8 include_all:1; /* include all ports */
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
348 #define for_each_rmrr_units(rmrr) \
349 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
351 /* bitmap for indexing intel_iommus */
352 static int g_num_of_iommus;
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
360 static int domain_detach_iommu(struct dmar_domain *domain,
361 struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
374 #ifdef INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
378 #endif /* INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
390 #define IDENTMAP_GFX 2
391 #define IDENTMAP_AZALIA 4
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) && \
402 to_pci_dev(d)->untrusted)
405 * Iterate over elements in device_domain_list and call the specified
406 * callback @fn against each element.
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409 void *data), void *data)
413 struct device_domain_info *info;
415 spin_lock_irqsave(&device_domain_lock, flags);
416 list_for_each_entry(info, &device_domain_list, global) {
417 ret = fn(info, data);
419 spin_unlock_irqrestore(&device_domain_lock, flags);
423 spin_unlock_irqrestore(&device_domain_lock, flags);
428 const struct iommu_ops intel_iommu_ops;
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
432 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
437 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
440 static void init_translation_status(struct intel_iommu *iommu)
444 gsts = readl(iommu->reg + DMAR_GSTS_REG);
445 if (gsts & DMA_GSTS_TES)
446 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
449 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
452 return container_of(dom, struct dmar_domain, domain);
455 static int __init intel_iommu_setup(char *str)
460 if (!strncmp(str, "on", 2)) {
462 pr_info("IOMMU enabled\n");
463 } else if (!strncmp(str, "off", 3)) {
465 no_platform_optin = 1;
466 pr_info("IOMMU disabled\n");
467 } else if (!strncmp(str, "igfx_off", 8)) {
469 pr_info("Disable GFX device mapping\n");
470 } else if (!strncmp(str, "forcedac", 8)) {
471 pr_info("Forcing DAC for PCI devices\n");
473 } else if (!strncmp(str, "strict", 6)) {
474 pr_info("Disable batched IOTLB flush\n");
475 intel_iommu_strict = 1;
476 } else if (!strncmp(str, "sp_off", 6)) {
477 pr_info("Disable supported super page\n");
478 intel_iommu_superpage = 0;
479 } else if (!strncmp(str, "sm_on", 5)) {
480 pr_info("Intel-IOMMU: scalable mode supported\n");
482 } else if (!strncmp(str, "tboot_noforce", 13)) {
484 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485 intel_iommu_tboot_noforce = 1;
486 } else if (!strncmp(str, "nobounce", 8)) {
487 pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
491 str += strcspn(str, ",");
497 __setup("intel_iommu=", intel_iommu_setup);
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
504 struct dmar_domain **domains;
507 domains = iommu->domains[idx];
511 return domains[did & 0xff];
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515 struct dmar_domain *domain)
517 struct dmar_domain **domains;
520 if (!iommu->domains[idx]) {
521 size_t size = 256 * sizeof(struct dmar_domain *);
522 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
525 domains = iommu->domains[idx];
526 if (WARN_ON(!domains))
529 domains[did & 0xff] = domain;
532 void *alloc_pgtable_page(int node)
537 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
539 vaddr = page_address(page);
543 void free_pgtable_page(void *vaddr)
545 free_page((unsigned long)vaddr);
548 static inline void *alloc_domain_mem(void)
550 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
553 static void free_domain_mem(void *vaddr)
555 kmem_cache_free(iommu_domain_cache, vaddr);
558 static inline void * alloc_devinfo_mem(void)
560 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
563 static inline void free_devinfo_mem(void *vaddr)
565 kmem_cache_free(iommu_devinfo_cache, vaddr);
568 static inline int domain_type_is_si(struct dmar_domain *domain)
570 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
575 return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
581 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
583 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
591 sagaw = cap_sagaw(iommu->cap);
592 for (agaw = width_to_agaw(max_gaw);
594 if (test_bit(agaw, &sagaw))
602 * Calculate max SAGAW for each iommu.
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
606 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
610 * calculate agaw for each iommu.
611 * "SAGAW" may be different across iommus, use a default agaw, and
612 * get a supported less agaw for iommus that don't support the default agaw.
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
616 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
619 /* This functionin only returns single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
624 /* si_domain and vm domain should not get here. */
625 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
628 for_each_domain_iommu(iommu_id, domain)
631 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
634 return g_iommus[iommu_id];
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
639 struct dmar_drhd_unit *drhd;
640 struct intel_iommu *iommu;
644 domain->iommu_coherency = 1;
646 for_each_domain_iommu(i, domain) {
648 if (!ecap_coherent(g_iommus[i]->ecap)) {
649 domain->iommu_coherency = 0;
656 /* No hardware attached; use lowest common denominator */
658 for_each_active_iommu(iommu, drhd) {
659 if (!ecap_coherent(iommu->ecap)) {
660 domain->iommu_coherency = 0;
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
669 struct dmar_drhd_unit *drhd;
670 struct intel_iommu *iommu;
674 for_each_active_iommu(iommu, drhd) {
676 if (!ecap_sc_support(iommu->ecap)) {
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688 struct intel_iommu *skip)
690 struct dmar_drhd_unit *drhd;
691 struct intel_iommu *iommu;
694 if (!intel_iommu_superpage) {
698 /* set iommu_superpage to the smallest common denominator */
700 for_each_active_iommu(iommu, drhd) {
702 if (domain && domain_use_first_level(domain)) {
703 if (!cap_fl1gp_support(iommu->cap))
706 mask &= cap_super_page_val(iommu->cap);
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
721 domain_update_iommu_coherency(domain);
722 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723 domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
729 struct root_entry *root = &iommu->root_entry[bus];
730 struct context_entry *context;
734 if (sm_supported(iommu)) {
742 context = phys_to_virt(*entry & VTD_PAGE_MASK);
744 unsigned long phy_addr;
748 context = alloc_pgtable_page(iommu->node);
752 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753 phy_addr = virt_to_phys((void *)context);
754 *entry = phy_addr | 1;
755 __iommu_flush_cache(iommu, entry, sizeof(*entry));
757 return &context[devfn];
760 static int iommu_dummy(struct device *dev)
762 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
765 static bool attach_deferred(struct device *dev)
767 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
771 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
772 * sub-hierarchy of a candidate PCI-PCI bridge
773 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
774 * @bridge: the candidate PCI-PCI bridge
776 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
779 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
781 struct pci_dev *pdev, *pbridge;
783 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
786 pdev = to_pci_dev(dev);
787 pbridge = to_pci_dev(bridge);
789 if (pbridge->subordinate &&
790 pbridge->subordinate->number <= pdev->bus->number &&
791 pbridge->subordinate->busn_res.end >= pdev->bus->number)
797 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
799 struct dmar_drhd_unit *drhd = NULL;
800 struct intel_iommu *iommu;
802 struct pci_dev *pdev = NULL;
806 if (iommu_dummy(dev))
809 if (dev_is_pci(dev)) {
810 struct pci_dev *pf_pdev;
812 pdev = pci_real_dma_dev(to_pci_dev(dev));
814 /* VFs aren't listed in scope tables; we need to look up
815 * the PF instead to find the IOMMU. */
816 pf_pdev = pci_physfn(pdev);
818 segment = pci_domain_nr(pdev->bus);
819 } else if (has_acpi_companion(dev))
820 dev = &ACPI_COMPANION(dev)->dev;
823 for_each_active_iommu(iommu, drhd) {
824 if (pdev && segment != drhd->segment)
827 for_each_active_dev_scope(drhd->devices,
828 drhd->devices_cnt, i, tmp) {
830 /* For a VF use its original BDF# not that of the PF
831 * which we used for the IOMMU lookup. Strictly speaking
832 * we could do this for all PCI devices; we only need to
833 * get the BDF# from the scope table for ACPI matches. */
834 if (pdev && pdev->is_virtfn)
837 *bus = drhd->devices[i].bus;
838 *devfn = drhd->devices[i].devfn;
842 if (is_downstream_to_pci_bridge(dev, tmp))
846 if (pdev && drhd->include_all) {
848 *bus = pdev->bus->number;
849 *devfn = pdev->devfn;
860 static void domain_flush_cache(struct dmar_domain *domain,
861 void *addr, int size)
863 if (!domain->iommu_coherency)
864 clflush_cache_range(addr, size);
867 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
869 struct context_entry *context;
873 spin_lock_irqsave(&iommu->lock, flags);
874 context = iommu_context_addr(iommu, bus, devfn, 0);
876 ret = context_present(context);
877 spin_unlock_irqrestore(&iommu->lock, flags);
881 static void free_context_table(struct intel_iommu *iommu)
885 struct context_entry *context;
887 spin_lock_irqsave(&iommu->lock, flags);
888 if (!iommu->root_entry) {
891 for (i = 0; i < ROOT_ENTRY_NR; i++) {
892 context = iommu_context_addr(iommu, i, 0, 0);
894 free_pgtable_page(context);
896 if (!sm_supported(iommu))
899 context = iommu_context_addr(iommu, i, 0x80, 0);
901 free_pgtable_page(context);
904 free_pgtable_page(iommu->root_entry);
905 iommu->root_entry = NULL;
907 spin_unlock_irqrestore(&iommu->lock, flags);
910 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
911 unsigned long pfn, int *target_level)
913 struct dma_pte *parent, *pte;
914 int level = agaw_to_level(domain->agaw);
917 BUG_ON(!domain->pgd);
919 if (!domain_pfn_supported(domain, pfn))
920 /* Address beyond IOMMU's addressing capabilities. */
923 parent = domain->pgd;
928 offset = pfn_level_offset(pfn, level);
929 pte = &parent[offset];
930 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
932 if (level == *target_level)
935 if (!dma_pte_present(pte)) {
938 tmp_page = alloc_pgtable_page(domain->nid);
943 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
944 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
945 if (domain_use_first_level(domain))
946 pteval |= DMA_FL_PTE_XD;
947 if (cmpxchg64(&pte->val, 0ULL, pteval))
948 /* Someone else set it while we were thinking; use theirs. */
949 free_pgtable_page(tmp_page);
951 domain_flush_cache(domain, pte, sizeof(*pte));
956 parent = phys_to_virt(dma_pte_addr(pte));
961 *target_level = level;
966 /* return address's pte at specific level */
967 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969 int level, int *large_page)
971 struct dma_pte *parent, *pte;
972 int total = agaw_to_level(domain->agaw);
975 parent = domain->pgd;
976 while (level <= total) {
977 offset = pfn_level_offset(pfn, total);
978 pte = &parent[offset];
982 if (!dma_pte_present(pte)) {
987 if (dma_pte_superpage(pte)) {
992 parent = phys_to_virt(dma_pte_addr(pte));
998 /* clear last level pte, a tlb flush should be followed */
999 static void dma_pte_clear_range(struct dmar_domain *domain,
1000 unsigned long start_pfn,
1001 unsigned long last_pfn)
1003 unsigned int large_page;
1004 struct dma_pte *first_pte, *pte;
1006 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1007 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1008 BUG_ON(start_pfn > last_pfn);
1010 /* we don't need lock here; nobody else touches the iova range */
1013 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1020 start_pfn += lvl_to_nr_pages(large_page);
1022 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024 domain_flush_cache(domain, first_pte,
1025 (void *)pte - (void *)first_pte);
1027 } while (start_pfn && start_pfn <= last_pfn);
1030 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1031 int retain_level, struct dma_pte *pte,
1032 unsigned long pfn, unsigned long start_pfn,
1033 unsigned long last_pfn)
1035 pfn = max(start_pfn, pfn);
1036 pte = &pte[pfn_level_offset(pfn, level)];
1039 unsigned long level_pfn;
1040 struct dma_pte *level_pte;
1042 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1045 level_pfn = pfn & level_mask(level);
1046 level_pte = phys_to_virt(dma_pte_addr(pte));
1049 dma_pte_free_level(domain, level - 1, retain_level,
1050 level_pte, level_pfn, start_pfn,
1055 * Free the page table if we're below the level we want to
1056 * retain and the range covers the entire table.
1058 if (level < retain_level && !(start_pfn > level_pfn ||
1059 last_pfn < level_pfn + level_size(level) - 1)) {
1061 domain_flush_cache(domain, pte, sizeof(*pte));
1062 free_pgtable_page(level_pte);
1065 pfn += level_size(level);
1066 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1070 * clear last level (leaf) ptes and free page table pages below the
1071 * level we wish to keep intact.
1073 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1074 unsigned long start_pfn,
1075 unsigned long last_pfn,
1078 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1079 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1080 BUG_ON(start_pfn > last_pfn);
1082 dma_pte_clear_range(domain, start_pfn, last_pfn);
1084 /* We don't need lock here; nobody else touches the iova range */
1085 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1086 domain->pgd, 0, start_pfn, last_pfn);
1089 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1090 free_pgtable_page(domain->pgd);
1095 /* When a page at a given level is being unlinked from its parent, we don't
1096 need to *modify* it at all. All we need to do is make a list of all the
1097 pages which can be freed just as soon as we've flushed the IOTLB and we
1098 know the hardware page-walk will no longer touch them.
1099 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1102 int level, struct dma_pte *pte,
1103 struct page *freelist)
1107 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1108 pg->freelist = freelist;
1114 pte = page_address(pg);
1116 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1117 freelist = dma_pte_list_pagetables(domain, level - 1,
1120 } while (!first_pte_in_page(pte));
1125 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1126 struct dma_pte *pte, unsigned long pfn,
1127 unsigned long start_pfn,
1128 unsigned long last_pfn,
1129 struct page *freelist)
1131 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1133 pfn = max(start_pfn, pfn);
1134 pte = &pte[pfn_level_offset(pfn, level)];
1137 unsigned long level_pfn;
1139 if (!dma_pte_present(pte))
1142 level_pfn = pfn & level_mask(level);
1144 /* If range covers entire pagetable, free it */
1145 if (start_pfn <= level_pfn &&
1146 last_pfn >= level_pfn + level_size(level) - 1) {
1147 /* These suborbinate page tables are going away entirely. Don't
1148 bother to clear them; we're just going to *free* them. */
1149 if (level > 1 && !dma_pte_superpage(pte))
1150 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1156 } else if (level > 1) {
1157 /* Recurse down into a level that isn't *entirely* obsolete */
1158 freelist = dma_pte_clear_level(domain, level - 1,
1159 phys_to_virt(dma_pte_addr(pte)),
1160 level_pfn, start_pfn, last_pfn,
1164 pfn += level_size(level);
1165 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1168 domain_flush_cache(domain, first_pte,
1169 (void *)++last_pte - (void *)first_pte);
1174 /* We can't just free the pages because the IOMMU may still be walking
1175 the page tables, and may have cached the intermediate levels. The
1176 pages can only be freed after the IOTLB flush has been done. */
1177 static struct page *domain_unmap(struct dmar_domain *domain,
1178 unsigned long start_pfn,
1179 unsigned long last_pfn)
1181 struct page *freelist;
1183 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1184 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1185 BUG_ON(start_pfn > last_pfn);
1187 /* we don't need lock here; nobody else touches the iova range */
1188 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1189 domain->pgd, 0, start_pfn, last_pfn, NULL);
1192 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1193 struct page *pgd_page = virt_to_page(domain->pgd);
1194 pgd_page->freelist = freelist;
1195 freelist = pgd_page;
1203 static void dma_free_pagelist(struct page *freelist)
1207 while ((pg = freelist)) {
1208 freelist = pg->freelist;
1209 free_pgtable_page(page_address(pg));
1213 static void iova_entry_free(unsigned long data)
1215 struct page *freelist = (struct page *)data;
1217 dma_free_pagelist(freelist);
1220 /* iommu handling */
1221 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1223 struct root_entry *root;
1224 unsigned long flags;
1226 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1228 pr_err("Allocating root entry for %s failed\n",
1233 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1235 spin_lock_irqsave(&iommu->lock, flags);
1236 iommu->root_entry = root;
1237 spin_unlock_irqrestore(&iommu->lock, flags);
1242 static void iommu_set_root_entry(struct intel_iommu *iommu)
1248 addr = virt_to_phys(iommu->root_entry);
1249 if (sm_supported(iommu))
1250 addr |= DMA_RTADDR_SMT;
1252 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1253 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1255 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1257 /* Make sure hardware complete it */
1258 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1259 readl, (sts & DMA_GSTS_RTPS), sts);
1261 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1264 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1269 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1272 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1275 /* Make sure hardware complete it */
1276 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1277 readl, (!(val & DMA_GSTS_WBFS)), val);
1279 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1282 /* return value determine if we need a write buffer flush */
1283 static void __iommu_flush_context(struct intel_iommu *iommu,
1284 u16 did, u16 source_id, u8 function_mask,
1291 case DMA_CCMD_GLOBAL_INVL:
1292 val = DMA_CCMD_GLOBAL_INVL;
1294 case DMA_CCMD_DOMAIN_INVL:
1295 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1297 case DMA_CCMD_DEVICE_INVL:
1298 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1299 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1304 val |= DMA_CCMD_ICC;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1309 /* Make sure hardware complete it */
1310 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1311 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1313 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1316 /* return value determine if we need a write buffer flush */
1317 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1318 u64 addr, unsigned int size_order, u64 type)
1320 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1321 u64 val = 0, val_iva = 0;
1325 case DMA_TLB_GLOBAL_FLUSH:
1326 /* global flush doesn't need set IVA_REG */
1327 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1329 case DMA_TLB_DSI_FLUSH:
1330 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1332 case DMA_TLB_PSI_FLUSH:
1333 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1334 /* IH bit is passed in as part of address */
1335 val_iva = size_order | addr;
1340 /* Note: set drain read/write */
1343 * This is probably to be super secure.. Looks like we can
1344 * ignore it without any impact.
1346 if (cap_read_drain(iommu->cap))
1347 val |= DMA_TLB_READ_DRAIN;
1349 if (cap_write_drain(iommu->cap))
1350 val |= DMA_TLB_WRITE_DRAIN;
1352 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1353 /* Note: Only uses first TLB reg currently */
1355 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1356 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1358 /* Make sure hardware complete it */
1359 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1360 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1362 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364 /* check IOTLB invalidation granularity */
1365 if (DMA_TLB_IAIG(val) == 0)
1366 pr_err("Flush IOTLB failed\n");
1367 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1368 pr_debug("TLB flush request %Lx, actual %Lx\n",
1369 (unsigned long long)DMA_TLB_IIRG(type),
1370 (unsigned long long)DMA_TLB_IAIG(val));
1373 static struct device_domain_info *
1374 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1377 struct device_domain_info *info;
1379 assert_spin_locked(&device_domain_lock);
1384 list_for_each_entry(info, &domain->devices, link)
1385 if (info->iommu == iommu && info->bus == bus &&
1386 info->devfn == devfn) {
1387 if (info->ats_supported && info->dev)
1395 static void domain_update_iotlb(struct dmar_domain *domain)
1397 struct device_domain_info *info;
1398 bool has_iotlb_device = false;
1400 assert_spin_locked(&device_domain_lock);
1402 list_for_each_entry(info, &domain->devices, link) {
1403 struct pci_dev *pdev;
1405 if (!info->dev || !dev_is_pci(info->dev))
1408 pdev = to_pci_dev(info->dev);
1409 if (pdev->ats_enabled) {
1410 has_iotlb_device = true;
1415 domain->has_iotlb_device = has_iotlb_device;
1418 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1420 struct pci_dev *pdev;
1422 assert_spin_locked(&device_domain_lock);
1424 if (!info || !dev_is_pci(info->dev))
1427 pdev = to_pci_dev(info->dev);
1428 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1429 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1430 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1431 * reserved, which should be set to 0.
1433 if (!ecap_dit(info->iommu->ecap))
1436 struct pci_dev *pf_pdev;
1438 /* pdev will be returned if device is not a vf */
1439 pf_pdev = pci_physfn(pdev);
1440 info->pfsid = pci_dev_id(pf_pdev);
1443 #ifdef CONFIG_INTEL_IOMMU_SVM
1444 /* The PCIe spec, in its wisdom, declares that the behaviour of
1445 the device if you enable PASID support after ATS support is
1446 undefined. So always enable PASID support on devices which
1447 have it, even if we can't yet know if we're ever going to
1449 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1450 info->pasid_enabled = 1;
1452 if (info->pri_supported &&
1453 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1454 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1455 info->pri_enabled = 1;
1457 if (!pdev->untrusted && info->ats_supported &&
1458 pci_ats_page_aligned(pdev) &&
1459 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1460 info->ats_enabled = 1;
1461 domain_update_iotlb(info->domain);
1462 info->ats_qdep = pci_ats_queue_depth(pdev);
1466 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1468 struct pci_dev *pdev;
1470 assert_spin_locked(&device_domain_lock);
1472 if (!dev_is_pci(info->dev))
1475 pdev = to_pci_dev(info->dev);
1477 if (info->ats_enabled) {
1478 pci_disable_ats(pdev);
1479 info->ats_enabled = 0;
1480 domain_update_iotlb(info->domain);
1482 #ifdef CONFIG_INTEL_IOMMU_SVM
1483 if (info->pri_enabled) {
1484 pci_disable_pri(pdev);
1485 info->pri_enabled = 0;
1487 if (info->pasid_enabled) {
1488 pci_disable_pasid(pdev);
1489 info->pasid_enabled = 0;
1494 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1495 u64 addr, unsigned mask)
1498 unsigned long flags;
1499 struct device_domain_info *info;
1501 if (!domain->has_iotlb_device)
1504 spin_lock_irqsave(&device_domain_lock, flags);
1505 list_for_each_entry(info, &domain->devices, link) {
1506 if (!info->ats_enabled)
1509 sid = info->bus << 8 | info->devfn;
1510 qdep = info->ats_qdep;
1511 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1514 spin_unlock_irqrestore(&device_domain_lock, flags);
1517 static void domain_flush_piotlb(struct intel_iommu *iommu,
1518 struct dmar_domain *domain,
1519 u64 addr, unsigned long npages, bool ih)
1521 u16 did = domain->iommu_did[iommu->seq_id];
1523 if (domain->default_pasid)
1524 qi_flush_piotlb(iommu, did, domain->default_pasid,
1527 if (!list_empty(&domain->devices))
1528 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1531 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1532 struct dmar_domain *domain,
1533 unsigned long pfn, unsigned int pages,
1536 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1537 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1538 u16 did = domain->iommu_did[iommu->seq_id];
1545 if (domain_use_first_level(domain)) {
1546 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1549 * Fallback to domain selective flush if no PSI support or
1550 * the size is too big. PSI requires page size to be 2 ^ x,
1551 * and the base address is naturally aligned to the size.
1553 if (!cap_pgsel_inv(iommu->cap) ||
1554 mask > cap_max_amask_val(iommu->cap))
1555 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1558 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1563 * In caching mode, changes of pages from non-present to present require
1564 * flush. However, device IOTLB doesn't need to be flushed in this case.
1566 if (!cap_caching_mode(iommu->cap) || !map)
1567 iommu_flush_dev_iotlb(domain, addr, mask);
1570 /* Notification for newly created mappings */
1571 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1572 struct dmar_domain *domain,
1573 unsigned long pfn, unsigned int pages)
1576 * It's a non-present to present mapping. Only flush if caching mode
1579 if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1580 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1582 iommu_flush_write_buffer(iommu);
1585 static void iommu_flush_iova(struct iova_domain *iovad)
1587 struct dmar_domain *domain;
1590 domain = container_of(iovad, struct dmar_domain, iovad);
1592 for_each_domain_iommu(idx, domain) {
1593 struct intel_iommu *iommu = g_iommus[idx];
1594 u16 did = domain->iommu_did[iommu->seq_id];
1596 if (domain_use_first_level(domain))
1597 domain_flush_piotlb(iommu, domain, 0, -1, 0);
1599 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1602 if (!cap_caching_mode(iommu->cap))
1603 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1604 0, MAX_AGAW_PFN_WIDTH);
1608 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1611 unsigned long flags;
1613 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1616 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1617 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1618 pmen &= ~DMA_PMEN_EPM;
1619 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1621 /* wait for the protected region status bit to clear */
1622 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1623 readl, !(pmen & DMA_PMEN_PRS), pmen);
1625 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1628 static void iommu_enable_translation(struct intel_iommu *iommu)
1631 unsigned long flags;
1633 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634 iommu->gcmd |= DMA_GCMD_TE;
1635 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1637 /* Make sure hardware complete it */
1638 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1639 readl, (sts & DMA_GSTS_TES), sts);
1641 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1644 static void iommu_disable_translation(struct intel_iommu *iommu)
1649 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1650 iommu->gcmd &= ~DMA_GCMD_TE;
1651 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653 /* Make sure hardware complete it */
1654 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1655 readl, (!(sts & DMA_GSTS_TES)), sts);
1657 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1660 static int iommu_init_domains(struct intel_iommu *iommu)
1662 u32 ndomains, nlongs;
1665 ndomains = cap_ndoms(iommu->cap);
1666 pr_debug("%s: Number of Domains supported <%d>\n",
1667 iommu->name, ndomains);
1668 nlongs = BITS_TO_LONGS(ndomains);
1670 spin_lock_init(&iommu->lock);
1672 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1673 if (!iommu->domain_ids) {
1674 pr_err("%s: Allocating domain id array failed\n",
1679 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1680 iommu->domains = kzalloc(size, GFP_KERNEL);
1682 if (iommu->domains) {
1683 size = 256 * sizeof(struct dmar_domain *);
1684 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1687 if (!iommu->domains || !iommu->domains[0]) {
1688 pr_err("%s: Allocating domain array failed\n",
1690 kfree(iommu->domain_ids);
1691 kfree(iommu->domains);
1692 iommu->domain_ids = NULL;
1693 iommu->domains = NULL;
1698 * If Caching mode is set, then invalid translations are tagged
1699 * with domain-id 0, hence we need to pre-allocate it. We also
1700 * use domain-id 0 as a marker for non-allocated domain-id, so
1701 * make sure it is not used for a real domain.
1703 set_bit(0, iommu->domain_ids);
1706 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1707 * entry for first-level or pass-through translation modes should
1708 * be programmed with a domain id different from those used for
1709 * second-level or nested translation. We reserve a domain id for
1712 if (sm_supported(iommu))
1713 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1718 static void disable_dmar_iommu(struct intel_iommu *iommu)
1720 struct device_domain_info *info, *tmp;
1721 unsigned long flags;
1723 if (!iommu->domains || !iommu->domain_ids)
1726 spin_lock_irqsave(&device_domain_lock, flags);
1727 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1728 if (info->iommu != iommu)
1731 if (!info->dev || !info->domain)
1734 __dmar_remove_one_dev_info(info);
1736 spin_unlock_irqrestore(&device_domain_lock, flags);
1738 if (iommu->gcmd & DMA_GCMD_TE)
1739 iommu_disable_translation(iommu);
1742 static void free_dmar_iommu(struct intel_iommu *iommu)
1744 if ((iommu->domains) && (iommu->domain_ids)) {
1745 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1748 for (i = 0; i < elems; i++)
1749 kfree(iommu->domains[i]);
1750 kfree(iommu->domains);
1751 kfree(iommu->domain_ids);
1752 iommu->domains = NULL;
1753 iommu->domain_ids = NULL;
1756 g_iommus[iommu->seq_id] = NULL;
1758 /* free context mapping */
1759 free_context_table(iommu);
1761 #ifdef CONFIG_INTEL_IOMMU_SVM
1762 if (pasid_supported(iommu)) {
1763 if (ecap_prs(iommu->ecap))
1764 intel_svm_finish_prq(iommu);
1770 * Check and return whether first level is used by default for
1773 static bool first_level_by_default(void)
1775 struct dmar_drhd_unit *drhd;
1776 struct intel_iommu *iommu;
1777 static int first_level_support = -1;
1779 if (likely(first_level_support != -1))
1780 return first_level_support;
1782 first_level_support = 1;
1785 for_each_active_iommu(iommu, drhd) {
1786 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1787 first_level_support = 0;
1793 return first_level_support;
1796 static struct dmar_domain *alloc_domain(int flags)
1798 struct dmar_domain *domain;
1800 domain = alloc_domain_mem();
1804 memset(domain, 0, sizeof(*domain));
1805 domain->nid = NUMA_NO_NODE;
1806 domain->flags = flags;
1807 if (first_level_by_default())
1808 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1809 domain->has_iotlb_device = false;
1810 INIT_LIST_HEAD(&domain->devices);
1815 /* Must be called with iommu->lock */
1816 static int domain_attach_iommu(struct dmar_domain *domain,
1817 struct intel_iommu *iommu)
1819 unsigned long ndomains;
1822 assert_spin_locked(&device_domain_lock);
1823 assert_spin_locked(&iommu->lock);
1825 domain->iommu_refcnt[iommu->seq_id] += 1;
1826 domain->iommu_count += 1;
1827 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1828 ndomains = cap_ndoms(iommu->cap);
1829 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1831 if (num >= ndomains) {
1832 pr_err("%s: No free domain ids\n", iommu->name);
1833 domain->iommu_refcnt[iommu->seq_id] -= 1;
1834 domain->iommu_count -= 1;
1838 set_bit(num, iommu->domain_ids);
1839 set_iommu_domain(iommu, num, domain);
1841 domain->iommu_did[iommu->seq_id] = num;
1842 domain->nid = iommu->node;
1844 domain_update_iommu_cap(domain);
1850 static int domain_detach_iommu(struct dmar_domain *domain,
1851 struct intel_iommu *iommu)
1855 assert_spin_locked(&device_domain_lock);
1856 assert_spin_locked(&iommu->lock);
1858 domain->iommu_refcnt[iommu->seq_id] -= 1;
1859 count = --domain->iommu_count;
1860 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1861 num = domain->iommu_did[iommu->seq_id];
1862 clear_bit(num, iommu->domain_ids);
1863 set_iommu_domain(iommu, num, NULL);
1865 domain_update_iommu_cap(domain);
1866 domain->iommu_did[iommu->seq_id] = 0;
1872 static struct iova_domain reserved_iova_list;
1873 static struct lock_class_key reserved_rbtree_key;
1875 static int dmar_init_reserved_ranges(void)
1877 struct pci_dev *pdev = NULL;
1881 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1883 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1884 &reserved_rbtree_key);
1886 /* IOAPIC ranges shouldn't be accessed by DMA */
1887 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1888 IOVA_PFN(IOAPIC_RANGE_END));
1890 pr_err("Reserve IOAPIC range failed\n");
1894 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1895 for_each_pci_dev(pdev) {
1898 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1899 r = &pdev->resource[i];
1900 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1902 iova = reserve_iova(&reserved_iova_list,
1906 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1914 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1916 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1919 static inline int guestwidth_to_adjustwidth(int gaw)
1922 int r = (gaw - 12) % 9;
1933 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1936 int adjust_width, agaw;
1937 unsigned long sagaw;
1940 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1942 if (!intel_iommu_strict) {
1943 ret = init_iova_flush_queue(&domain->iovad,
1944 iommu_flush_iova, iova_entry_free);
1946 pr_info("iova flush queue initialization failed\n");
1949 domain_reserve_special_ranges(domain);
1951 /* calculate AGAW */
1952 if (guest_width > cap_mgaw(iommu->cap))
1953 guest_width = cap_mgaw(iommu->cap);
1954 domain->gaw = guest_width;
1955 adjust_width = guestwidth_to_adjustwidth(guest_width);
1956 agaw = width_to_agaw(adjust_width);
1957 sagaw = cap_sagaw(iommu->cap);
1958 if (!test_bit(agaw, &sagaw)) {
1959 /* hardware doesn't support it, choose a bigger one */
1960 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1961 agaw = find_next_bit(&sagaw, 5, agaw);
1965 domain->agaw = agaw;
1967 if (ecap_coherent(iommu->ecap))
1968 domain->iommu_coherency = 1;
1970 domain->iommu_coherency = 0;
1972 if (ecap_sc_support(iommu->ecap))
1973 domain->iommu_snooping = 1;
1975 domain->iommu_snooping = 0;
1977 if (intel_iommu_superpage)
1978 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1980 domain->iommu_superpage = 0;
1982 domain->nid = iommu->node;
1984 /* always allocate the top pgd */
1985 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1988 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1992 static void domain_exit(struct dmar_domain *domain)
1995 /* Remove associated devices and clear attached or cached domains */
1996 domain_remove_dev_info(domain);
1999 put_iova_domain(&domain->iovad);
2002 struct page *freelist;
2004 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2005 dma_free_pagelist(freelist);
2008 free_domain_mem(domain);
2012 * Get the PASID directory size for scalable mode context entry.
2013 * Value of X in the PDTS field of a scalable mode context entry
2014 * indicates PASID directory with 2^(X + 7) entries.
2016 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2020 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2021 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2029 * Set the RID_PASID field of a scalable mode context entry. The
2030 * IOMMU hardware will use the PASID value set in this field for
2031 * DMA translations of DMA requests without PASID.
2034 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2036 context->hi |= pasid & ((1 << 20) - 1);
2037 context->hi |= (1 << 20);
2041 * Set the DTE(Device-TLB Enable) field of a scalable mode context
2044 static inline void context_set_sm_dte(struct context_entry *context)
2046 context->lo |= (1 << 2);
2050 * Set the PRE(Page Request Enable) field of a scalable mode context
2053 static inline void context_set_sm_pre(struct context_entry *context)
2055 context->lo |= (1 << 4);
2058 /* Convert value to context PASID directory size field coding. */
2059 #define context_pdts(pds) (((pds) & 0x7) << 9)
2061 static int domain_context_mapping_one(struct dmar_domain *domain,
2062 struct intel_iommu *iommu,
2063 struct pasid_table *table,
2066 u16 did = domain->iommu_did[iommu->seq_id];
2067 int translation = CONTEXT_TT_MULTI_LEVEL;
2068 struct device_domain_info *info = NULL;
2069 struct context_entry *context;
2070 unsigned long flags;
2075 if (hw_pass_through && domain_type_is_si(domain))
2076 translation = CONTEXT_TT_PASS_THROUGH;
2078 pr_debug("Set context mapping for %02x:%02x.%d\n",
2079 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2081 BUG_ON(!domain->pgd);
2083 spin_lock_irqsave(&device_domain_lock, flags);
2084 spin_lock(&iommu->lock);
2087 context = iommu_context_addr(iommu, bus, devfn, 1);
2092 if (context_present(context))
2096 * For kdump cases, old valid entries may be cached due to the
2097 * in-flight DMA and copied pgtable, but there is no unmapping
2098 * behaviour for them, thus we need an explicit cache flush for
2099 * the newly-mapped device. For kdump, at this point, the device
2100 * is supposed to finish reset at its driver probe stage, so no
2101 * in-flight DMA will exist, and we don't need to worry anymore
2104 if (context_copied(context)) {
2105 u16 did_old = context_domain_id(context);
2107 if (did_old < cap_ndoms(iommu->cap)) {
2108 iommu->flush.flush_context(iommu, did_old,
2109 (((u16)bus) << 8) | devfn,
2110 DMA_CCMD_MASK_NOBIT,
2111 DMA_CCMD_DEVICE_INVL);
2112 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2117 context_clear_entry(context);
2119 if (sm_supported(iommu)) {
2124 /* Setup the PASID DIR pointer: */
2125 pds = context_get_sm_pds(table);
2126 context->lo = (u64)virt_to_phys(table->table) |
2129 /* Setup the RID_PASID field: */
2130 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2133 * Setup the Device-TLB enable bit and Page request
2136 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2137 if (info && info->ats_supported)
2138 context_set_sm_dte(context);
2139 if (info && info->pri_supported)
2140 context_set_sm_pre(context);
2142 struct dma_pte *pgd = domain->pgd;
2145 context_set_domain_id(context, did);
2147 if (translation != CONTEXT_TT_PASS_THROUGH) {
2149 * Skip top levels of page tables for iommu which has
2150 * less agaw than default. Unnecessary for PT mode.
2152 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2154 pgd = phys_to_virt(dma_pte_addr(pgd));
2155 if (!dma_pte_present(pgd))
2159 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2160 if (info && info->ats_supported)
2161 translation = CONTEXT_TT_DEV_IOTLB;
2163 translation = CONTEXT_TT_MULTI_LEVEL;
2165 context_set_address_root(context, virt_to_phys(pgd));
2166 context_set_address_width(context, agaw);
2169 * In pass through mode, AW must be programmed to
2170 * indicate the largest AGAW value supported by
2171 * hardware. And ASR is ignored by hardware.
2173 context_set_address_width(context, iommu->msagaw);
2176 context_set_translation_type(context, translation);
2179 context_set_fault_enable(context);
2180 context_set_present(context);
2181 domain_flush_cache(domain, context, sizeof(*context));
2184 * It's a non-present to present mapping. If hardware doesn't cache
2185 * non-present entry we only need to flush the write-buffer. If the
2186 * _does_ cache non-present entries, then it does so in the special
2187 * domain #0, which we have to flush:
2189 if (cap_caching_mode(iommu->cap)) {
2190 iommu->flush.flush_context(iommu, 0,
2191 (((u16)bus) << 8) | devfn,
2192 DMA_CCMD_MASK_NOBIT,
2193 DMA_CCMD_DEVICE_INVL);
2194 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2196 iommu_flush_write_buffer(iommu);
2198 iommu_enable_dev_iotlb(info);
2203 spin_unlock(&iommu->lock);
2204 spin_unlock_irqrestore(&device_domain_lock, flags);
2209 struct domain_context_mapping_data {
2210 struct dmar_domain *domain;
2211 struct intel_iommu *iommu;
2212 struct pasid_table *table;
2215 static int domain_context_mapping_cb(struct pci_dev *pdev,
2216 u16 alias, void *opaque)
2218 struct domain_context_mapping_data *data = opaque;
2220 return domain_context_mapping_one(data->domain, data->iommu,
2221 data->table, PCI_BUS_NUM(alias),
2226 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2228 struct domain_context_mapping_data data;
2229 struct pasid_table *table;
2230 struct intel_iommu *iommu;
2233 iommu = device_to_iommu(dev, &bus, &devfn);
2237 table = intel_pasid_get_table(dev);
2239 if (!dev_is_pci(dev))
2240 return domain_context_mapping_one(domain, iommu, table,
2243 data.domain = domain;
2247 return pci_for_each_dma_alias(to_pci_dev(dev),
2248 &domain_context_mapping_cb, &data);
2251 static int domain_context_mapped_cb(struct pci_dev *pdev,
2252 u16 alias, void *opaque)
2254 struct intel_iommu *iommu = opaque;
2256 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2259 static int domain_context_mapped(struct device *dev)
2261 struct intel_iommu *iommu;
2264 iommu = device_to_iommu(dev, &bus, &devfn);
2268 if (!dev_is_pci(dev))
2269 return device_context_mapped(iommu, bus, devfn);
2271 return !pci_for_each_dma_alias(to_pci_dev(dev),
2272 domain_context_mapped_cb, iommu);
2275 /* Returns a number of VTD pages, but aligned to MM page size */
2276 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2279 host_addr &= ~PAGE_MASK;
2280 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2283 /* Return largest possible superpage level for a given mapping */
2284 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2285 unsigned long iov_pfn,
2286 unsigned long phy_pfn,
2287 unsigned long pages)
2289 int support, level = 1;
2290 unsigned long pfnmerge;
2292 support = domain->iommu_superpage;
2294 /* To use a large page, the virtual *and* physical addresses
2295 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2296 of them will mean we have to use smaller pages. So just
2297 merge them and check both at once. */
2298 pfnmerge = iov_pfn | phy_pfn;
2300 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2301 pages >>= VTD_STRIDE_SHIFT;
2304 pfnmerge >>= VTD_STRIDE_SHIFT;
2311 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2312 struct scatterlist *sg, unsigned long phys_pfn,
2313 unsigned long nr_pages, int prot)
2315 struct dma_pte *first_pte = NULL, *pte = NULL;
2316 phys_addr_t uninitialized_var(pteval);
2317 unsigned long sg_res = 0;
2318 unsigned int largepage_lvl = 0;
2319 unsigned long lvl_pages = 0;
2322 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2324 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2327 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2328 if (domain_use_first_level(domain))
2329 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2333 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2336 while (nr_pages > 0) {
2340 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2342 sg_res = aligned_nrpages(sg->offset, sg->length);
2343 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2344 sg->dma_length = sg->length;
2345 pteval = (sg_phys(sg) - pgoff) | attr;
2346 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2350 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2352 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2355 /* It is large page*/
2356 if (largepage_lvl > 1) {
2357 unsigned long nr_superpages, end_pfn;
2359 pteval |= DMA_PTE_LARGE_PAGE;
2360 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2362 nr_superpages = sg_res / lvl_pages;
2363 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2366 * Ensure that old small page tables are
2367 * removed to make room for superpage(s).
2368 * We're adding new large pages, so make sure
2369 * we don't remove their parent tables.
2371 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2374 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2378 /* We don't need lock here, nobody else
2379 * touches the iova range
2381 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2383 static int dumps = 5;
2384 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2385 iov_pfn, tmp, (unsigned long long)pteval);
2388 debug_dma_dump_mappings(NULL);
2393 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2395 BUG_ON(nr_pages < lvl_pages);
2396 BUG_ON(sg_res < lvl_pages);
2398 nr_pages -= lvl_pages;
2399 iov_pfn += lvl_pages;
2400 phys_pfn += lvl_pages;
2401 pteval += lvl_pages * VTD_PAGE_SIZE;
2402 sg_res -= lvl_pages;
2404 /* If the next PTE would be the first in a new page, then we
2405 need to flush the cache on the entries we've just written.
2406 And then we'll need to recalculate 'pte', so clear it and
2407 let it get set again in the if (!pte) block above.
2409 If we're done (!nr_pages) we need to flush the cache too.
2411 Also if we've been setting superpages, we may need to
2412 recalculate 'pte' and switch back to smaller pages for the
2413 end of the mapping, if the trailing size is not enough to
2414 use another superpage (i.e. sg_res < lvl_pages). */
2416 if (!nr_pages || first_pte_in_page(pte) ||
2417 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2418 domain_flush_cache(domain, first_pte,
2419 (void *)pte - (void *)first_pte);
2423 if (!sg_res && nr_pages)
2429 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2430 struct scatterlist *sg, unsigned long phys_pfn,
2431 unsigned long nr_pages, int prot)
2434 struct intel_iommu *iommu;
2436 /* Do the real mapping first */
2437 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2441 for_each_domain_iommu(iommu_id, domain) {
2442 iommu = g_iommus[iommu_id];
2443 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2449 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2450 struct scatterlist *sg, unsigned long nr_pages,
2453 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2456 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2457 unsigned long phys_pfn, unsigned long nr_pages,
2460 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2463 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2465 unsigned long flags;
2466 struct context_entry *context;
2472 spin_lock_irqsave(&iommu->lock, flags);
2473 context = iommu_context_addr(iommu, bus, devfn, 0);
2475 spin_unlock_irqrestore(&iommu->lock, flags);
2478 did_old = context_domain_id(context);
2479 context_clear_entry(context);
2480 __iommu_flush_cache(iommu, context, sizeof(*context));
2481 spin_unlock_irqrestore(&iommu->lock, flags);
2482 iommu->flush.flush_context(iommu,
2484 (((u16)bus) << 8) | devfn,
2485 DMA_CCMD_MASK_NOBIT,
2486 DMA_CCMD_DEVICE_INVL);
2487 iommu->flush.flush_iotlb(iommu,
2494 static inline void unlink_domain_info(struct device_domain_info *info)
2496 assert_spin_locked(&device_domain_lock);
2497 list_del(&info->link);
2498 list_del(&info->global);
2500 info->dev->archdata.iommu = NULL;
2503 static void domain_remove_dev_info(struct dmar_domain *domain)
2505 struct device_domain_info *info, *tmp;
2506 unsigned long flags;
2508 spin_lock_irqsave(&device_domain_lock, flags);
2509 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2510 __dmar_remove_one_dev_info(info);
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2514 struct dmar_domain *find_domain(struct device *dev)
2516 struct device_domain_info *info;
2518 if (unlikely(attach_deferred(dev) || iommu_dummy(dev)))
2521 if (dev_is_pci(dev))
2522 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2524 /* No lock here, assumes no domain exit in normal case */
2525 info = dev->archdata.iommu;
2527 return info->domain;
2532 static void do_deferred_attach(struct device *dev)
2534 struct iommu_domain *domain;
2536 dev->archdata.iommu = NULL;
2537 domain = iommu_get_domain_for_dev(dev);
2539 intel_iommu_attach_device(domain, dev);
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2545 struct device_domain_info *info;
2547 list_for_each_entry(info, &device_domain_list, global)
2548 if (info->iommu->segment == segment && info->bus == bus &&
2549 info->devfn == devfn)
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556 struct dmar_domain *domain,
2560 int flags = PASID_FLAG_SUPERVISOR_MODE;
2561 struct dma_pte *pgd = domain->pgd;
2565 * Skip top levels of page tables for iommu which has
2566 * less agaw than default. Unnecessary for PT mode.
2568 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569 pgd = phys_to_virt(dma_pte_addr(pgd));
2570 if (!dma_pte_present(pgd))
2574 level = agaw_to_level(agaw);
2575 if (level != 4 && level != 5)
2578 flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2580 return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581 domain->iommu_did[iommu->seq_id],
/*
 * dmar_insert_one_dev_info - allocate a device_domain_info for @dev and
 * attach it (and the device) to @domain under @iommu.
 *
 * Returns the domain actually bound to the device.  If a racing caller
 * already attached the device, the pre-existing domain is returned and
 * the caller must free the domain it passed in (see comment at the
 * "found" unlock path below).
 *
 * NOTE(review): this extract elides several original lines (parameter
 * list, error-return statements, closing braces); comments describe only
 * what is visible here.
 */
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2588 struct dmar_domain *domain)
2590 struct dmar_domain *found = NULL;
2591 struct device_domain_info *info;
2592 unsigned long flags;
2595 info = alloc_devinfo_mem();
/* Initialize the per-device info before probing capabilities. */
2600 info->devfn = devfn;
2601 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2605 info->domain = domain;
2606 info->iommu = iommu;
2607 info->pasid_table = NULL;
2608 info->auxd_enabled = 0;
2609 INIT_LIST_HEAD(&info->auxiliary_domains);
2611 if (dev && dev_is_pci(dev)) {
2612 struct pci_dev *pdev = to_pci_dev(info->dev);
/*
 * ATS is only advertised for trusted devices, when globally enabled,
 * when the IOMMU supports device-IOTLB and a matching ATSR unit exists.
 */
2614 if (!pdev->untrusted &&
2615 !pci_ats_disabled() &&
2616 ecap_dev_iotlb_support(iommu->ecap) &&
2617 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618 dmar_find_matched_atsr_unit(pdev))
2619 info->ats_supported = 1;
/* PASID/PRI probing only matters in scalable mode. */
2621 if (sm_supported(iommu)) {
2622 if (pasid_supported(iommu)) {
2623 int features = pci_pasid_features(pdev);
/* Low bit marks "supported"; upper bits carry the PCI PASID features. */
2625 info->pasid_supported = features | 1;
2628 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630 info->pri_supported = 1;
2634 spin_lock_irqsave(&device_domain_lock, flags);
/* Re-check under the lock: someone may have attached the device already. */
2636 found = find_domain(dev);
2639 struct device_domain_info *info2;
2640 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2642 found = info2->domain;
2648 spin_unlock_irqrestore(&device_domain_lock, flags);
2649 free_devinfo_mem(info);
2650 /* Caller must free the original domain */
2654 spin_lock(&iommu->lock);
2655 ret = domain_attach_iommu(domain, iommu);
2656 spin_unlock(&iommu->lock);
/* Attach failed: drop the lock and the half-built info. */
2659 spin_unlock_irqrestore(&device_domain_lock, flags);
2660 free_devinfo_mem(info);
/* Success: publish the binding on the domain and global lists. */
2664 list_add(&info->link, &domain->devices);
2665 list_add(&info->global, &device_domain_list);
2667 dev->archdata.iommu = info;
2668 spin_unlock_irqrestore(&device_domain_lock, flags);
2670 /* PASID table is mandatory for a PCI device in scalable mode. */
2671 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672 ret = intel_pasid_alloc_table(dev);
2674 dev_err(dev, "PASID table allocation failed\n");
2675 dmar_remove_one_dev_info(dev);
2679 /* Setup the PASID entry for requests without PASID: */
2680 spin_lock(&iommu->lock);
2681 if (hw_pass_through && domain_type_is_si(domain))
2682 ret = intel_pasid_setup_pass_through(iommu, domain,
2683 dev, PASID_RID2PASID);
2684 else if (domain_use_first_level(domain))
2685 ret = domain_setup_first_level(iommu, domain, dev,
2688 ret = intel_pasid_setup_second_level(iommu, domain,
2689 dev, PASID_RID2PASID);
2690 spin_unlock(&iommu->lock);
2692 dev_err(dev, "Setup RID2PASID failed\n");
2693 dmar_remove_one_dev_info(dev);
/* Finally program the context entry; undo the binding on failure. */
2698 if (dev && domain_context_mapping(domain, dev)) {
2699 dev_err(dev, "Domain context map failed\n");
2700 dmar_remove_one_dev_info(dev);
/*
 * pci_for_each_dma_alias() callback: remember each alias in *opaque so
 * that after iteration *opaque holds the last (topmost) DMA alias.
 */
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2709 *(u16 *)opaque = alias;
/*
 * find_or_alloc_domain - return the dmar_domain for @dev, reusing the
 * domain of the device's topmost PCI DMA alias if one exists, otherwise
 * allocating and initializing a fresh domain with guest address width
 * @gaw.  NOTE(review): error-return lines are elided in this extract.
 */
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2715 struct device_domain_info *info;
2716 struct dmar_domain *domain = NULL;
2717 struct intel_iommu *iommu;
2719 unsigned long flags;
2722 iommu = device_to_iommu(dev, &bus, &devfn);
2726 if (dev_is_pci(dev)) {
2727 struct pci_dev *pdev = to_pci_dev(dev);
/* Find the topmost alias; requests from @dev appear with this RID. */
2729 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2731 spin_lock_irqsave(&device_domain_lock, flags);
2732 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733 PCI_BUS_NUM(dma_alias),
2736 iommu = info->iommu;
2737 domain = info->domain;
2739 spin_unlock_irqrestore(&device_domain_lock, flags);
2741 /* DMA alias already has a domain, use it */
2746 /* Allocate and initialize new domain for the device */
2747 domain = alloc_domain(0);
2750 if (domain_init(domain, iommu, gaw)) {
2751 domain_exit(domain);
/*
 * set_domain_for_dev - bind @domain to @dev (and, for PCI devices whose
 * request ID differs from their DMA alias, to the alias as well).
 * Returns the domain actually bound; if an insert raced and returned a
 * different domain, the caller must notice (tmp != domain) and back out.
 */
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760 struct dmar_domain *domain)
2762 struct intel_iommu *iommu;
2763 struct dmar_domain *tmp;
2764 u16 req_id, dma_alias;
2767 iommu = device_to_iommu(dev, &bus, &devfn);
2771 req_id = ((u16)bus << 8) | devfn;
2773 if (dev_is_pci(dev)) {
2774 struct pci_dev *pdev = to_pci_dev(dev);
2776 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2778 /* register PCI DMA alias device */
2779 if (req_id != dma_alias) {
2780 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781 dma_alias & 0xff, NULL, domain);
2783 if (!tmp || tmp != domain)
/* Now register the device itself. */
2788 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789 if (!tmp || tmp != domain)
/*
 * iommu_domain_identity_map - 1:1 map the physical range [start, end]
 * into @domain: reserve the matching IOVA range, then install PTEs with
 * IOVA == physical address.
 */
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796 unsigned long long start,
2797 unsigned long long end)
2799 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
/* Reserve the IOVA range so the allocator never hands it out for DMA. */
2802 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803 dma_to_mm_pfn(last_vpfn))) {
2804 pr_err("Reserving iova failed\n");
2808 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2810 * RMRR range might have overlap with physical memory range,
/* Clear any pre-existing PTEs before installing the identity mapping. */
2813 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2815 return __domain_mapping(domain, first_vpfn, NULL,
2816 first_vpfn, last_vpfn - first_vpfn + 1,
2817 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * domain_prepare_identity_map - validate and install an identity mapping
 * for an RMRR-style range [start, end] on behalf of @dev.  Skips the work
 * for hardware passthrough on the static identity domain, and WARNs on
 * BIOS-provided ranges that are inverted or exceed the domain's address
 * width.  NOTE(review): some return statements are elided in this extract.
 */
2820 static int domain_prepare_identity_map(struct device *dev,
2821 struct dmar_domain *domain,
2822 unsigned long long start,
2823 unsigned long long end)
2825 /* For _hardware_ passthrough, don't bother. But for software
2826 passthrough, we do it anyway -- it may indicate a memory
2827 range which is reserved in E820, so which didn't get set
2828 up to start with in si_domain */
2829 if (domain == si_domain && hw_pass_through) {
2830 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2835 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
/* Inverted range: complain loudly, this is a firmware bug. */
2838 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840 dmi_get_system_info(DMI_BIOS_VENDOR),
2841 dmi_get_system_info(DMI_BIOS_VERSION),
2842 dmi_get_system_info(DMI_PRODUCT_VERSION));
/* Range beyond what the domain's AGAW can address: also a firmware bug. */
2846 if (end >> agaw_to_width(domain->agaw)) {
2847 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849 agaw_to_width(domain->agaw),
2850 dmi_get_system_info(DMI_BIOS_VENDOR),
2851 dmi_get_system_info(DMI_BIOS_VERSION),
2852 dmi_get_system_info(DMI_PRODUCT_VERSION));
2856 return iommu_domain_identity_map(domain, start, end);
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * si_domain_init - create the global static-identity (passthrough)
 * domain.  For software passthrough (@hw == 0) every usable physical
 * memory range and every RMRR is identity-mapped into it; for hardware
 * passthrough the mappings are unnecessary (handled elsewhere).
 */
2861 static int __init si_domain_init(int hw)
2863 struct dmar_rmrr_unit *rmrr;
2867 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2871 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872 domain_exit(si_domain);
/* Identity-map all present physical memory, node by node. */
2879 for_each_online_node(nid) {
2880 unsigned long start_pfn, end_pfn;
2883 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884 ret = iommu_domain_identity_map(si_domain,
2885 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2892 * Identity map the RMRRs so that devices with RMRRs could also use
2895 for_each_rmrr_units(rmrr) {
2896 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2898 unsigned long long start = rmrr->base_address;
2899 unsigned long long end = rmrr->end_address;
/* Reject malformed RMRRs instead of mapping garbage. */
2901 if (WARN_ON(end < start ||
2902 end >> agaw_to_width(si_domain->agaw)))
2905 ret = iommu_domain_identity_map(si_domain, start, end);
/*
 * identity_mapping - return non-zero if @dev is currently attached to
 * the static identity (passthrough) domain.
 */
2914 static int identity_mapping(struct device *dev)
2916 struct device_domain_info *info;
2918 info = dev->archdata.iommu;
2920 return (info->domain == si_domain);
/*
 * domain_add_dev_info - attach @dev to @domain; fails if the insert
 * raced and bound the device to some other domain.
 */
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2927 struct dmar_domain *ndomain;
2928 struct intel_iommu *iommu;
2931 iommu = device_to_iommu(dev, &bus, &devfn);
2935 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936 if (ndomain != domain)
/*
 * device_has_rmrr - true if any RMRR's device scope covers @dev,
 * either directly or via a PCI bridge above it.
 */
2942 static bool device_has_rmrr(struct device *dev)
2944 struct dmar_rmrr_unit *rmrr;
2949 for_each_rmrr_units(rmrr) {
2951 * Return TRUE if this RMRR contains the device that
2954 for_each_active_dev_scope(rmrr->devices,
2955 rmrr->devices_cnt, i, tmp)
2957 is_downstream_to_pci_bridge(dev, tmp)) {
2967 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968 * is relaxable (ie. is allowed to be not enforced under some conditions)
2969 * @dev: device handle
2971 * We assume that PCI USB devices with RMRRs have them largely
2972 * for historical reasons and that the RMRR space is not actively used post
2973 * boot. This exclusion may change if vendors begin to abuse it.
2975 * The same exception is made for graphics devices, with the requirement that
2976 * any use of the RMRR regions will be torn down before assigning the device
2979 * Return: true if the RMRR is relaxable, false otherwise
/* See the kernel-doc comment above: USB and GFX RMRRs are treated as
 * historical/bootstrap-only and need not be enforced. */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2983 struct pci_dev *pdev;
/* Non-PCI devices cannot match the relaxable classes. */
2985 if (!dev_is_pci(dev))
2988 pdev = to_pci_dev(dev);
2989 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2996 * There are a couple cases where we need to restrict the functionality of
2997 * devices associated with RMRRs. The first is when evaluating a device for
2998 * identity mapping because problems exist when devices are moved in and out
2999 * of domains and their respective RMRR information is lost. This means that
3000 * a device with associated RMRRs will never be in a "passthrough" domain.
3001 * The second is use of the device through the IOMMU API. This interface
3002 * expects to have full control of the IOVA space for the device. We cannot
3003 * satisfy both the requirement that RMRR access is maintained and have an
3004 * unencumbered IOVA space. We also have no ability to quiesce the device's
3005 * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006 * We therefore prevent devices associated with an RMRR from participating in
3007 * the IOMMU API, which eliminates them from device assignment.
3009 * In both cases, devices which have relaxable RMRRs are not concerned by this
3010 * restriction. See device_rmrr_is_relaxable comment.
/* A device is "RMRR locked" when it has an RMRR that is not relaxable;
 * such devices are excluded from passthrough and IOMMU-API assignment. */
3012 static bool device_is_rmrr_locked(struct device *dev)
3014 if (!device_has_rmrr(dev))
3017 if (device_rmrr_is_relaxable(dev))
3024 * Return the required default domain type for a specific device.
3026 * @dev: the device in query
3027 * @startup: true if this is during early boot
3030 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
3032 * - 0: both identity and dynamic domains work for this device
/*
 * device_def_domain_type - pick the required default domain type for
 * @dev: IOMMU_DOMAIN_DMA, IOMMU_DOMAIN_IDENTITY, or 0 when either works
 * (see the kernel-doc comment above this function).
 */
3034 static int device_def_domain_type(struct device *dev)
3036 if (dev_is_pci(dev)) {
3037 struct pci_dev *pdev = to_pci_dev(dev);
3040 * Prevent any device marked as untrusted from getting
3041 * placed into the statically identity mapping domain.
3043 if (pdev->untrusted)
3044 return IOMMU_DOMAIN_DMA;
/* Command-line/quirk forced identity mapping for Azalia audio and GFX. */
3046 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047 return IOMMU_DOMAIN_IDENTITY;
3049 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050 return IOMMU_DOMAIN_IDENTITY;
3053 * We want to start off with all devices in the 1:1 domain, and
3054 * take them out later if we find they can't access all of memory.
3056 * However, we can't do this for PCI devices behind bridges,
3057 * because all PCI devices behind the same bridge will end up
3058 * with the same source-id on their transactions.
3060 * Practically speaking, we can't change things around for these
3061 * devices at run-time, because we can't be sure there'll be no
3062 * DMA transactions in flight for any of their siblings.
3064 * So PCI devices (unless they're on the root bus) as well as
3065 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066 * the 1:1 domain, just in _case_ one of their siblings turns out
3067 * not to be able to map all of memory.
3069 if (!pci_is_pcie(pdev)) {
3070 if (!pci_is_root_bus(pdev->bus))
3071 return IOMMU_DOMAIN_DMA;
3072 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073 return IOMMU_DOMAIN_DMA;
3074 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075 return IOMMU_DOMAIN_DMA;
/*
 * intel_iommu_init_qi - bring the invalidation machinery to a sane state
 * and select queued invalidation (QI) when available, falling back to
 * register-based invalidation otherwise.
 */
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3084 * Start from the sane iommu hardware state.
3085 * If the queued invalidation is already initialized by us
3086 * (for example, while enabling interrupt-remapping) then
3087 * we got the things already rolling from a sane state.
3091 * Clear any previous faults.
3093 dmar_fault(-1, iommu);
3095 * Disable queued invalidation if supported and already enabled
3096 * before OS handover.
3098 dmar_disable_qi(iommu);
3101 if (dmar_enable_qi(iommu)) {
3103 * Queued Invalidate not enabled, use Register Based Invalidate
3105 iommu->flush.flush_context = __iommu_flush_context;
3106 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107 pr_info("%s: Using Register based invalidation\n",
/* QI enabled successfully: install the queued-invalidation callbacks. */
3110 iommu->flush.flush_context = qi_flush_context;
3111 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112 pr_info("%s: Using Queued invalidation\n", iommu->name);
/*
 * copy_context_table - (kdump path) copy one bus's context table from the
 * previous kernel's root entry @old_re into freshly allocated pages,
 * storing them in @tbl.  @ext selects the extended (ECS) context format,
 * which uses two table slots per bus and two entries per devfn.
 *
 * NOTE(review): loop structure and several error paths are elided in
 * this extract; comments cover only the visible statements.
 */
3116 static int copy_context_table(struct intel_iommu *iommu,
3117 struct root_entry *old_re,
3118 struct context_entry **tbl,
3121 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122 struct context_entry *new_ce = NULL, ce;
3123 struct context_entry *old_ce = NULL;
3124 struct root_entry re;
3125 phys_addr_t old_ce_phys;
3127 tbl_idx = ext ? bus * 2 : bus;
/* Snapshot the old root entry; the old table is only memremap'd later. */
3128 memcpy(&re, old_re, sizeof(re));
3130 for (devfn = 0; devfn < 256; devfn++) {
3131 /* First calculate the correct index */
3132 idx = (ext ? devfn * 2 : devfn) % 256;
3135 /* First save what we may have and clean up */
3137 tbl[tbl_idx] = new_ce;
3138 __iommu_flush_cache(iommu, new_ce,
/* Lower vs. upper context-table pointer, depending on half/format. */
3148 old_ce_phys = root_entry_lctp(&re);
3150 old_ce_phys = root_entry_uctp(&re);
3153 if (ext && devfn == 0) {
3154 /* No LCTP, try UCTP */
3163 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3168 new_ce = alloc_pgtable_page(iommu->node);
3175 /* Now copy the context entry */
3176 memcpy(&ce, old_ce + idx, sizeof(ce));
3178 if (!__context_present(&ce))
/* Preserve the old kernel's domain-id allocation so IDs don't collide. */
3181 did = context_domain_id(&ce);
3182 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183 set_bit(did, iommu->domain_ids);
3186 * We need a marker for copied context entries. This
3187 * marker needs to work for the old format as well as
3188 * for extended context entries.
3190 * Bit 67 of the context entry is used. In the old
3191 * format this bit is available to software, in the
3192 * extended format it is the PGE bit, but PGE is ignored
3193 * by HW if PASIDs are disabled (and thus still
3196 * So disable PASIDs first and then mark the entry
3197 * copied. This means that we don't copy PASID
3198 * translations from the old kernel, but this is fine as
3199 * faults there are not fatal.
3201 context_clear_pasid_enable(&ce);
3202 context_set_copied(&ce);
3207 tbl[tbl_idx + pos] = new_ce;
3209 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
/*
 * copy_translation_tables - (kdump path) copy the previous kernel's root
 * and context tables for @iommu so in-flight DMA keeps working while the
 * dump kernel boots.  Bails out if the old and new root-table formats
 * (RTT bit vs. ECS capability) disagree, since flipping RTT would require
 * disabling translation.
 */
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3220 struct context_entry **ctxt_tbls;
3221 struct root_entry *old_rt;
3222 phys_addr_t old_rt_phys;
3223 int ctxt_table_entries;
3224 unsigned long flags;
3229 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231 new_ext = !!ecap_ecs(iommu->ecap);
3234 * The RTT bit can only be changed when translation is disabled,
3235 * but disabling translation means to open a window for data
3236 * corruption. So bail out and don't copy anything if we would
3237 * have to change the bit.
3242 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3246 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3250 /* This is too big for the stack - allocate it from slab */
3251 ctxt_table_entries = ext ? 512 : 256;
3253 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3257 for (bus = 0; bus < 256; bus++) {
3258 ret = copy_context_table(iommu, &old_rt[bus],
3259 ctxt_tbls, bus, ext);
3261 pr_err("%s: Failed to copy context table for bus %d\n",
3267 spin_lock_irqsave(&iommu->lock, flags);
3269 /* Context tables are copied, now write them to the root_entry table */
3270 for (bus = 0; bus < 256; bus++) {
3271 int idx = ext ? bus * 2 : bus;
3274 if (ctxt_tbls[idx]) {
/* Bit 0 is the present bit in a root entry. */
3275 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276 iommu->root_entry[bus].lo = val;
3279 if (!ext || !ctxt_tbls[idx + 1])
3282 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283 iommu->root_entry[bus].hi = val;
3286 spin_unlock_irqrestore(&iommu->lock, flags);
/* Make the new root table visible to non-coherent hardware. */
3290 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
/*
 * init_dmars - one-time boot initialization of all DMAR units: allocate
 * the global iommu array, set up invalidation, domains and root entries
 * (copying tables from a previous kernel when pre-enabled), build the
 * static identity domain, then enable faults/PRQ/translation per iommu.
 *
 * NOTE(review): many lines (locals, error labels, enable calls) are
 * elided in this extract.
 */
3300 static int __init init_dmars(void)
3302 struct dmar_drhd_unit *drhd;
3303 struct intel_iommu *iommu;
3309 * initialize and program root entry to not present
3312 for_each_drhd_unit(drhd) {
3314 * lock not needed as this is only incremented in the single
3315 * threaded kernel __init code path all other access are read
3318 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3322 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3325 /* Preallocate enough resources for IOMMU hot-addition */
3326 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3329 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3332 pr_err("Allocating global iommu array failed\n");
3337 for_each_iommu(iommu, drhd) {
3338 if (drhd->ignored) {
3339 iommu_disable_translation(iommu);
3344 * Find the max pasid size of all IOMMU's in the system.
3345 * We need to ensure the system pasid table is no bigger
3346 * than the smallest supported.
3348 if (pasid_supported(iommu)) {
3349 u32 temp = 2 << ecap_pss(iommu->ecap);
3351 intel_pasid_max_id = min_t(u32, temp,
3352 intel_pasid_max_id);
3355 g_iommus[iommu->seq_id] = iommu;
3357 intel_iommu_init_qi(iommu);
3359 ret = iommu_init_domains(iommu);
3363 init_translation_status(iommu);
/* Translation left on by firmware outside kdump is unexpected: reset it. */
3365 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366 iommu_disable_translation(iommu);
3367 clear_translation_pre_enabled(iommu);
3368 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3374 * we could share the same root & context tables
3375 * among all IOMMU's. Need to Split it later.
3377 ret = iommu_alloc_root_entry(iommu);
3381 if (translation_pre_enabled(iommu)) {
3382 pr_info("Translation already enabled - trying to copy translation structures\n");
3384 ret = copy_translation_tables(iommu);
3387 * We found the IOMMU with translation
3388 * enabled - but failed to copy over the
3389 * old root-entry table. Try to proceed
3390 * by disabling translation now and
3391 * allocating a clean root-entry table.
3392 * This might cause DMAR faults, but
3393 * probably the dump will still succeed.
3395 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3397 iommu_disable_translation(iommu);
3398 clear_translation_pre_enabled(iommu);
3400 pr_info("Copied translation tables from previous kernel for %s\n",
/* Hardware passthrough is only usable if every iommu supports it. */
3405 if (!ecap_pass_through(iommu->ecap))
3406 hw_pass_through = 0;
3407 intel_svm_check(iommu);
3411 * Now that qi is enabled on all iommus, set the root entry and flush
3412 * caches. This is required on some Intel X58 chipsets, otherwise the
3413 * flush_context function will loop forever and the boot hangs.
3415 for_each_active_iommu(iommu, drhd) {
3416 iommu_flush_write_buffer(iommu);
3417 iommu_set_root_entry(iommu);
3418 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3427 iommu_identity_mapping |= IDENTMAP_GFX;
3429 check_tylersburg_isoch();
3431 ret = si_domain_init(hw_pass_through);
3438 * global invalidate context cache
3439 * global invalidate iotlb
3440 * enable translation
3442 for_each_iommu(iommu, drhd) {
3443 if (drhd->ignored) {
3445 * we always have to disable PMRs or DMA may fail on
3449 iommu_disable_protect_mem_regions(iommu);
3453 iommu_flush_write_buffer(iommu);
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3458 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3459 * could cause possible lock race condition.
3461 up_write(&dmar_global_lock);
3462 ret = intel_svm_enable_prq(iommu);
3463 down_write(&dmar_global_lock);
3468 ret = dmar_set_interrupt(iommu);
/* Error unwind: tear down every active iommu. */
3476 for_each_active_iommu(iommu, drhd) {
3477 disable_dmar_iommu(iommu);
3478 free_dmar_iommu(iommu);
3487 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * intel_alloc_iova - allocate @nrpages of IOVA space for @dev in
 * @domain, honoring @dma_mask.  Prefers a sub-4GiB allocation first
 * (unless forcedac), and returns the allocated IOVA pfn (0 on failure).
 */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489 struct dmar_domain *domain,
3490 unsigned long nrpages, uint64_t dma_mask)
3492 unsigned long iova_pfn;
3495 * Restrict dma_mask to the width that the iommu can handle.
3496 * First-level translation restricts the input-address to a
3497 * canonical address (i.e., address bits 63:N have the same
3498 * value as address bit [N-1], where N is 48-bits with 4-level
3499 * paging and 57-bits with 5-level paging). Hence, skip bit
3502 if (domain_use_first_level(domain))
3503 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3506 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3509 /* Ensure we reserve the whole size-aligned region */
3510 nrpages = __roundup_pow_of_two(nrpages);
3512 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3514 * First try to allocate an io virtual address in
3515 * DMA_BIT_MASK(32) and if that fails then try allocating
3518 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519 IOVA_PFN(DMA_BIT_MASK(32)), false);
/* Fall back to the full mask, flushing the rcache this time. */
3523 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524 IOVA_PFN(dma_mask), true);
3525 if (unlikely(!iova_pfn)) {
3526 dev_err_once(dev, "Allocating %ld-page iova failed\n",
/*
 * get_private_domain_for_dev - find or create a private DMA-API domain
 * for @dev, install its RMRR identity mappings, and bind it to the
 * device.  NOTE(review): locking lines and some returns are elided.
 */
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3536 struct dmar_domain *domain, *tmp;
3537 struct dmar_rmrr_unit *rmrr;
3538 struct device *i_dev;
3541 /* Device shouldn't be attached by any domains. */
3542 domain = find_domain(dev);
3546 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3550 /* We have a new domain - setup possible RMRRs for the device */
3552 for_each_rmrr_units(rmrr) {
3553 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3558 ret = domain_prepare_identity_map(dev, domain,
3562 dev_err(dev, "Mapping reserved region failed\n");
3567 tmp = set_domain_for_dev(dev, domain);
/* Lost the race or failed to bind: discard the freshly built domain. */
3568 if (!tmp || domain != tmp) {
3569 domain_exit(domain);
3575 dev_err(dev, "Allocating domain failed\n");
3577 domain->domain.type = IOMMU_DOMAIN_DMA;
3582 /* Check if the dev needs to go through non-identity map and unmap process.*/
/*
 * Returns true when DMA for @dev must be translated by the IOMMU.
 * A device in the identity domain whose DMA mask cannot reach all of
 * memory is migrated here to a dynamic (translated) domain.
 */
3583 static bool iommu_need_mapping(struct device *dev)
3587 if (iommu_dummy(dev))
3590 if (unlikely(attach_deferred(dev)))
3591 do_deferred_attach(dev);
3593 ret = identity_mapping(dev);
3595 u64 dma_mask = *dev->dma_mask;
/* Use the more restrictive of streaming and coherent masks. */
3597 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598 dma_mask = dev->coherent_dma_mask;
3600 if (dma_mask >= dma_direct_get_required_mask(dev))
3604 * 32 bit DMA is removed from si_domain and fall back to
3605 * non-identity mapping.
3607 dmar_remove_one_dev_info(dev);
3608 ret = iommu_request_dma_domain_for_dev(dev);
3610 struct iommu_domain *domain;
3611 struct dmar_domain *dmar_domain;
3613 domain = iommu_get_domain_for_dev(dev);
3615 dmar_domain = to_dmar_domain(domain);
3616 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3618 dmar_remove_one_dev_info(dev);
3619 get_private_domain_for_dev(dev);
3622 dev_info(dev, "32bit DMA uses non-identity mapping\n");
/*
 * __intel_map_single - core streaming-DMA map: allocate IOVA space for
 * [paddr, paddr+size), install PTEs with protections derived from @dir,
 * and return the resulting bus address (DMA_MAPPING_ERROR on failure).
 */
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629 size_t size, int dir, u64 dma_mask)
3631 struct dmar_domain *domain;
3632 phys_addr_t start_paddr;
3633 unsigned long iova_pfn;
3636 struct intel_iommu *iommu;
3637 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3639 BUG_ON(dir == DMA_NONE);
3641 domain = find_domain(dev);
3643 return DMA_MAPPING_ERROR;
3645 iommu = domain_get_iommu(domain);
3646 size = aligned_nrpages(paddr, size);
3648 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3653 * Check if DMAR supports zero-length reads on write only
3656 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657 !cap_zlr(iommu->cap))
3658 prot |= DMA_PTE_READ;
3659 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660 prot |= DMA_PTE_WRITE;
3662 * paddr - (paddr + size) might be partial page, we should map the whole
3663 * page. Note: if two part of one page are separately mapped, we
3664 * might have two guest_addr mapping to the same host paddr, but this
3665 * is not a big problem
3667 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668 mm_to_dma_pfn(paddr_pfn), size, prot);
/* Re-apply the sub-page offset that the page-granular mapping dropped. */
3672 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673 start_paddr += paddr & ~PAGE_MASK;
3675 trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
/* Error path: return the IOVA range before reporting failure. */
3681 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683 size, (unsigned long long)paddr, dir);
3684 return DMA_MAPPING_ERROR;
/*
 * dma_map_ops .map_page: translate through the IOMMU when required,
 * otherwise defer to the direct-mapping path.
 */
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688 unsigned long offset, size_t size,
3689 enum dma_data_direction dir,
3690 unsigned long attrs)
3692 if (iommu_need_mapping(dev))
3693 return __intel_map_single(dev, page_to_phys(page) + offset,
3694 size, dir, *dev->dma_mask);
3695 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
/*
 * dma_map_ops .map_resource: like .map_page but for raw physical
 * (MMIO) addresses; direct path when no translation is needed.
 */
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699 size_t size, enum dma_data_direction dir,
3700 unsigned long attrs)
3702 if (iommu_need_mapping(dev))
3703 return __intel_map_single(dev, phys_addr, size, dir,
3705 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
/*
 * intel_unmap - core streaming-DMA unmap: tear down the PTEs for
 * [dev_addr, dev_addr+size) and either flush the IOTLB synchronously
 * (strict mode / untrusted device / no flush queue) or defer the flush
 * and IOVA release to the flush queue.
 */
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3710 struct dmar_domain *domain;
3711 unsigned long start_pfn, last_pfn;
3712 unsigned long nrpages;
3713 unsigned long iova_pfn;
3714 struct intel_iommu *iommu;
3715 struct page *freelist;
3716 struct pci_dev *pdev = NULL;
3718 domain = find_domain(dev);
3721 iommu = domain_get_iommu(domain);
3723 iova_pfn = IOVA_PFN(dev_addr);
3725 nrpages = aligned_nrpages(dev_addr, size);
3726 start_pfn = mm_to_dma_pfn(iova_pfn);
3727 last_pfn = start_pfn + nrpages - 1;
3729 if (dev_is_pci(dev))
3730 pdev = to_pci_dev(dev);
/* domain_unmap returns the freed page-table pages for later release. */
3732 freelist = domain_unmap(domain, start_pfn, last_pfn);
3733 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734 !has_iova_flush_queue(&domain->iovad)) {
3735 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736 nrpages, !freelist, 0);
3738 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739 dma_free_pagelist(freelist);
3741 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742 (unsigned long)freelist);
3744 * queue up the release of the unmap to save the 1/6th of the
3745 * cpu used up by the iotlb flush operation...
3749 trace_unmap_single(dev, dev_addr, size);
/* dma_map_ops .unmap_page: IOMMU teardown or direct-path unmap. */
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753 size_t size, enum dma_data_direction dir,
3754 unsigned long attrs)
3756 if (iommu_need_mapping(dev))
3757 intel_unmap(dev, dev_addr, size);
3759 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
/* dma_map_ops .unmap_resource: only IOMMU-mapped resources need work. */
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763 size_t size, enum dma_data_direction dir, unsigned long attrs)
3765 if (iommu_need_mapping(dev))
3766 intel_unmap(dev, dev_addr, size);
/*
 * dma_map_ops .alloc: allocate coherent memory (CMA first when blocking
 * is allowed, then the page allocator), zero it, and map it through the
 * IOMMU; falls back to the direct allocator when no translation is needed.
 */
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770 dma_addr_t *dma_handle, gfp_t flags,
3771 unsigned long attrs)
3773 struct page *page = NULL;
3776 if (!iommu_need_mapping(dev))
3777 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3779 size = PAGE_ALIGN(size);
3780 order = get_order(size);
3782 if (gfpflags_allow_blocking(flags)) {
3783 unsigned int count = size >> PAGE_SHIFT;
3785 page = dma_alloc_from_contiguous(dev, count, order,
3786 flags & __GFP_NOWARN);
3790 page = alloc_pages(flags, order);
3793 memset(page_address(page), 0, size);
3795 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3797 dev->coherent_dma_mask);
3798 if (*dma_handle != DMA_MAPPING_ERROR)
3799 return page_address(page);
/* Mapping failed: release whichever allocator provided the pages. */
3800 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801 __free_pages(page, order);
/*
 * dma_map_ops .free: unmap the coherent buffer from the IOMMU and return
 * the pages to CMA or the page allocator; direct path otherwise.
 */
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807 dma_addr_t dma_handle, unsigned long attrs)
3810 struct page *page = virt_to_page(vaddr);
3812 if (!iommu_need_mapping(dev))
3813 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3815 size = PAGE_ALIGN(size);
3816 order = get_order(size);
3818 intel_unmap(dev, dma_handle, size);
3819 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820 __free_pages(page, order);
/*
 * dma_map_ops .unmap_sg: the scatterlist was mapped as one contiguous
 * IOVA range, so sum the per-entry page counts and unmap once.
 */
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824 int nelems, enum dma_data_direction dir,
3825 unsigned long attrs)
3827 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828 unsigned long nrpages = 0;
3829 struct scatterlist *sg;
3832 if (!iommu_need_mapping(dev))
3833 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3835 for_each_sg(sglist, sg, nelems, i) {
3836 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3839 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3841 trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
/*
 * dma_map_ops .map_sg: allocate one IOVA range big enough for the whole
 * scatterlist and map every entry into it; on failure, free any partial
 * page tables and the IOVA range.
 */
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845 enum dma_data_direction dir, unsigned long attrs)
3848 struct dmar_domain *domain;
3851 unsigned long iova_pfn;
3853 struct scatterlist *sg;
3854 unsigned long start_vpfn;
3855 struct intel_iommu *iommu;
3857 BUG_ON(dir == DMA_NONE);
3858 if (!iommu_need_mapping(dev))
3859 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3861 domain = find_domain(dev);
3865 iommu = domain_get_iommu(domain);
/* Total pages needed across all sg entries. */
3867 for_each_sg(sglist, sg, nelems, i)
3868 size += aligned_nrpages(sg->offset, sg->length);
3870 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3873 sglist->dma_length = 0;
3878 * Check if DMAR supports zero-length reads on write only
3881 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882 !cap_zlr(iommu->cap))
3883 prot |= DMA_PTE_READ;
3884 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885 prot |= DMA_PTE_WRITE;
3887 start_vpfn = mm_to_dma_pfn(iova_pfn);
3889 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890 if (unlikely(ret)) {
/* Undo any partially built page tables before releasing the IOVA. */
3891 dma_pte_free_pagetable(domain, start_vpfn,
3892 start_vpfn + size - 1,
3893 agaw_to_level(domain->agaw) + 1);
3894 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3898 for_each_sg(sglist, sg, nelems, i)
3899 trace_map_sg(dev, i + 1, nelems, sg);
/* dma_map_ops .get_required_mask: translated devices only need 32 bits
 * of IOVA space; direct-mapped devices need the real physical reach. */
3904 static u64 intel_get_required_mask(struct device *dev)
3906 if (!iommu_need_mapping(dev))
3907 return dma_direct_get_required_mask(dev);
3908 return DMA_BIT_MASK(32);
/* DMA-API operations used when devices are translated by the IOMMU. */
3911 static const struct dma_map_ops intel_dma_ops = {
3912 .alloc = intel_alloc_coherent,
3913 .free = intel_free_coherent,
3914 .map_sg = intel_map_sg,
3915 .unmap_sg = intel_unmap_sg,
3916 .map_page = intel_map_page,
3917 .unmap_page = intel_unmap_page,
3918 .map_resource = intel_map_resource,
3919 .unmap_resource = intel_unmap_resource,
3920 .dma_supported = dma_direct_supported,
3921 .mmap = dma_common_mmap,
3922 .get_sgtable = dma_common_get_sgtable,
3923 .get_required_mask = intel_get_required_mask,
/*
 * bounce_sync_single - sync helper for the bounce-buffer (swiotlb) path:
 * resolve the IOVA back to its physical address and, if it lands in the
 * swiotlb pool, sync the bounce slot with the original buffer.
 */
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928 enum dma_data_direction dir, enum dma_sync_target target)
3930 struct dmar_domain *domain;
3931 phys_addr_t tlb_addr;
3933 domain = find_domain(dev);
3934 if (WARN_ON(!domain))
3937 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938 if (is_swiotlb_buffer(tlb_addr))
3939 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
/*
 * bounce_map_single - map for an untrusted device: buffers that are not
 * page-aligned are first copied into a page-aligned swiotlb bounce slot
 * so the device can never see adjacent data sharing the IOMMU page.
 * Returns the bus address, or DMA_MAPPING_ERROR.
 */
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944 enum dma_data_direction dir, unsigned long attrs,
3947 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948 struct dmar_domain *domain;
3949 struct intel_iommu *iommu;
3950 unsigned long iova_pfn;
3951 unsigned long nrpages;
3952 phys_addr_t tlb_addr;
3956 if (unlikely(attach_deferred(dev)))
3957 do_deferred_attach(dev);
3959 domain = find_domain(dev);
3961 if (WARN_ON(dir == DMA_NONE || !domain))
3962 return DMA_MAPPING_ERROR;
3964 iommu = domain_get_iommu(domain);
3965 if (WARN_ON(!iommu))
3966 return DMA_MAPPING_ERROR;
3968 nrpages = aligned_nrpages(0, size);
3969 iova_pfn = intel_alloc_iova(dev, domain,
3970 dma_to_mm_pfn(nrpages), dma_mask);
3972 return DMA_MAPPING_ERROR;
3975 * Check if DMAR supports zero-length reads on write only
3978 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3979 !cap_zlr(iommu->cap))
3980 prot |= DMA_PTE_READ;
3981 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3982 prot |= DMA_PTE_WRITE;
3985 * If both the physical buffer start address and size are
3986 * page aligned, we don't need to use a bounce page.
3988 if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3989 tlb_addr = swiotlb_tbl_map_single(dev,
3990 __phys_to_dma(dev, io_tlb_start),
3991 paddr, size, aligned_size, dir, attrs);
3992 if (tlb_addr == DMA_MAPPING_ERROR) {
3995 /* Cleanup the padding area. */
3996 void *padding_start = phys_to_virt(tlb_addr);
3997 size_t padding_size = aligned_size;
/* Don't clobber data just copied toward the device; zero only the tail. */
3999 if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
4000 (dir == DMA_TO_DEVICE ||
4001 dir == DMA_BIDIRECTIONAL)) {
4002 padding_start += size;
4003 padding_size -= size;
4006 memset(padding_start, 0, padding_size);
4012 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4013 tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4017 trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4019 return (phys_addr_t)iova_pfn << PAGE_SHIFT;
/* Error unwind: release the bounce slot (if any) and the IOVA range. */
4022 if (is_swiotlb_buffer(tlb_addr))
4023 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4024 aligned_size, dir, attrs);
4026 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4027 dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4028 size, (unsigned long long)paddr, dir);
4030 return DMA_MAPPING_ERROR;
/*
 * bounce_unmap_single - tear down a (possibly bounce-buffered) DMA mapping.
 * Resolves the physical address backing @dev_addr via the device's domain,
 * unmaps the IOVA range, and releases the swiotlb slot when the backing
 * memory was a bounce buffer.
 * NOTE(review): this listing has elided lines (gaps in the inline numbering);
 * early-return bodies after the WARN_ON checks are not visible here.
 */
4034 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4035 enum dma_data_direction dir, unsigned long attrs)
/* Round up to page granularity, matching the size used at map time. */
4037 size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4038 struct dmar_domain *domain;
4039 phys_addr_t tlb_addr;
4041 domain = find_domain(dev);
4042 if (WARN_ON(!domain))
/* Translate the DMA address back to the physical (bounce) address. */
4045 tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4046 if (WARN_ON(!tlb_addr))
/* Remove the IOMMU mapping first, then free the bounce slot if any. */
4049 intel_unmap(dev, dev_addr, size);
4050 if (is_swiotlb_buffer(tlb_addr))
4051 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4052 aligned_size, dir, attrs);
4054 trace_bounce_unmap_single(dev, dev_addr, size);
/*
 * bounce_map_page - dma_map_ops ->map_page callback for untrusted devices.
 * Converts (page, offset) to a physical address and defers to
 * bounce_map_single() using the device's streaming DMA mask.
 */
4058 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4059 size_t size, enum dma_data_direction dir, unsigned long attrs)
4061 return bounce_map_single(dev, page_to_phys(page) + offset,
4062 size, dir, attrs, *dev->dma_mask);
/*
 * bounce_map_resource - dma_map_ops ->map_resource callback.
 * Maps an MMIO physical address through the same bounce-aware path as
 * normal memory, using the device's streaming DMA mask.
 */
4066 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4067 enum dma_data_direction dir, unsigned long attrs)
4069 return bounce_map_single(dev, phys_addr, size,
4070 dir, attrs, *dev->dma_mask);
/* bounce_unmap_page - thin ->unmap_page wrapper over bounce_unmap_single(). */
4074 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4075 enum dma_data_direction dir, unsigned long attrs)
4077 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
/* bounce_unmap_resource - thin ->unmap_resource wrapper over bounce_unmap_single(). */
4081 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4082 enum dma_data_direction dir, unsigned long attrs)
4084 bounce_unmap_single(dev, dev_addr, size, dir, attrs);
/*
 * bounce_unmap_sg - ->unmap_sg callback: unmap each scatterlist entry in
 * turn through bounce_unmap_page().
 */
4088 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4089 enum dma_data_direction dir, unsigned long attrs)
4091 struct scatterlist *sg;
4094 for_each_sg(sglist, sg, nelems, i)
4095 bounce_unmap_page(dev, sg->dma_address,
4096 sg_dma_len(sg), dir, attrs);
/*
 * bounce_map_sg - ->map_sg callback: map every scatterlist entry via
 * bounce_map_page(). On any failure, the entries mapped so far are rolled
 * back (the trailing bounce_unmap_sg() with DMA_ATTR_SKIP_CPU_SYNC is the
 * error path; the goto/return lines are elided in this listing).
 */
4100 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4101 enum dma_data_direction dir, unsigned long attrs)
4104 struct scatterlist *sg;
4106 for_each_sg(sglist, sg, nelems, i) {
4107 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4108 sg->offset, sg->length,
4110 if (sg->dma_address == DMA_MAPPING_ERROR)
4112 sg_dma_len(sg) = sg->length;
/* Trace each successfully mapped entry (1-based index of nelems). */
4115 for_each_sg(sglist, sg, nelems, i)
4116 trace_bounce_map_sg(dev, i + 1, nelems, sg);
/* Error path: undo the i entries already mapped, skipping CPU sync. */
4121 bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
/* bounce_sync_single_for_cpu - sync bounce data toward the CPU after device DMA. */
4126 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4127 size_t size, enum dma_data_direction dir)
4129 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
/* bounce_sync_single_for_device - sync bounce data toward the device before DMA. */
4133 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4134 size_t size, enum dma_data_direction dir)
4136 bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
/* bounce_sync_sg_for_cpu - per-entry CPU-direction sync of a mapped scatterlist. */
4140 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4141 int nelems, enum dma_data_direction dir)
4143 struct scatterlist *sg;
4146 for_each_sg(sglist, sg, nelems, i)
4147 bounce_sync_single(dev, sg_dma_address(sg),
4148 sg_dma_len(sg), dir, SYNC_FOR_CPU);
/* bounce_sync_sg_for_device - per-entry device-direction sync of a mapped scatterlist. */
4152 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4153 int nelems, enum dma_data_direction dir)
4155 struct scatterlist *sg;
4158 for_each_sg(sglist, sg, nelems, i)
4159 bounce_sync_single(dev, sg_dma_address(sg),
4160 sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
/*
 * bounce_dma_ops - dma_map_ops used for untrusted devices: streaming
 * mappings go through the bounce_* helpers (swiotlb-backed for unaligned
 * buffers) while coherent allocations reuse the regular intel_* paths.
 */
4163 static const struct dma_map_ops bounce_dma_ops = {
4164 .alloc = intel_alloc_coherent,
4165 .free = intel_free_coherent,
4166 .map_sg = bounce_map_sg,
4167 .unmap_sg = bounce_unmap_sg,
4168 .map_page = bounce_map_page,
4169 .unmap_page = bounce_unmap_page,
4170 .sync_single_for_cpu = bounce_sync_single_for_cpu,
4171 .sync_single_for_device = bounce_sync_single_for_device,
4172 .sync_sg_for_cpu = bounce_sync_sg_for_cpu,
4173 .sync_sg_for_device = bounce_sync_sg_for_device,
4174 .map_resource = bounce_map_resource,
4175 .unmap_resource = bounce_unmap_resource,
4176 .dma_supported = dma_direct_supported,
/*
 * iommu_domain_cache_init - create the slab cache for struct dmar_domain.
 * Returns 0 on success; the error-return lines are elided in this listing.
 */
4179 static inline int iommu_domain_cache_init(void)
4183 iommu_domain_cache = kmem_cache_create("iommu_domain",
4184 sizeof(struct dmar_domain),
4189 if (!iommu_domain_cache) {
4190 pr_err("Couldn't create iommu_domain cache\n");
/*
 * iommu_devinfo_cache_init - create the slab cache for
 * struct device_domain_info. Returns 0 on success.
 */
4197 static inline int iommu_devinfo_cache_init(void)
4201 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4202 sizeof(struct device_domain_info),
4206 if (!iommu_devinfo_cache) {
4207 pr_err("Couldn't create devinfo cache\n");
/*
 * iommu_init_mempool - acquire the IOVA cache and create both driver slab
 * caches. On failure of the devinfo cache, the domain cache is destroyed
 * (remaining unwind lines are elided in this listing).
 */
4214 static int __init iommu_init_mempool(void)
4217 ret = iova_cache_get();
4221 ret = iommu_domain_cache_init();
4225 ret = iommu_devinfo_cache_init();
4229 kmem_cache_destroy(iommu_domain_cache);
/* iommu_exit_mempool - destroy the slab caches created by iommu_init_mempool(). */
4236 static void __init iommu_exit_mempool(void)
4238 kmem_cache_destroy(iommu_devinfo_cache);
4239 kmem_cache_destroy(iommu_domain_cache);
/*
 * quirk_ioat_snb_local_iommu - PCI fixup for the SNB IOAT (QuickData) device.
 * Cross-checks the VT-d base address reported by the chipset (config offset
 * 0xb0) against the DRHD the BIOS assigned to the device; on mismatch the
 * firmware is lying, so taint the kernel and mark the device as having a
 * dummy (no-translation) domain.
 */
4243 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4245 struct dmar_drhd_unit *drhd;
4249 /* We know that this device on this chipset has its own IOMMU.
4250 * If we find it under a different IOMMU, then the BIOS is lying
4251 * to us. Hope that the IOMMU for this device is actually
4252 * disabled, and it needs no translation...
4254 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4256 /* "can't" happen */
4257 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
/* Keep only the base-address bits of the VTBAR register. */
4260 vtbar &= 0xffff0000;
4262 /* we know that the this iommu should be at offset 0xa000 from vtbar */
4263 drhd = dmar_find_matched_drhd_unit(pdev);
4264 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
4265 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
4266 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4267 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4270 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * init_no_remapping_devices - mark DMAR units that need no remapping.
 * Pass 1: a non-include-all DRHD whose device scope is empty is ignored.
 * Pass 2: a DRHD that covers *only* graphics devices is either bypassed
 * (devices get DUMMY_DEVICE_DOMAIN_INFO when dmar_map_gfx is clear) or
 * handled per the gfx_mapped policy (that branch is elided in this listing).
 */
4272 static void __init init_no_remapping_devices(void)
4274 struct dmar_drhd_unit *drhd;
4278 for_each_drhd_unit(drhd) {
4279 if (!drhd->include_all) {
4280 for_each_active_dev_scope(drhd->devices,
4281 drhd->devices_cnt, i, dev)
4283 /* ignore DMAR unit if no devices exist */
4284 if (i == drhd->devices_cnt)
4289 for_each_active_drhd_unit(drhd) {
4290 if (drhd->include_all)
4293 for_each_active_dev_scope(drhd->devices,
4294 drhd->devices_cnt, i, dev)
/* Any non-PCI or non-graphics device disqualifies the gfx-only path. */
4295 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4297 if (i < drhd->devices_cnt)
4300 /* This IOMMU has *only* gfx devices. Either bypass it or
4301 set the gfx_mapped flag, as appropriate */
4302 if (!dmar_map_gfx) {
4304 for_each_active_dev_scope(drhd->devices,
4305 drhd->devices_cnt, i, dev)
4306 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4311 #ifdef CONFIG_SUSPEND
/*
 * init_iommu_hw - (re)program every IOMMU after resume: re-enable queued
 * invalidation, reinstall the root table, flush context and IOTLB caches
 * globally, then re-enable translation. Ignored units only get their
 * protected memory regions (PMRs) disabled.
 */
4312 static int init_iommu_hw(void)
4314 struct dmar_drhd_unit *drhd;
4315 struct intel_iommu *iommu = NULL;
4317 for_each_active_iommu(iommu, drhd)
4319 dmar_reenable_qi(iommu);
4321 for_each_iommu(iommu, drhd) {
4322 if (drhd->ignored) {
4324 * we always have to disable PMRs or DMA may fail on
4328 iommu_disable_protect_mem_regions(iommu);
4332 iommu_flush_write_buffer(iommu);
4334 iommu_set_root_entry(iommu);
4336 iommu->flush.flush_context(iommu, 0, 0, 0,
4337 DMA_CCMD_GLOBAL_INVL);
4338 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4339 iommu_enable_translation(iommu);
4340 iommu_disable_protect_mem_regions(iommu);
/*
 * iommu_flush_all - global context-cache and IOTLB invalidation on every
 * active IOMMU (used around suspend to push out cached translations).
 */
4346 static void iommu_flush_all(void)
4348 struct dmar_drhd_unit *drhd;
4349 struct intel_iommu *iommu;
4351 for_each_active_iommu(iommu, drhd) {
4352 iommu->flush.flush_context(iommu, 0, 0, 0,
4353 DMA_CCMD_GLOBAL_INVL);
4354 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4355 DMA_TLB_GLOBAL_FLUSH);
/*
 * iommu_suspend - syscore suspend hook: allocate per-IOMMU register save
 * areas, disable translation, and snapshot the fault-event registers
 * (FECTL/FEDATA/FEADDR/FEUADDR) under register_lock so they can be
 * restored by iommu_resume(). The trailing loop frees the save areas on
 * the (elided) allocation-failure unwind path.
 */
4359 static int iommu_suspend(void)
4361 struct dmar_drhd_unit *drhd;
4362 struct intel_iommu *iommu = NULL;
4365 for_each_active_iommu(iommu, drhd) {
4366 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4368 if (!iommu->iommu_state)
4374 for_each_active_iommu(iommu, drhd) {
4375 iommu_disable_translation(iommu);
4377 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4379 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4380 readl(iommu->reg + DMAR_FECTL_REG);
4381 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4382 readl(iommu->reg + DMAR_FEDATA_REG);
4383 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4384 readl(iommu->reg + DMAR_FEADDR_REG);
4385 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4386 readl(iommu->reg + DMAR_FEUADDR_REG);
4388 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Error unwind: release any save areas already allocated. */
4393 for_each_active_iommu(iommu, drhd)
4394 kfree(iommu->iommu_state);
/*
 * iommu_resume - syscore resume hook: re-initialize the hardware via
 * init_iommu_hw() (panicking under tboot if that fails), restore the
 * saved fault-event registers under register_lock, then free the
 * per-IOMMU save areas allocated at suspend time.
 */
4399 static void iommu_resume(void)
4401 struct dmar_drhd_unit *drhd;
4402 struct intel_iommu *iommu = NULL;
4405 if (init_iommu_hw()) {
4407 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4409 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4413 for_each_active_iommu(iommu, drhd) {
4415 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4417 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4418 iommu->reg + DMAR_FECTL_REG);
4419 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4420 iommu->reg + DMAR_FEDATA_REG);
4421 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4422 iommu->reg + DMAR_FEADDR_REG);
4423 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4424 iommu->reg + DMAR_FEUADDR_REG);
4426 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4429 for_each_active_iommu(iommu, drhd)
4430 kfree(iommu->iommu_state);
/* Suspend/resume hooks, registered only when CONFIG_SUSPEND is set. */
4433 static struct syscore_ops iommu_syscore_ops = {
4434 .resume = iommu_resume,
4435 .suspend = iommu_suspend,
/* Register the syscore ops; the !CONFIG_SUSPEND stub below is a no-op. */
4438 static void __init init_iommu_pm_ops(void)
4440 register_syscore_ops(&iommu_syscore_ops);
4444 static inline void init_iommu_pm_ops(void) {}
4445 #endif /* CONFIG_PM */
/*
 * rmrr_sanity_check - validate a BIOS-provided RMRR: both ends must be
 * page aligned, the range must be non-empty, and the arch-specific check
 * must pass. Non-zero means the RMRR is bogus.
 */
4447 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4449 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4450 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4451 rmrr->end_address <= rmrr->base_address ||
4452 arch_rmrr_sanity_check(rmrr))
/*
 * dmar_parse_one_rmrr - parse one ACPI RMRR structure into a
 * dmar_rmrr_unit and append it to dmar_rmrr_units. A failed sanity check
 * logs a FW_BUG with DMI identification and taints the kernel; the
 * corresponding return is elided in this listing.
 */
4458 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4460 struct acpi_dmar_reserved_memory *rmrr;
4461 struct dmar_rmrr_unit *rmrru;
4463 rmrr = (struct acpi_dmar_reserved_memory *)header;
4464 if (rmrr_sanity_check(rmrr)) {
4466 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4467 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4468 rmrr->base_address, rmrr->end_address,
4469 dmi_get_system_info(DMI_BIOS_VENDOR),
4470 dmi_get_system_info(DMI_BIOS_VERSION),
4471 dmi_get_system_info(DMI_PRODUCT_VERSION),
4472 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
4475 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4479 rmrru->hdr = header;
4481 rmrru->base_address = rmrr->base_address;
4482 rmrru->end_address = rmrr->end_address;
/* Device scope entries follow the fixed RMRR header in the table. */
4484 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4485 ((void *)rmrr) + rmrr->header.length,
4486 &rmrru->devices_cnt);
4487 if (rmrru->devices_cnt && rmrru->devices == NULL)
4490 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * dmar_find_atsr - look up an already-registered ATSR unit matching @atsr
 * by segment, length, and full byte-wise content (RCU list walk).
 */
4499 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4501 struct dmar_atsr_unit *atsru;
4502 struct acpi_dmar_atsr *tmp;
4504 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4505 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4506 if (atsr->segment != tmp->segment)
4508 if (atsr->header.length != tmp->header.length)
4510 if (memcmp(atsr, tmp, atsr->header.length) == 0)
/*
 * dmar_parse_one_atsr - register one ACPI ATSR structure. Duplicates are
 * detected via dmar_find_atsr(); otherwise the header is deep-copied
 * (ACPI _DSM buffers may be freed by the caller) and, unless the
 * include_all flag is set, the device scope is parsed too.
 */
4517 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4519 struct acpi_dmar_atsr *atsr;
4520 struct dmar_atsr_unit *atsru;
/* Late (hotplug-time) ATSRs are only honoured when the IOMMU is enabled. */
4522 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4525 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4526 atsru = dmar_find_atsr(atsr);
4530 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4535 * If memory is allocated from slab by ACPI _DSM method, we need to
4536 * copy the memory content because the memory buffer will be freed
4539 atsru->hdr = (void *)(atsru + 1);
4540 memcpy(atsru->hdr, hdr, hdr->length);
4541 atsru->include_all = atsr->flags & 0x1;
4542 if (!atsru->include_all) {
4543 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4544 (void *)atsr + atsr->header.length,
4545 &atsru->devices_cnt);
4546 if (atsru->devices_cnt && atsru->devices == NULL) {
4552 list_add_rcu(&atsru->list, &dmar_atsr_units);
/* intel_iommu_free_atsr - release an ATSR unit's device scope (kfree elided). */
4557 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4559 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/*
 * dmar_release_one_atsr - unregister the ATSR unit matching @hdr: remove
 * it from the RCU list and free it (grace-period handling is elided in
 * this listing).
 */
4563 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4565 struct acpi_dmar_atsr *atsr;
4566 struct dmar_atsr_unit *atsru;
4568 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4569 atsru = dmar_find_atsr(atsr);
4571 list_del_rcu(&atsru->list);
4573 intel_iommu_free_atsr(atsru);
/*
 * dmar_check_one_atsr - hot-removal check: an ATSR may only go away if no
 * active devices remain in its scope (the per-device check body is elided
 * in this listing).
 */
4579 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4583 struct acpi_dmar_atsr *atsr;
4584 struct dmar_atsr_unit *atsru;
4586 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4587 atsru = dmar_find_atsr(atsr);
4591 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4592 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
/*
 * intel_iommu_add - bring a hot-added DMAR unit online. Verifies the new
 * unit's capabilities are compatible with the running configuration
 * (pass-through, snooping, superpage support), disables any pre-OS
 * translation, allocates domains and a root entry, and finally enables
 * translation with fully flushed caches. Error labels unwind through
 * disable_dmar_iommu()/free_dmar_iommu().
 */
4600 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4603 struct intel_iommu *iommu = dmaru->iommu;
/* Already registered in the global array: nothing to do. */
4605 if (g_iommus[iommu->seq_id])
4608 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4609 pr_warn("%s: Doesn't support hardware pass through.\n",
4613 if (!ecap_sc_support(iommu->ecap) &&
4614 domain_update_iommu_snooping(iommu)) {
4615 pr_warn("%s: Doesn't support snooping.\n",
4619 sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4620 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4621 pr_warn("%s: Doesn't support large page.\n",
4627 * Disable translation if already enabled prior to OS handover.
4629 if (iommu->gcmd & DMA_GCMD_TE)
4630 iommu_disable_translation(iommu);
4632 g_iommus[iommu->seq_id] = iommu;
4633 ret = iommu_init_domains(iommu);
4635 ret = iommu_alloc_root_entry(iommu);
4639 intel_svm_check(iommu);
4641 if (dmaru->ignored) {
4643 * we always have to disable PMRs or DMA may fail on this device
4646 iommu_disable_protect_mem_regions(iommu);
4650 intel_iommu_init_qi(iommu);
4651 iommu_flush_write_buffer(iommu);
4653 #ifdef CONFIG_INTEL_IOMMU_SVM
4654 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4655 ret = intel_svm_enable_prq(iommu);
4660 ret = dmar_set_interrupt(iommu);
4664 iommu_set_root_entry(iommu);
4665 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4666 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4667 iommu_enable_translation(iommu);
4669 iommu_disable_protect_mem_regions(iommu);
/* Error unwind (labels elided in this listing). */
4673 disable_dmar_iommu(iommu);
4675 free_dmar_iommu(iommu);
/*
 * dmar_iommu_hotplug - hot-add/remove entry point for a DMAR unit. On
 * insert, delegates to intel_iommu_add(); the removal branch (visible
 * here as the disable/free pair) tears the unit down. A no-op while
 * the IOMMU subsystem is disabled.
 */
4679 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4682 struct intel_iommu *iommu = dmaru->iommu;
4684 if (!intel_iommu_enabled)
4690 ret = intel_iommu_add(dmaru);
4692 disable_dmar_iommu(iommu);
4693 free_dmar_iommu(iommu);
/*
 * intel_iommu_free_dmars - release every registered RMRR and ATSR unit
 * (used on init failure and shutdown paths).
 */
4699 static void intel_iommu_free_dmars(void)
4701 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4702 struct dmar_atsr_unit *atsru, *atsr_n;
4704 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4705 list_del(&rmrru->list);
4706 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4710 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4711 list_del(&atsru->list);
4712 intel_iommu_free_atsr(atsru);
/*
 * dmar_find_matched_atsr_unit - decide whether @dev may use ATS. Walks up
 * the bus hierarchy to the root port (integrated devices and non-PCIe
 * bridges short-circuit the walk), then checks whether that root port is
 * listed in any ATSR's device scope, or whether an include_all ATSR
 * covers the segment.
 */
4716 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4719 struct pci_bus *bus;
4720 struct pci_dev *bridge = NULL;
4722 struct acpi_dmar_atsr *atsr;
4723 struct dmar_atsr_unit *atsru;
/* SR-IOV VFs are judged by their physical function. */
4725 dev = pci_physfn(dev);
4726 for (bus = dev->bus; bus; bus = bus->parent) {
4728 /* If it's an integrated device, allow ATS */
4731 /* Connected via non-PCIe: no ATS */
4732 if (!pci_is_pcie(bridge) ||
4733 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4735 /* If we found the root port, look it up in the ATSR */
4736 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4741 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4742 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4743 if (atsr->segment != pci_domain_nr(dev->bus))
4746 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4747 if (tmp == &bridge->dev)
4750 if (atsru->include_all)
/*
 * dmar_iommu_notify_scope_dev - PCI bus notifier hook keeping RMRR/ATSR
 * device-scope lists in sync with device hotplug: BUS_NOTIFY_ADD_DEVICE
 * inserts the device into every matching scope, BUS_NOTIFY_REMOVED_DEVICE
 * removes it. Skipped entirely when the IOMMU is disabled at runtime.
 */
4760 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4763 struct dmar_rmrr_unit *rmrru;
4764 struct dmar_atsr_unit *atsru;
4765 struct acpi_dmar_atsr *atsr;
4766 struct acpi_dmar_reserved_memory *rmrr;
4768 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4771 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4772 rmrr = container_of(rmrru->hdr,
4773 struct acpi_dmar_reserved_memory, header);
4774 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4775 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4776 ((void *)rmrr) + rmrr->header.length,
4777 rmrr->segment, rmrru->devices,
4778 rmrru->devices_cnt);
4781 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4782 dmar_remove_dev_scope(info, rmrr->segment,
4783 rmrru->devices, rmrru->devices_cnt);
4787 list_for_each_entry(atsru, &dmar_atsr_units, list) {
/* include_all ATSRs carry no per-device scope to maintain. */
4788 if (atsru->include_all)
4791 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4792 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4793 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4794 (void *)atsr + atsr->header.length,
4795 atsr->segment, atsru->devices,
4796 atsru->devices_cnt);
4801 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4802 if (dmar_remove_dev_scope(info, atsr->segment,
4803 atsru->devices, atsru->devices_cnt))
/*
 * intel_iommu_memory_notifier - keep the static-identity (si) domain in
 * sync with memory hotplug. MEM_GOING_ONLINE: extend the identity map
 * over the new range. MEM_OFFLINE / MEM_CANCEL_ONLINE: walk the affected
 * IOVA range, unmap each piece, flush the IOTLB on every IOMMU, and free
 * the page list and IOVA (the case labels for offline are elided in this
 * listing).
 */
4811 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4812 unsigned long val, void *v)
4814 struct memory_notify *mhp = v;
4815 unsigned long long start, end;
4816 unsigned long start_vpfn, last_vpfn;
4819 case MEM_GOING_ONLINE:
4820 start = mhp->start_pfn << PAGE_SHIFT;
4821 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4822 if (iommu_domain_identity_map(si_domain, start, end)) {
4823 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4830 case MEM_CANCEL_ONLINE:
4831 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4832 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4833 while (start_vpfn <= last_vpfn) {
4835 struct dmar_drhd_unit *drhd;
4836 struct intel_iommu *iommu;
4837 struct page *freelist;
4839 iova = find_iova(&si_domain->iovad, start_vpfn);
4841 pr_debug("Failed get IOVA for PFN %lx\n",
/* Trim the found IOVA down to the range being removed. */
4846 iova = split_and_remove_iova(&si_domain->iovad, iova,
4847 start_vpfn, last_vpfn);
4849 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4850 start_vpfn, last_vpfn);
4854 freelist = domain_unmap(si_domain, iova->pfn_lo,
4858 for_each_active_iommu(iommu, drhd)
4859 iommu_flush_iotlb_psi(iommu, si_domain,
4860 iova->pfn_lo, iova_size(iova),
4863 dma_free_pagelist(freelist);
4865 start_vpfn = iova->pfn_hi + 1;
4866 free_iova_mem(iova);
/* Memory-hotplug notifier wiring for intel_iommu_memory_notifier(). */
4874 static struct notifier_block intel_iommu_memory_nb = {
4875 .notifier_call = intel_iommu_memory_notifier,
/*
 * free_all_cpu_cached_iovas - drop @cpu's per-CPU IOVA caches for every
 * domain on every IOMMU (called when the CPU goes offline).
 */
4879 static void free_all_cpu_cached_iovas(unsigned int cpu)
4883 for (i = 0; i < g_num_of_iommus; i++) {
4884 struct intel_iommu *iommu = g_iommus[i];
4885 struct dmar_domain *domain;
4891 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4892 domain = get_iommu_domain(iommu, (u16)did);
4896 free_cpu_cached_iovas(cpu, &domain->iovad);
/* CPU hotplug (CPUHP_IOMMU_INTEL_DEAD) callback: flush that CPU's IOVA caches. */
4901 static int intel_iommu_cpu_dead(unsigned int cpu)
4903 free_all_cpu_cached_iovas(cpu);
/* intel_disable_iommus - turn DMA translation off on every IOMMU unit. */
4907 static void intel_disable_iommus(void)
4909 struct intel_iommu *iommu = NULL;
4910 struct dmar_drhd_unit *drhd;
4912 for_each_iommu(iommu, drhd)
4913 iommu_disable_translation(iommu);
/*
 * intel_iommu_shutdown - kernel shutdown hook: under dmar_global_lock,
 * explicitly disable protected memory regions on every unit and then
 * switch translation off, so a kexec'd kernel starts from a clean state.
 * No-op when the IOMMU was never enabled.
 */
4916 void intel_iommu_shutdown(void)
4918 struct dmar_drhd_unit *drhd;
4919 struct intel_iommu *iommu = NULL;
4921 if (no_iommu || dmar_disabled)
4924 down_write(&dmar_global_lock);
4926 /* Disable PMRs explicitly here. */
4927 for_each_iommu(iommu, drhd)
4928 iommu_disable_protect_mem_regions(iommu);
4930 /* Make sure the IOMMUs are switched off */
4931 intel_disable_iommus();
4933 up_write(&dmar_global_lock);
/* Map a sysfs struct device back to its containing intel_iommu. */
4936 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4938 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4940 return container_of(iommu_dev, struct intel_iommu, iommu);
/* sysfs "version": major:minor from the DMAR_VER_REG register. */
4943 static ssize_t intel_iommu_show_version(struct device *dev,
4944 struct device_attribute *attr,
4947 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4948 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4949 return sprintf(buf, "%d:%d\n",
4950 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4952 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
/* sysfs "address": physical base address of the unit's register block. */
4954 static ssize_t intel_iommu_show_address(struct device *dev,
4955 struct device_attribute *attr,
4958 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4959 return sprintf(buf, "%llx\n", iommu->reg_phys);
4961 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
/* sysfs "cap": raw capability register. */
4963 static ssize_t intel_iommu_show_cap(struct device *dev,
4964 struct device_attribute *attr,
4967 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4968 return sprintf(buf, "%llx\n", iommu->cap);
4970 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
/* sysfs "ecap": raw extended-capability register. */
4972 static ssize_t intel_iommu_show_ecap(struct device *dev,
4973 struct device_attribute *attr,
4976 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4977 return sprintf(buf, "%llx\n", iommu->ecap);
4979 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
/* sysfs "domains_supported": number of domains the hardware supports. */
4981 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4982 struct device_attribute *attr,
4985 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4986 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4988 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
/* sysfs "domains_used": count of set bits in the domain-id bitmap. */
4990 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4991 struct device_attribute *attr,
4994 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4995 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4996 cap_ndoms(iommu->cap)));
4998 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
/* sysfs attribute tables exposed under the "intel-iommu" group. */
5000 static struct attribute *intel_iommu_attrs[] = {
5001 &dev_attr_version.attr,
5002 &dev_attr_address.attr,
5004 &dev_attr_ecap.attr,
5005 &dev_attr_domains_supported.attr,
5006 &dev_attr_domains_used.attr,
5010 static struct attribute_group intel_iommu_group = {
5011 .name = "intel-iommu",
5012 .attrs = intel_iommu_attrs,
5015 const struct attribute_group *intel_iommu_groups[] = {
/*
 * has_untrusted_dev - true when any PCI device is flagged untrusted
 * (e.g. behind an external-facing/Thunderbolt port); decides whether the
 * bounce-page machinery and forced IOMMU enabling are needed.
 */
5020 static inline bool has_untrusted_dev(void)
5022 struct pci_dev *pdev = NULL;
5024 for_each_pci_dev(pdev)
5025 if (pdev->untrusted)
/*
 * platform_optin_force_iommu - honour the DMAR platform opt-in (Kernel
 * DMA Protection): when untrusted devices exist and the user has not
 * explicitly opted out, force-enable the IOMMU, defaulting to passthrough
 * for trusted devices if the user had disabled it.
 */
5031 static int __init platform_optin_force_iommu(void)
5033 if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5036 if (no_iommu || dmar_disabled)
5037 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5040 * If Intel-IOMMU is disabled by default, we will apply identity
5041 * map for all devices except those marked as being untrusted.
5044 iommu_set_default_passthrough(false);
/*
 * probe_acpi_namespace_devices - attach the IOMMU ops to ACPI namespace
 * devices listed in DRHD device scopes. For each ACPI device's physical
 * nodes, skip ones already in an iommu_group, otherwise install
 * intel_iommu_ops on their bus and probe them.
 */
5052 static int __init probe_acpi_namespace_devices(void)
5054 struct dmar_drhd_unit *drhd;
5055 /* To avoid a -Wunused-but-set-variable warning. */
5056 struct intel_iommu *iommu __maybe_unused;
5060 for_each_active_iommu(iommu, drhd) {
5061 for_each_active_dev_scope(drhd->devices,
5062 drhd->devices_cnt, i, dev) {
5063 struct acpi_device_physical_node *pn;
5064 struct iommu_group *group;
5065 struct acpi_device *adev;
5067 if (dev->bus != &acpi_bus_type)
5070 adev = to_acpi_device(dev);
/* physical_node_list must be walked under the ACPI device's lock. */
5071 mutex_lock(&adev->physical_node_lock);
5072 list_for_each_entry(pn,
5073 &adev->physical_node_list, node) {
5074 group = iommu_group_get(pn->dev);
/* Already grouped: drop the reference and move on. */
5076 iommu_group_put(group);
5080 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5081 ret = iommu_probe_device(pn->dev);
5085 mutex_unlock(&adev->physical_node_lock);
/*
 * intel_iommu_init - top-level VT-d initialization, called at boot.
 * Sequence: honour tboot/platform-opt-in forcing; set up memory pools;
 * parse the DMAR table and device scopes; bail out early (disabling PMRs
 * and translation) when the IOMMU is disabled; reserve special IOVA
 * ranges; initialize DMARs; choose dma_ops (swiotlb bounce vs intel);
 * register PM ops, sysfs entries, bus ops, notifiers; probe ACPI scope
 * devices; and finally enable translation on every unit. Under tboot,
 * failures panic instead of returning. Error labels unwind the reserved
 * ranges, DMAR structures, and mempools.
 */
5095 int __init intel_iommu_init(void)
5098 struct dmar_drhd_unit *drhd;
5099 struct intel_iommu *iommu;
5102 * Intel IOMMU is required for a TXT/tboot launch or platform
5103 * opt in, so enforce that.
5105 force_on = tboot_force_iommu() || platform_optin_force_iommu();
5107 if (iommu_init_mempool()) {
5109 panic("tboot: Failed to initialize iommu memory\n");
5113 down_write(&dmar_global_lock);
5114 if (dmar_table_init()) {
5116 panic("tboot: Failed to initialize DMAR table\n");
5120 if (dmar_dev_scope_init() < 0) {
5122 panic("tboot: Failed to initialize DMAR device scope\n");
5126 up_write(&dmar_global_lock);
5129 * The bus notifier takes the dmar_global_lock, so lockdep will
5130 * complain later when we register it under the lock.
5132 dmar_register_bus_notifier();
5134 down_write(&dmar_global_lock);
5137 intel_iommu_debugfs_init();
5139 if (no_iommu || dmar_disabled) {
5141 * We exit the function here to ensure IOMMU's remapping and
5142 * mempool aren't setup, which means that the IOMMU's PMRs
5143 * won't be disabled via the call to init_dmars(). So disable
5144 * it explicitly here. The PMRs were setup by tboot prior to
5145 * calling SENTER, but the kernel is expected to reset/tear
5148 if (intel_iommu_tboot_noforce) {
5149 for_each_iommu(iommu, drhd)
5150 iommu_disable_protect_mem_regions(iommu);
5154 * Make sure the IOMMUs are switched off, even when we
5155 * boot into a kexec kernel and the previous kernel left
5158 intel_disable_iommus();
5162 if (list_empty(&dmar_rmrr_units))
5163 pr_info("No RMRR found\n");
5165 if (list_empty(&dmar_atsr_units))
5166 pr_info("No ATSR found\n");
5168 if (dmar_init_reserved_ranges()) {
5170 panic("tboot: Failed to reserve iommu ranges\n");
5171 goto out_free_reserved_range;
5175 intel_iommu_gfx_mapped = 1;
5177 init_no_remapping_devices();
5182 panic("tboot: Failed to initialize DMARs\n");
5183 pr_err("Initialization failed\n");
5184 goto out_free_reserved_range;
5186 up_write(&dmar_global_lock);
5188 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5190 * If the system has no untrusted device or the user has decided
5191 * to disable the bounce page mechanisms, we don't need swiotlb.
5192 * Mark this and the pre-allocated bounce pages will be released
5195 if (!has_untrusted_dev() || intel_no_bounce)
5198 dma_ops = &intel_dma_ops;
5200 init_iommu_pm_ops();
/* Register each active unit with the core IOMMU layer and sysfs. */
5202 down_read(&dmar_global_lock);
5203 for_each_active_iommu(iommu, drhd) {
5204 iommu_device_sysfs_add(&iommu->iommu, NULL,
5207 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5208 iommu_device_register(&iommu->iommu);
5210 up_read(&dmar_global_lock);
5212 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5213 if (si_domain && !hw_pass_through)
5214 register_memory_notifier(&intel_iommu_memory_nb);
5215 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5216 intel_iommu_cpu_dead);
5218 down_read(&dmar_global_lock);
5219 if (probe_acpi_namespace_devices())
5220 pr_warn("ACPI name space devices didn't probe correctly\n");
5222 /* Finally, we enable the DMA remapping hardware. */
5223 for_each_iommu(iommu, drhd) {
5224 if (!drhd->ignored && !translation_pre_enabled(iommu))
5225 iommu_enable_translation(iommu);
5227 iommu_disable_protect_mem_regions(iommu);
5229 up_read(&dmar_global_lock);
5231 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5233 intel_iommu_enabled = 1;
5237 out_free_reserved_range:
5238 put_iova_domain(&reserved_iova_list);
5240 intel_iommu_free_dmars();
5241 up_write(&dmar_global_lock);
5242 iommu_exit_mempool();
/* pci_for_each_dma_alias() callback: clear the context entry for one alias. */
5246 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5248 struct intel_iommu *iommu = opaque;
5250 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5255 * NB - intel-iommu lacks any sort of reference counting for the users of
5256 * dependent devices. If multiple endpoints have intersecting dependent
5257 * devices, unbinding the driver from any one of them will possibly leave
5258 * the others unable to operate.
/* Clear context entries for @dev and every PCI DMA alias it owns. */
5260 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5262 if (!iommu || !dev || !dev_is_pci(dev))
5265 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
/*
 * __dmar_remove_one_dev_info - detach a device from its domain. Caller
 * must hold device_domain_lock. Tears down the PASID entry (scalable
 * mode), device IOTLB, context entry, and PASID table; unlinks the info,
 * detaches the domain from the IOMMU under iommu->lock, frees orphaned
 * private domains, and finally releases the info structure.
 */
5268 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5270 struct dmar_domain *domain;
5271 struct intel_iommu *iommu;
5272 unsigned long flags;
5274 assert_spin_locked(&device_domain_lock);
5279 iommu = info->iommu;
5280 domain = info->domain;
5283 if (dev_is_pci(info->dev) && sm_supported(iommu))
5284 intel_pasid_tear_down_entry(iommu, info->dev,
5287 iommu_disable_dev_iotlb(info);
5288 domain_context_clear(iommu, info->dev);
5289 intel_pasid_free_table(info->dev);
5292 unlink_domain_info(info);
5294 spin_lock_irqsave(&iommu->lock, flags);
5295 domain_detach_iommu(domain, iommu);
5296 spin_unlock_irqrestore(&iommu->lock, flags);
5298 /* free the private domain */
5299 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5300 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5301 list_empty(&domain->devices))
5302 domain_exit(info->domain);
5304 free_devinfo_mem(info);
/*
 * dmar_remove_one_dev_info - locked wrapper: look up @dev's domain info
 * and detach it, skipping the deferred/dummy sentinel values.
 */
5307 static void dmar_remove_one_dev_info(struct device *dev)
5309 struct device_domain_info *info;
5310 unsigned long flags;
5312 spin_lock_irqsave(&device_domain_lock, flags);
5313 info = dev->archdata.iommu;
5314 if (info && info != DEFER_DEVICE_DOMAIN_INFO
5315 && info != DUMMY_DEVICE_DOMAIN_INFO)
5316 __dmar_remove_one_dev_info(info);
5317 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * md_domain_init - initialize a domain created through the generic IOMMU
 * API: set up its IOVA space with reserved ranges, derive AGAW from
 * @guest_width, clear the per-IOMMU capability fields (recomputed on
 * attach), and allocate the top-level page directory.
 */
5320 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5324 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5325 domain_reserve_special_ranges(domain);
5327 /* calculate AGAW */
5328 domain->gaw = guest_width;
5329 adjust_width = guestwidth_to_adjustwidth(guest_width);
5330 domain->agaw = width_to_agaw(adjust_width);
5332 domain->iommu_coherency = 0;
5333 domain->iommu_snooping = 0;
5334 domain->iommu_superpage = 0;
5335 domain->max_addr = 0;
5337 /* always allocate the top pgd */
5338 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5341 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * intel_iommu_domain_alloc - iommu_ops ->domain_alloc callback. DMA and
 * unmanaged domains get a fresh dmar_domain (with a deferred-flush IOVA
 * queue for non-strict DMA domains); identity requests share the single
 * static si_domain. Unsupported types return NULL (elided here).
 */
5345 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5347 struct dmar_domain *dmar_domain;
5348 struct iommu_domain *domain;
5352 case IOMMU_DOMAIN_DMA:
5354 case IOMMU_DOMAIN_UNMANAGED:
5355 dmar_domain = alloc_domain(0);
5357 pr_err("Can't allocate dmar_domain\n");
5360 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5361 pr_err("Domain initialization failed\n");
5362 domain_exit(dmar_domain);
5366 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5367 ret = init_iova_flush_queue(&dmar_domain->iovad,
5371 pr_info("iova flush queue initialization failed\n");
5374 domain_update_iommu_cap(dmar_domain);
/* Advertise the addressable aperture derived from the guest width. */
5376 domain = &dmar_domain->domain;
5377 domain->geometry.aperture_start = 0;
5378 domain->geometry.aperture_end =
5379 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5380 domain->geometry.force_aperture = true;
5383 case IOMMU_DOMAIN_IDENTITY:
5384 return &si_domain->domain;
/* ->domain_free callback; the shared static-identity domain is never freed. */
5392 static void intel_iommu_domain_free(struct iommu_domain *domain)
5394 if (domain != &si_domain->domain)
5395 domain_exit(to_dmar_domain(domain));
5399 * Check whether a @domain could be attached to the @dev through the
5400 * aux-domain attach/detach APIs.
/*
 * is_aux_domain - true when @domain should attach to @dev through the
 * aux-domain (PASID-based) path: the device has aux mode enabled and the
 * domain is an unmanaged one.
 */
5403 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5405 struct device_domain_info *info = dev->archdata.iommu;
5407 return info && info->auxd_enabled &&
5408 domain->type == IOMMU_DOMAIN_UNMANAGED;
/*
 * auxiliary_link_device - record the aux attachment: bump the domain's
 * refcount and queue it on the device's auxiliary_domains list. Caller
 * holds device_domain_lock.
 */
5411 static void auxiliary_link_device(struct dmar_domain *domain,
5414 struct device_domain_info *info = dev->archdata.iommu;
5416 assert_spin_locked(&device_domain_lock);
5420 domain->auxd_refcnt++;
5421 list_add(&domain->auxd, &info->auxiliary_domains);
/*
 * auxiliary_unlink_device - undo auxiliary_link_device(); when the last
 * reference drops, free the domain's default PASID. Caller holds
 * device_domain_lock.
 */
5424 static void auxiliary_unlink_device(struct dmar_domain *domain,
5427 struct device_domain_info *info = dev->archdata.iommu;
5429 assert_spin_locked(&device_domain_lock);
5433 list_del(&domain->auxd);
5434 domain->auxd_refcnt--;
5436 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5437 ioasid_free(domain->default_pasid);
/*
 * aux_domain_add_dev - attach @domain to @dev as an auxiliary domain.
 * Allocates the domain's default PASID on first use (bounded by the
 * device's PASID capability), then under device_domain_lock + iommu->lock
 * attaches the domain to the IOMMU and programs the PASID entry (first-
 * or second-level depending on the domain). The trailing statements are
 * the error unwind: detach, unlock, and free the PASID if it became
 * unreferenced.
 */
5440 static int aux_domain_add_dev(struct dmar_domain *domain,
5445 unsigned long flags;
5446 struct intel_iommu *iommu;
5448 iommu = device_to_iommu(dev, &bus, &devfn);
5452 if (domain->default_pasid <= 0) {
5455 /* No private data needed for the default pasid */
5456 pasid = ioasid_alloc(NULL, PASID_MIN,
5457 pci_max_pasids(to_pci_dev(dev)) - 1,
5459 if (pasid == INVALID_IOASID) {
5460 pr_err("Can't allocate default pasid\n");
5463 domain->default_pasid = pasid;
5466 spin_lock_irqsave(&device_domain_lock, flags);
5468 * iommu->lock must be held to attach domain to iommu and setup the
5469 * pasid entry for second level translation.
5471 spin_lock(&iommu->lock);
5472 ret = domain_attach_iommu(domain, iommu);
5476 /* Setup the PASID entry for mediated devices: */
5477 if (domain_use_first_level(domain))
5478 ret = domain_setup_first_level(iommu, domain, dev,
5479 domain->default_pasid);
5481 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5482 domain->default_pasid);
5485 spin_unlock(&iommu->lock);
5487 auxiliary_link_device(domain, dev);
5489 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Error unwind (labels elided in this listing). */
5494 domain_detach_iommu(domain, iommu);
5496 spin_unlock(&iommu->lock);
5497 spin_unlock_irqrestore(&device_domain_lock, flags);
5498 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5499 ioasid_free(domain->default_pasid);
/*
 * Detach an auxiliary @domain from @dev: unlink it, tear down the
 * PASID-table entry for the domain's default PASID, and detach the
 * domain from the IOMMU.  No-op if @domain is not an aux domain for
 * this device.
 */
5504 static void aux_domain_remove_dev(struct dmar_domain *domain,
5507 struct device_domain_info *info;
5508 struct intel_iommu *iommu;
5509 unsigned long flags;
5511 if (!is_aux_domain(dev, &domain->domain))
5514 spin_lock_irqsave(&device_domain_lock, flags);
5515 info = dev->archdata.iommu;
5516 iommu = info->iommu;
5518 auxiliary_unlink_device(domain, dev);
5520 spin_lock(&iommu->lock);
5521 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5522 domain_detach_iommu(domain, iommu);
5523 spin_unlock(&iommu->lock);
5525 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Validate and adjust @domain before attaching @dev: the device's IOMMU
 * address width (capped by MGAW) must cover the domain's current
 * max_addr; the domain's page-table depth (agaw) is then shrunk to what
 * this IOMMU supports by discarding top levels of the pgd.
 */
5528 static int prepare_domain_attach_device(struct iommu_domain *domain,
5531 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5532 struct intel_iommu *iommu;
5536 iommu = device_to_iommu(dev, &bus, &devfn);
5540 /* check if this iommu agaw is sufficient for max mapped address */
5541 addr_width = agaw_to_width(iommu->agaw);
5542 if (addr_width > cap_mgaw(iommu->cap))
5543 addr_width = cap_mgaw(iommu->cap);
5545 if (dmar_domain->max_addr > (1LL << addr_width)) {
5546 dev_err(dev, "%s: iommu width (%d) is not "
5547 "sufficient for the mapped address (%llx)\n",
5548 __func__, addr_width, dmar_domain->max_addr);
5551 dmar_domain->gaw = addr_width;
5554 * Knock out extra levels of page tables if necessary
5556 while (iommu->agaw < dmar_domain->agaw) {
5557 struct dma_pte *pte;
5559 pte = dmar_domain->pgd;
5560 if (dma_pte_present(pte)) {
/* Promote the first (and only) present top-level entry to be the
 * new pgd, then free the old top-level page. */
5561 dmar_domain->pgd = (struct dma_pte *)
5562 phys_to_virt(dma_pte_addr(pte));
5563 free_pgtable_page(pte);
5565 dmar_domain->agaw--;
/*
 * iommu_ops->attach_dev callback.  Rejects RMRR-locked devices for
 * unmanaged domains, routes aux-domain candidates away (aux attach uses
 * a separate API), tears down any existing context mapping, then
 * validates widths and adds the device to the domain.
 */
5571 static int intel_iommu_attach_device(struct iommu_domain *domain,
5576 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5577 device_is_rmrr_locked(dev)) {
5578 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5582 if (is_aux_domain(dev, domain))
5585 /* normally dev is not mapped */
5586 if (unlikely(domain_context_mapped(dev))) {
5587 struct dmar_domain *old_domain;
5589 old_domain = find_domain(dev);
5591 dmar_remove_one_dev_info(dev);
5594 ret = prepare_domain_attach_device(domain, dev);
5598 return domain_add_dev_info(to_dmar_domain(domain), dev);
/*
 * iommu_ops->aux_attach_dev callback: same width validation as a
 * regular attach, then PASID-granular attach via aux_domain_add_dev().
 */
5601 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5606 if (!is_aux_domain(dev, domain))
5609 ret = prepare_domain_attach_device(domain, dev);
5613 return aux_domain_add_dev(to_dmar_domain(domain), dev);
/* iommu_ops->detach_dev: drop the device's domain info entirely. */
5616 static void intel_iommu_detach_device(struct iommu_domain *domain,
5619 dmar_remove_one_dev_info(dev);
/* iommu_ops->aux_detach_dev: PASID-granular detach. */
5622 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5625 aux_domain_remove_dev(to_dmar_domain(domain), dev);
/*
 * iommu_ops->map callback: translate IOMMU_* prot flags to DMA_PTE_*
 * bits (SNP only if the hardware snoops), grow the domain's max_addr
 * after verifying it fits the domain's address width, and install the
 * mapping page by page.
 */
5628 static int intel_iommu_map(struct iommu_domain *domain,
5629 unsigned long iova, phys_addr_t hpa,
5630 size_t size, int iommu_prot, gfp_t gfp)
5632 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5637 if (iommu_prot & IOMMU_READ)
5638 prot |= DMA_PTE_READ;
5639 if (iommu_prot & IOMMU_WRITE)
5640 prot |= DMA_PTE_WRITE;
5641 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5642 prot |= DMA_PTE_SNP;
5644 max_addr = iova + size;
5645 if (dmar_domain->max_addr < max_addr) {
5648 /* check if minimum agaw is sufficient for mapped address */
5649 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5650 if (end < max_addr) {
5651 pr_err("%s: iommu width (%d) is not "
5652 "sufficient for the mapped address (%llx)\n",
5653 __func__, dmar_domain->gaw, max_addr);
5656 dmar_domain->max_addr = max_addr;
5658 /* Round up size to next multiple of PAGE_SIZE, if it and
5659 the low bits of hpa would take us onto the next page */
5660 size = aligned_nrpages(hpa, size);
5661 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5662 hpa >> VTD_PAGE_SHIFT, size, prot);
/*
 * iommu_ops->unmap callback.  If @iova lands inside a large-page
 * mapping the whole superpage is unmapped (size is rounded up to the
 * superpage size — the API requires this).  The freed page-table pages
 * are flushed from every IOMMU serving the domain before being
 * released.
 */
5666 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5667 unsigned long iova, size_t size,
5668 struct iommu_iotlb_gather *gather)
5670 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5671 struct page *freelist = NULL;
5672 unsigned long start_pfn, last_pfn;
5673 unsigned int npages;
5674 int iommu_id, level = 0;
5676 /* Cope with horrid API which requires us to unmap more than the
5677 size argument if it happens to be a large-page mapping. */
5678 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5680 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5681 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5683 start_pfn = iova >> VTD_PAGE_SHIFT;
5684 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5686 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5688 npages = last_pfn - start_pfn + 1;
/* Per-IOMMU IOTLB flush; !freelist means no page-table pages were
 * freed, so the flush need not wait for page walks to drain. */
5690 for_each_domain_iommu(iommu_id, dmar_domain)
5691 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5692 start_pfn, npages, !freelist, 0);
5694 dma_free_pagelist(freelist);
5696 if (dmar_domain->max_addr == iova + size)
5697 dmar_domain->max_addr = iova;
/*
 * iommu_ops->iova_to_phys callback: walk the domain page table and
 * return the physical address backing @iova, preserving the in-page
 * (or in-superpage) offset.  Returns 0 when no PTE is present.
 */
5702 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5705 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5706 struct dma_pte *pte;
5710 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5711 if (pte && dma_pte_present(pte))
5712 phys = dma_pte_addr(pte) +
5713 (iova & (BIT_MASK(level_to_offset_bits(level) +
5714 VTD_PAGE_SHIFT) - 1));
/* True only if every active IOMMU in the system supports scalable
 * mode; a single non-SM unit disqualifies the whole platform. */
5719 static inline bool scalable_mode_support(void)
5721 struct dmar_drhd_unit *drhd;
5722 struct intel_iommu *iommu;
5726 for_each_active_iommu(iommu, drhd) {
5727 if (!sm_supported(iommu)) {
/* True only if every active IOMMU supports PASIDs. */
5737 static inline bool iommu_pasid_support(void)
5739 struct dmar_drhd_unit *drhd;
5740 struct intel_iommu *iommu;
5744 for_each_active_iommu(iommu, drhd) {
5745 if (!pasid_supported(iommu)) {
/* True only if every active IOMMU supports scalable mode AND nested
 * translation (ECAP.NEST). */
5755 static inline bool nested_mode_support(void)
5757 struct dmar_drhd_unit *drhd;
5758 struct intel_iommu *iommu;
5762 for_each_active_iommu(iommu, drhd) {
5763 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
/*
 * iommu_ops->capable callback: report cache-coherency (snoop control on
 * all IOMMUs) and interrupt-remapping capability.
 */
5773 static bool intel_iommu_capable(enum iommu_cap cap)
5775 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5776 return domain_update_iommu_snooping(NULL) == 1;
5777 if (cap == IOMMU_CAP_INTR_REMAP)
5778 return irq_remapping_enabled == 1;
/*
 * iommu_ops->add_device callback, run when a device is discovered.
 * Links the device to its IOMMU's sysfs entry, defers domain setup when
 * the firmware left translation enabled, places the device in an IOMMU
 * group, and then reconciles the group's default domain type with the
 * device's required type — falling back to a "private" identity or DMA
 * domain (flagged DOMAIN_FLAG_LOSE_CHILDREN) when the group-wide
 * request fails.  Devices that can't DMA above the IOMMU aperture get
 * bounce-page dma_ops.
 */
5783 static int intel_iommu_add_device(struct device *dev)
5785 struct dmar_domain *dmar_domain;
5786 struct iommu_domain *domain;
5787 struct intel_iommu *iommu;
5788 struct iommu_group *group;
5792 iommu = device_to_iommu(dev, &bus, &devfn);
5796 iommu_device_link(&iommu->iommu, dev);
5798 if (translation_pre_enabled(iommu))
5799 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5801 group = iommu_group_get_for_dev(dev);
5803 if (IS_ERR(group)) {
5804 ret = PTR_ERR(group);
5808 iommu_group_put(group);
5810 domain = iommu_get_domain_for_dev(dev);
5811 dmar_domain = to_dmar_domain(domain);
/* Default domain is DMA but the device needs identity mapping. */
5812 if (domain->type == IOMMU_DOMAIN_DMA) {
5813 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5814 ret = iommu_request_dm_for_dev(dev);
5816 dmar_remove_one_dev_info(dev);
5817 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5818 domain_add_dev_info(si_domain, dev);
5820 "Device uses a private identity domain.\n");
/* Default domain is identity but the device needs DMA mapping. */
5824 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5825 ret = iommu_request_dma_domain_for_dev(dev);
5827 dmar_remove_one_dev_info(dev);
5828 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5829 if (!get_private_domain_for_dev(dev)) {
5831 "Failed to get a private domain.\n");
5837 "Device uses a private dma domain.\n");
5842 if (device_needs_bounce(dev)) {
5843 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5844 set_dma_ops(dev, &bounce_dma_ops);
/* Error path: undo the sysfs link made above. */
5850 iommu_device_unlink(&iommu->iommu, dev);
/*
 * iommu_ops->remove_device callback: tear down the device's domain
 * info, remove it from its group, unlink sysfs, and restore default
 * dma_ops if bounce buffering had been installed.
 */
5854 static void intel_iommu_remove_device(struct device *dev)
5856 struct intel_iommu *iommu;
5859 iommu = device_to_iommu(dev, &bus, &devfn);
5863 dmar_remove_one_dev_info(dev);
5865 iommu_group_remove_device(dev);
5867 iommu_device_unlink(&iommu->iommu, dev);
5869 if (device_needs_bounce(dev))
5870 set_dma_ops(dev, NULL);
/*
 * iommu_ops->get_resv_regions callback: report reserved IOVA ranges for
 * @device — its RMRR regions (direct or relaxable depending on device
 * class), the legacy ISA/floppy DMA window when configured, and the
 * IOAPIC MSI range.  Entries are appended to @head; the caller frees
 * them via put_resv_regions.
 */
5873 static void intel_iommu_get_resv_regions(struct device *device,
5874 struct list_head *head)
5876 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5877 struct iommu_resv_region *reg;
5878 struct dmar_rmrr_unit *rmrr;
5879 struct device *i_dev;
5882 down_read(&dmar_global_lock);
5883 for_each_rmrr_units(rmrr) {
5884 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5886 struct iommu_resv_region *resv;
5887 enum iommu_resv_type type;
/* Only report RMRRs that target this device or a device behind
 * the same PCI bridge. */
5890 if (i_dev != device &&
5891 !is_downstream_to_pci_bridge(device, i_dev))
5894 length = rmrr->end_address - rmrr->base_address + 1;
5896 type = device_rmrr_is_relaxable(device) ?
5897 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5899 resv = iommu_alloc_resv_region(rmrr->base_address,
5900 length, prot, type);
5904 list_add_tail(&resv->list, head);
5907 up_read(&dmar_global_lock);
5909 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5910 if (dev_is_pci(device)) {
5911 struct pci_dev *pdev = to_pci_dev(device);
5913 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5914 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5915 IOMMU_RESV_DIRECT_RELAXABLE);
/* NOTE(review): "®" below is a mojibake of "&reg" — the
 * argument should read &reg->list; fix the encoding. */
5917 list_add_tail(®->list, head);
5920 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5922 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5923 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
/* NOTE(review): same "®" mojibake here — should be &reg->list. */
5927 list_add_tail(®->list, head);
/*
 * Enable PASID support for @dev on @iommu: set the PASID-enable bit in
 * the device's (legacy-mode) context entry, invalidate that context
 * cache entry so the hardware sees the change, and turn on PASID/ATS
 * features in the device itself via iommu_enable_dev_iotlb().
 * Both device_domain_lock and iommu->lock are held across the context
 * update.
 */
5930 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5932 struct device_domain_info *info;
5933 struct context_entry *context;
5934 struct dmar_domain *domain;
5935 unsigned long flags;
5939 domain = find_domain(dev);
5943 spin_lock_irqsave(&device_domain_lock, flags);
5944 spin_lock(&iommu->lock);
5947 info = dev->archdata.iommu;
5948 if (!info || !info->pasid_supported)
5951 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5952 if (WARN_ON(!context))
5955 ctx_lo = context[0].lo;
5957 if (!(ctx_lo & CONTEXT_PASIDE)) {
5958 ctx_lo |= CONTEXT_PASIDE;
5959 context[0].lo = ctx_lo;
/* Device-selective context-cache invalidation so the IOMMU
 * re-reads the updated context entry. */
5961 iommu->flush.flush_context(iommu,
5962 domain->iommu_did[iommu->seq_id],
5963 PCI_DEVID(info->bus, info->devfn),
5964 DMA_CCMD_MASK_NOBIT,
5965 DMA_CCMD_DEVICE_INVL);
5968 /* Enable PASID support in the device, if it wasn't already */
5969 if (!info->pasid_enabled)
5970 iommu_enable_dev_iotlb(info);
5975 spin_unlock(&iommu->lock);
5976 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * iommu_ops->apply_resv_region callback: carve a reserved region out of
 * the domain's IOVA allocator so DMA addresses are never handed out
 * inside it.
 */
5981 static void intel_iommu_apply_resv_region(struct device *dev,
5982 struct iommu_domain *domain,
5983 struct iommu_resv_region *region)
5985 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5986 unsigned long start, end;
5988 start = IOVA_PFN(region->start);
5989 end = IOVA_PFN(region->start + region->length - 1);
5991 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
/*
 * iommu_ops->device_group callback: PCI devices use the topology-aware
 * pci_device_group(); everything else gets a per-device group.
 */
5994 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5996 if (dev_is_pci(dev))
5997 return pci_device_group(dev);
5998 return generic_device_group(dev);
6001 #ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * Resolve the IOMMU serving @dev for Shared Virtual Memory use,
 * reporting (via dev_err) devices that have no usable translation.
 */
6002 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
6004 struct intel_iommu *iommu;
6007 if (iommu_dummy(dev)) {
6009 "No IOMMU translation for device; cannot enable SVM\n");
6013 iommu = device_to_iommu(dev, &bus, &devfn);
6015 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6021 #endif /* CONFIG_INTEL_IOMMU_SVM */
/*
 * Enable auxiliary-domain (PASID-based mdev) support for @dev: requires
 * DMAR enabled and a scalable-mode, PASID-capable IOMMU.  Enables PASID
 * on the device and marks auxd_enabled in its domain info.
 */
6023 static int intel_iommu_enable_auxd(struct device *dev)
6025 struct device_domain_info *info;
6026 struct intel_iommu *iommu;
6027 unsigned long flags;
6031 iommu = device_to_iommu(dev, &bus, &devfn);
6032 if (!iommu || dmar_disabled)
6035 if (!sm_supported(iommu) || !pasid_supported(iommu))
6038 ret = intel_iommu_enable_pasid(iommu, dev);
6042 spin_lock_irqsave(&device_domain_lock, flags);
6043 info = dev->archdata.iommu;
6044 info->auxd_enabled = 1;
6045 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Disable auxiliary-domain support: clear auxd_enabled under
 * device_domain_lock.  WARN_ON fires (and the store is skipped) if the
 * device has no domain info.
 */
6050 static int intel_iommu_disable_auxd(struct device *dev)
6052 struct device_domain_info *info;
6053 unsigned long flags;
6055 spin_lock_irqsave(&device_domain_lock, flags);
6056 info = dev->archdata.iommu;
6057 if (!WARN_ON(!info))
6058 info->auxd_enabled = 0;
6059 spin_unlock_irqrestore(&device_domain_lock, flags);
6065 * A PCI express designated vendor specific extended capability is defined
6066 * in the section 3.7 of Intel scalable I/O virtualization technical spec
6067 * for system software and tools to detect endpoint devices supporting the
6068 * Intel scalable IO virtualization without host driver dependency.
6070 * Returns the address of the matching extended capability structure within
6071 * the device's PCI configuration space or 0 if the device does not support
/* Scan the DVSEC (extended capability ID 0x23) chain; a match is the
 * Intel vendor ID with DVSEC ID 5 (scalable IOV). */
6074 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6079 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6081 pci_read_config_word(pdev, pos + 4, &vendor);
6082 pci_read_config_word(pdev, pos + 8, &id);
6083 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6086 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
/*
 * iommu_ops->dev_has_feat callback.  AUX is supported only for PCI
 * devices on a scalable-mode, PASID-capable platform whose device
 * advertises both PASID and the scalable-IOV DVSEC.
 */
6093 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6095 if (feat == IOMMU_DEV_FEAT_AUX) {
6098 if (!dev_is_pci(dev) || dmar_disabled ||
6099 !scalable_mode_support() || !iommu_pasid_support())
6102 ret = pci_pasid_features(to_pci_dev(dev));
6106 return !!siov_find_pci_dvsec(to_pci_dev(dev));
/* iommu_ops->dev_enable_feat: only the AUX feature is handled here. */
6113 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6115 if (feat == IOMMU_DEV_FEAT_AUX)
6116 return intel_iommu_enable_auxd(dev);
/* iommu_ops->dev_disable_feat: mirror of dev_enable_feat for AUX. */
6122 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6124 if (feat == IOMMU_DEV_FEAT_AUX)
6125 return intel_iommu_disable_auxd(dev);
/* iommu_ops->dev_feat_enabled: AUX is on iff the platform is scalable
 * mode and the device's auxd_enabled flag was set. */
6131 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6133 struct device_domain_info *info = dev->archdata.iommu;
6135 if (feat == IOMMU_DEV_FEAT_AUX)
6136 return scalable_mode_support() && info && info->auxd_enabled;
/* iommu_ops->aux_get_pasid: return the aux domain's default PASID, or
 * -EINVAL if none was ever allocated. */
6142 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6144 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6146 return dmar_domain->default_pasid > 0 ?
6147 dmar_domain->default_pasid : -EINVAL;
/* iommu_ops->is_attach_deferred: attach is deferred when firmware left
 * translation pre-enabled for this device (see add_device). */
6150 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6153 return attach_deferred(dev);
/*
 * iommu_ops->domain_set_attr callback.  Only DOMAIN_ATTR_NESTING on an
 * unmanaged domain is supported, and only while the domain has no
 * devices attached; nesting forces second-level translation (first
 * level flag cleared).
 */
6157 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6158 enum iommu_attr attr, void *data)
6160 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6161 unsigned long flags;
6164 if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6168 case DOMAIN_ATTR_NESTING:
6169 spin_lock_irqsave(&device_domain_lock, flags);
6170 if (nested_mode_support() &&
6171 list_empty(&dmar_domain->devices)) {
6172 dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6173 dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6177 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * The iommu_ops vtable registered with the IOMMU core for all Intel
 * VT-d units; maps the generic IOMMU API onto the callbacks above.
 */
6187 const struct iommu_ops intel_iommu_ops = {
6188 .capable = intel_iommu_capable,
6189 .domain_alloc = intel_iommu_domain_alloc,
6190 .domain_free = intel_iommu_domain_free,
6191 .domain_set_attr = intel_iommu_domain_set_attr,
6192 .attach_dev = intel_iommu_attach_device,
6193 .detach_dev = intel_iommu_detach_device,
6194 .aux_attach_dev = intel_iommu_aux_attach_device,
6195 .aux_detach_dev = intel_iommu_aux_detach_device,
6196 .aux_get_pasid = intel_iommu_aux_get_pasid,
6197 .map = intel_iommu_map,
6198 .unmap = intel_iommu_unmap,
6199 .iova_to_phys = intel_iommu_iova_to_phys,
6200 .add_device = intel_iommu_add_device,
6201 .remove_device = intel_iommu_remove_device,
6202 .get_resv_regions = intel_iommu_get_resv_regions,
6203 .put_resv_regions = generic_iommu_put_resv_regions,
6204 .apply_resv_region = intel_iommu_apply_resv_region,
6205 .device_group = intel_iommu_device_group,
6206 .dev_has_feat = intel_iommu_dev_has_feat,
6207 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
6208 .dev_enable_feat = intel_iommu_dev_enable_feat,
6209 .dev_disable_feat = intel_iommu_dev_disable_feat,
6210 .is_attach_deferred = intel_iommu_is_attach_deferred,
6211 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
/* PCI fixup: disable DMAR mapping for integrated graphics on chipsets
 * where it is broken (registrations follow below). */
6214 static void quirk_iommu_igfx(struct pci_dev *dev)
6216 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6220 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6229 /* Broadwell igfx malfunctions with dmar */
/* Device IDs below cover the Broadwell GT1/GT2/GT3 graphics variants. */
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6240 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6241 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6242 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6243 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6244 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
/* PCI fixup: force the write-buffer-flush (RWBF) workaround on
 * chipsets that need it but don't advertise the capability. */
6255 static void quirk_iommu_rwbf(struct pci_dev *dev)
6258 * Mobile 4 Series Chipset neglects to set RWBF capability,
6259 * but needs it. Same seems to hold for the desktop versions.
6261 pci_info(dev, "Forcing write-buffer flush capability\n");
/* Mobile/desktop 4 Series host-bridge IDs needing the RWBF quirk. */
6265 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6266 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6267 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6268 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6269 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6270 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6271 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
/* Bit fields (bits 8-11) of the graphics GGC config register: stolen
 * GTT memory size and whether VT (shadow GTT) is enabled. */
6274 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
6275 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
6276 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
6277 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
6278 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
6279 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
6280 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
6281 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
/*
 * Ironlake/Calpella fixup: if the BIOS allocated no shadow GTT
 * (GGC VT bit clear), graphics cannot be translated — disable IOMMU
 * for gfx.  Otherwise force strict (non-batched) IOTLB flushing, since
 * the gfx device must be idle before a flush on these parts.
 */
6283 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6287 if (pci_read_config_word(dev, GGC, &ggc))
6290 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6291 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6293 } else if (dmar_map_gfx) {
6294 /* we have to ensure the gfx device is idle before we flush */
6295 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6296 intel_iommu_strict = 1;
/* Ironlake/Calpella host-bridge IDs needing the shadow-GTT check. */
6299 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6300 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6301 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6302 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6304 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6305 ISOCH DMAR unit for the Azalia sound device, but not give it any
6306 TLB entries, which causes it to deadlock. Check for that. We do
6307 this in a function called from init_dmars(), instead of in a PCI
6308 quirk, because we don't want to print the obnoxious "BIOS broken"
6309 message if VT-d is actually disabled.
6311 static void __init check_tylersburg_isoch(void)
6313 struct pci_dev *pdev;
6314 uint32_t vtisochctrl;
6316 /* If there's no Azalia in the system anyway, forget it. */
6317 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6322 /* System Management Registers. Might be hidden, in which case
6323 we can't do the sanity check. But that's OK, because the
6324 known-broken BIOSes _don't_ actually hide it, so far. */
6325 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6329 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6336 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6337 if (vtisochctrl & 1)
6340 /* Drop all bits other than the number of TLB entries */
6341 vtisochctrl &= 0x1c;
6343 /* If we have the recommended number of TLB entries (16), fine. */
6344 if (vtisochctrl == 0x10)
6347 /* Zero TLB entries? You get to ride the short bus to school. */
6349 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6350 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6351 dmi_get_system_info(DMI_BIOS_VENDOR),
6352 dmi_get_system_info(DMI_BIOS_VERSION),
6353 dmi_get_system_info(DMI_PRODUCT_VERSION));
6354 iommu_identity_mapping |= IDENTMAP_AZALIA;
6358 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",