// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
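/*
 * Illustrative example (editor's sketch): with gaw == 48 and
 * VTD_PAGE_SHIFT == 12,
 *
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1
 *	DOMAIN_MAX_ADDR(48)  == ((1ULL << 36) - 1) << 12 == 0xfffffffff000
 *
 * and DOMAIN_MAX_PFN(48) clamps the PFN to ULONG_MAX on 32-bit builds.
 */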
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of a 4KiB page
 * and that the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all power-of-two page sizes that are multiples of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
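/*
 * Editor's note (illustrative): ~0xFFFUL clears bits 0-11 and sets
 * every higher bit, so each power-of-two size from 4KiB upward is
 * advertised:
 *
 *	INTEL_IOMMU_PGSIZES & (1UL << 12)	nonzero: 4KiB offered
 *	INTEL_IOMMU_PGSIZES & (1UL << 21)	nonzero: 2MiB offered
 *	INTEL_IOMMU_PGSIZES & (1UL << 11)	zero: sub-4KiB not offered
 */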
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 static inline unsigned long level_mask(int level)
131 return -1UL << level_to_offset_bits(level);
134 static inline unsigned long level_size(int level)
136 return 1UL << level_to_offset_bits(level);
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
141 return (pfn + level_size(level) - 1) & level_mask(level);
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
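/*
 * Worked example (editor's sketch) of the level geometry with
 * LEVEL_STRIDE == 9 and 4KiB VT-d pages:
 *
 *	level_to_offset_bits(2)  == 9
 *	level_size(2)            == 512 pages (one level-2 PTE spans 2MiB)
 *	pfn_level_offset(pfn, 2) == (pfn >> 9) & 0x1ff
 *	lvl_to_nr_pages(2)       == 512
 */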
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
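/*
 * Editor's sketch: on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the
 * conversions above are identities. On a hypothetical kernel using
 * 64KiB MM pages (PAGE_SHIFT == 16), one MM PFN would cover 16 VT-d
 * PFNs: mm_to_dma_pfn(1) == 16 and dma_to_mm_pfn(16) == 1.
 */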
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
/*
 * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
 * (used when the kernel is launched with TXT).
 */
static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
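/*
 * Editor's sketch of the domain-id round trip, assuming the entry was
 * zeroed first (e.g. by context_clear_entry()):
 *
 *	context_set_domain_id(ctx, 42);	sets ctx->hi |= (42 & 0xffff) << 8
 *	context_domain_id(ctx);		reads back ((ctx->hi >> 8) & 0xffff) == 42
 */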
282 static inline void context_clear_entry(struct context_entry *context)
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
/* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
/*
 * This is a DMA domain allocated through the iommu domain allocation
 * interface. But one or more devices belonging to this domain have
 * been chosen to use a private domain. We should avoid using the
 * map/unmap/iova_to_phys APIs on it.
 */
#define DOMAIN_FLAG_LOSE_CHILDREN		BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
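/*
 * Illustrative usage of the iterator above (editor's sketch mirroring
 * callers later in this file, e.g. domain_update_iommu_coherency()):
 *
 *	int idx;
 *
 *	for_each_domain_iommu(idx, domain) {
 *		struct intel_iommu *iommu = g_iommus[idx];
 *		...use each IOMMU this domain is attached to...
 *	}
 */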
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
344 static int domain_detach_iommu(struct dmar_domain *domain,
345 struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
366 #define IDENTMAP_ALL 1
367 #define IDENTMAP_GFX 2
368 #define IDENTMAP_AZALIA 4
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
379 * Iterate over elements in device_domain_list and call the specified
380 * callback @fn against each element.
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383 void *data), void *data)
387 struct device_domain_info *info;
389 spin_lock_irqsave(&device_domain_lock, flags);
390 list_for_each_entry(info, &device_domain_list, global) {
391 ret = fn(info, data);
393 spin_unlock_irqrestore(&device_domain_lock, flags);
397 spin_unlock_irqrestore(&device_domain_lock, flags);
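/*
 * Illustrative caller (editor's sketch; the callback is hypothetical
 * and a non-zero return from @fn is assumed to stop the walk):
 *
 *	static int count_infos(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *	for_each_device_domain(count_infos, &n);
 */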
402 const struct iommu_ops intel_iommu_ops;
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
406 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
411 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
414 static void init_translation_status(struct intel_iommu *iommu)
418 gsts = readl(iommu->reg + DMAR_GSTS_REG);
419 if (gsts & DMA_GSTS_TES)
420 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
426 return container_of(dom, struct dmar_domain, domain);
429 static int __init intel_iommu_setup(char *str)
434 if (!strncmp(str, "on", 2)) {
436 pr_info("IOMMU enabled\n");
437 } else if (!strncmp(str, "off", 3)) {
439 no_platform_optin = 1;
440 pr_info("IOMMU disabled\n");
441 } else if (!strncmp(str, "igfx_off", 8)) {
443 pr_info("Disable GFX device mapping\n");
444 } else if (!strncmp(str, "forcedac", 8)) {
445 pr_info("Forcing DAC for PCI devices\n");
447 } else if (!strncmp(str, "strict", 6)) {
448 pr_info("Disable batched IOTLB flush\n");
449 intel_iommu_strict = 1;
450 } else if (!strncmp(str, "sp_off", 6)) {
451 pr_info("Disable supported super page\n");
452 intel_iommu_superpage = 0;
453 } else if (!strncmp(str, "sm_on", 5)) {
454 pr_info("Intel-IOMMU: scalable mode supported\n");
456 } else if (!strncmp(str, "tboot_noforce", 13)) {
458 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459 intel_iommu_tboot_noforce = 1;
462 str += strcspn(str, ",");
468 __setup("intel_iommu=", intel_iommu_setup);
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
473 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
475 struct dmar_domain **domains;
478 domains = iommu->domains[idx];
482 return domains[did & 0xff];
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486 struct dmar_domain *domain)
488 struct dmar_domain **domains;
491 if (!iommu->domains[idx]) {
492 size_t size = 256 * sizeof(struct dmar_domain *);
493 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
496 domains = iommu->domains[idx];
497 if (WARN_ON(!domains))
500 domains[did & 0xff] = domain;
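/*
 * Editor's sketch of the two-level lookup, assuming the elided idx
 * computation is "did >> 8": domain-id 0x1234 selects the lazily
 * allocated 256-entry chunk iommu->domains[0x12] and slot [0x34]
 * within it.
 */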
503 void *alloc_pgtable_page(int node)
508 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
510 vaddr = page_address(page);
514 void free_pgtable_page(void *vaddr)
516 free_page((unsigned long)vaddr);
519 static inline void *alloc_domain_mem(void)
521 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
524 static void free_domain_mem(void *vaddr)
526 kmem_cache_free(iommu_domain_cache, vaddr);
529 static inline void * alloc_devinfo_mem(void)
531 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
534 static inline void free_devinfo_mem(void *vaddr)
536 kmem_cache_free(iommu_devinfo_cache, vaddr);
539 static inline int domain_type_is_si(struct dmar_domain *domain)
541 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
547 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
549 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 sagaw = cap_sagaw(iommu->cap);
558 for (agaw = width_to_agaw(max_gaw);
560 if (test_bit(agaw, &sagaw))
568 * Calculate max SAGAW for each iommu.
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
572 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate the AGAW for each iommu.
 * "SAGAW" may differ across iommus: use a default AGAW, and fall back
 * to a smaller supported AGAW for iommus that don't support the default.
 */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
582 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 /* si_domain and vm domain should not get here. */
591 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
594 for_each_domain_iommu(iommu_id, domain)
597 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
600 return g_iommus[iommu_id];
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 struct dmar_drhd_unit *drhd;
606 struct intel_iommu *iommu;
610 domain->iommu_coherency = 1;
612 for_each_domain_iommu(i, domain) {
614 if (!ecap_coherent(g_iommus[i]->ecap)) {
615 domain->iommu_coherency = 0;
622 /* No hardware attached; use lowest common denominator */
624 for_each_active_iommu(iommu, drhd) {
625 if (!ecap_coherent(iommu->ecap)) {
626 domain->iommu_coherency = 0;
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
635 struct dmar_drhd_unit *drhd;
636 struct intel_iommu *iommu;
640 for_each_active_iommu(iommu, drhd) {
642 if (!ecap_sc_support(iommu->ecap)) {
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
655 struct dmar_drhd_unit *drhd;
656 struct intel_iommu *iommu;
659 if (!intel_iommu_superpage) {
663 /* set iommu_superpage to the smallest common denominator */
665 for_each_active_iommu(iommu, drhd) {
667 mask &= cap_super_page_val(iommu->cap);
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
680 domain_update_iommu_coherency(domain);
681 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
688 struct root_entry *root = &iommu->root_entry[bus];
689 struct context_entry *context;
693 if (sm_supported(iommu)) {
701 context = phys_to_virt(*entry & VTD_PAGE_MASK);
703 unsigned long phy_addr;
707 context = alloc_pgtable_page(iommu->node);
711 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712 phy_addr = virt_to_phys((void *)context);
713 *entry = phy_addr | 1;
714 __iommu_flush_cache(iommu, entry, sizeof(*entry));
716 return &context[devfn];
719 static int iommu_dummy(struct device *dev)
721 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
725 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726 * sub-hierarchy of a candidate PCI-PCI bridge
727 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728 * @bridge: the candidate PCI-PCI bridge
730 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
735 struct pci_dev *pdev, *pbridge;
737 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
740 pdev = to_pci_dev(dev);
741 pbridge = to_pci_dev(bridge);
743 if (pbridge->subordinate &&
744 pbridge->subordinate->number <= pdev->bus->number &&
745 pbridge->subordinate->busn_res.end >= pdev->bus->number)
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
753 struct dmar_drhd_unit *drhd = NULL;
754 struct intel_iommu *iommu;
756 struct pci_dev *pdev = NULL;
760 if (iommu_dummy(dev))
763 if (dev_is_pci(dev)) {
764 struct pci_dev *pf_pdev;
766 pdev = to_pci_dev(dev);
769 /* VMD child devices currently cannot be handled individually */
770 if (is_vmd(pdev->bus))
774 /* VFs aren't listed in scope tables; we need to look up
775 * the PF instead to find the IOMMU. */
776 pf_pdev = pci_physfn(pdev);
778 segment = pci_domain_nr(pdev->bus);
779 } else if (has_acpi_companion(dev))
780 dev = &ACPI_COMPANION(dev)->dev;
783 for_each_active_iommu(iommu, drhd) {
784 if (pdev && segment != drhd->segment)
787 for_each_active_dev_scope(drhd->devices,
788 drhd->devices_cnt, i, tmp) {
790 /* For a VF use its original BDF# not that of the PF
791 * which we used for the IOMMU lookup. Strictly speaking
792 * we could do this for all PCI devices; we only need to
793 * get the BDF# from the scope table for ACPI matches. */
794 if (pdev && pdev->is_virtfn)
797 *bus = drhd->devices[i].bus;
798 *devfn = drhd->devices[i].devfn;
802 if (is_downstream_to_pci_bridge(dev, tmp))
806 if (pdev && drhd->include_all) {
808 *bus = pdev->bus->number;
809 *devfn = pdev->devfn;
820 static void domain_flush_cache(struct dmar_domain *domain,
821 void *addr, int size)
823 if (!domain->iommu_coherency)
824 clflush_cache_range(addr, size);
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
829 struct context_entry *context;
833 spin_lock_irqsave(&iommu->lock, flags);
834 context = iommu_context_addr(iommu, bus, devfn, 0);
836 ret = context_present(context);
837 spin_unlock_irqrestore(&iommu->lock, flags);
841 static void free_context_table(struct intel_iommu *iommu)
845 struct context_entry *context;
847 spin_lock_irqsave(&iommu->lock, flags);
848 if (!iommu->root_entry) {
851 for (i = 0; i < ROOT_ENTRY_NR; i++) {
852 context = iommu_context_addr(iommu, i, 0, 0);
854 free_pgtable_page(context);
856 if (!sm_supported(iommu))
859 context = iommu_context_addr(iommu, i, 0x80, 0);
861 free_pgtable_page(context);
864 free_pgtable_page(iommu->root_entry);
865 iommu->root_entry = NULL;
867 spin_unlock_irqrestore(&iommu->lock, flags);
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871 unsigned long pfn, int *target_level)
873 struct dma_pte *parent, *pte;
874 int level = agaw_to_level(domain->agaw);
877 BUG_ON(!domain->pgd);
879 if (!domain_pfn_supported(domain, pfn))
880 /* Address beyond IOMMU's addressing capabilities. */
883 parent = domain->pgd;
888 offset = pfn_level_offset(pfn, level);
889 pte = &parent[offset];
890 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
892 if (level == *target_level)
895 if (!dma_pte_present(pte)) {
898 tmp_page = alloc_pgtable_page(domain->nid);
903 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905 if (cmpxchg64(&pte->val, 0ULL, pteval))
906 /* Someone else set it while we were thinking; use theirs. */
907 free_pgtable_page(tmp_page);
909 domain_flush_cache(domain, pte, sizeof(*pte));
914 parent = phys_to_virt(dma_pte_addr(pte));
919 *target_level = level;
/* Return the PTE for the given address at a specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
927 int level, int *large_page)
929 struct dma_pte *parent, *pte;
930 int total = agaw_to_level(domain->agaw);
933 parent = domain->pgd;
934 while (level <= total) {
935 offset = pfn_level_offset(pfn, total);
936 pte = &parent[offset];
940 if (!dma_pte_present(pte)) {
945 if (dma_pte_superpage(pte)) {
950 parent = phys_to_virt(dma_pte_addr(pte));
/* Clear last level ptes; a TLB flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958 unsigned long start_pfn,
959 unsigned long last_pfn)
961 unsigned int large_page;
962 struct dma_pte *first_pte, *pte;
964 BUG_ON(!domain_pfn_supported(domain, start_pfn));
965 BUG_ON(!domain_pfn_supported(domain, last_pfn));
966 BUG_ON(start_pfn > last_pfn);
968 /* we don't need lock here; nobody else touches the iova range */
971 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
973 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
978 start_pfn += lvl_to_nr_pages(large_page);
980 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
982 domain_flush_cache(domain, first_pte,
983 (void *)pte - (void *)first_pte);
985 } while (start_pfn && start_pfn <= last_pfn);
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989 int retain_level, struct dma_pte *pte,
990 unsigned long pfn, unsigned long start_pfn,
991 unsigned long last_pfn)
993 pfn = max(start_pfn, pfn);
994 pte = &pte[pfn_level_offset(pfn, level)];
997 unsigned long level_pfn;
998 struct dma_pte *level_pte;
1000 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1003 level_pfn = pfn & level_mask(level);
1004 level_pte = phys_to_virt(dma_pte_addr(pte));
1007 dma_pte_free_level(domain, level - 1, retain_level,
1008 level_pte, level_pfn, start_pfn,
1013 * Free the page table if we're below the level we want to
1014 * retain and the range covers the entire table.
1016 if (level < retain_level && !(start_pfn > level_pfn ||
1017 last_pfn < level_pfn + level_size(level) - 1)) {
1019 domain_flush_cache(domain, pte, sizeof(*pte));
1020 free_pgtable_page(level_pte);
1023 pfn += level_size(level);
1024 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1028 * clear last level (leaf) ptes and free page table pages below the
1029 * level we wish to keep intact.
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032 unsigned long start_pfn,
1033 unsigned long last_pfn,
1036 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038 BUG_ON(start_pfn > last_pfn);
1040 dma_pte_clear_range(domain, start_pfn, last_pfn);
1042 /* We don't need lock here; nobody else touches the iova range */
1043 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044 domain->pgd, 0, start_pfn, last_pfn);
1047 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060 int level, struct dma_pte *pte,
1061 struct page *freelist)
1065 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066 pg->freelist = freelist;
1072 pte = page_address(pg);
1074 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075 freelist = dma_pte_list_pagetables(domain, level - 1,
1078 } while (!first_pte_in_page(pte));
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084 struct dma_pte *pte, unsigned long pfn,
1085 unsigned long start_pfn,
1086 unsigned long last_pfn,
1087 struct page *freelist)
1089 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1091 pfn = max(start_pfn, pfn);
1092 pte = &pte[pfn_level_offset(pfn, level)];
1095 unsigned long level_pfn;
1097 if (!dma_pte_present(pte))
1100 level_pfn = pfn & level_mask(level);
1102 /* If range covers entire pagetable, free it */
1103 if (start_pfn <= level_pfn &&
1104 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1107 if (level > 1 && !dma_pte_superpage(pte))
1108 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 } else if (level > 1) {
1115 /* Recurse down into a level that isn't *entirely* obsolete */
1116 freelist = dma_pte_clear_level(domain, level - 1,
1117 phys_to_virt(dma_pte_addr(pte)),
1118 level_pfn, start_pfn, last_pfn,
1122 pfn += level_size(level);
1123 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1126 domain_flush_cache(domain, first_pte,
1127 (void *)++last_pte - (void *)first_pte);
1132 /* We can't just free the pages because the IOMMU may still be walking
1133 the page tables, and may have cached the intermediate levels. The
1134 pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136 unsigned long start_pfn,
1137 unsigned long last_pfn)
1139 struct page *freelist;
1141 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143 BUG_ON(start_pfn > last_pfn);
1145 /* we don't need lock here; nobody else touches the iova range */
1146 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147 domain->pgd, 0, start_pfn, last_pfn, NULL);
1150 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151 struct page *pgd_page = virt_to_page(domain->pgd);
1152 pgd_page->freelist = freelist;
1153 freelist = pgd_page;
1161 static void dma_free_pagelist(struct page *freelist)
1165 while ((pg = freelist)) {
1166 freelist = pg->freelist;
1167 free_pgtable_page(page_address(pg));
1171 static void iova_entry_free(unsigned long data)
1173 struct page *freelist = (struct page *)data;
1175 dma_free_pagelist(freelist);
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1181 struct root_entry *root;
1182 unsigned long flags;
1184 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1186 pr_err("Allocating root entry for %s failed\n",
1191 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1193 spin_lock_irqsave(&iommu->lock, flags);
1194 iommu->root_entry = root;
1195 spin_unlock_irqrestore(&iommu->lock, flags);
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 addr = virt_to_phys(iommu->root_entry);
1207 if (sm_supported(iommu))
1208 addr |= DMA_RTADDR_SMT;
1210 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1213 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
	/* Make sure the hardware completes it */
1216 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217 readl, (sts & DMA_GSTS_RTPS), sts);
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1227 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1230 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
	/* Make sure the hardware completes it */
1234 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235 readl, (!(val & DMA_GSTS_WBFS)), val);
1237 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* the return value determines whether we need a write buffer flush */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242 u16 did, u16 source_id, u8 function_mask,
1249 case DMA_CCMD_GLOBAL_INVL:
1250 val = DMA_CCMD_GLOBAL_INVL;
1252 case DMA_CCMD_DOMAIN_INVL:
1253 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1255 case DMA_CCMD_DEVICE_INVL:
1256 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1262 val |= DMA_CCMD_ICC;
1264 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
	/* Make sure the hardware completes it */
1268 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1271 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* the return value determines whether we need a write buffer flush */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276 u64 addr, unsigned int size_order, u64 type)
1278 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279 u64 val = 0, val_iva = 0;
1283 case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
1285 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1287 case DMA_TLB_DSI_FLUSH:
1288 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 case DMA_TLB_PSI_FLUSH:
1291 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292 /* IH bit is passed in as part of address */
1293 val_iva = size_order | addr;
1298 /* Note: set drain read/write */
	/*
	 * This is probably done to be extra safe; it looks like we can
	 * ignore it without any impact.
	 */
1304 if (cap_read_drain(iommu->cap))
1305 val |= DMA_TLB_READ_DRAIN;
1307 if (cap_write_drain(iommu->cap))
1308 val |= DMA_TLB_WRITE_DRAIN;
1310 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311 /* Note: Only uses first TLB reg currently */
1313 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314 dmar_writeq(iommu->reg + tlb_offset + 8, val);
	/* Make sure the hardware completes it */
1317 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1320 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 /* check IOTLB invalidation granularity */
1323 if (DMA_TLB_IAIG(val) == 0)
1324 pr_err("Flush IOTLB failed\n");
1325 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327 (unsigned long long)DMA_TLB_IIRG(type),
1328 (unsigned long long)DMA_TLB_IAIG(val));
1331 static struct device_domain_info *
1332 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1335 struct device_domain_info *info;
1337 assert_spin_locked(&device_domain_lock);
1342 list_for_each_entry(info, &domain->devices, link)
1343 if (info->iommu == iommu && info->bus == bus &&
1344 info->devfn == devfn) {
1345 if (info->ats_supported && info->dev)
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1355 struct device_domain_info *info;
1356 bool has_iotlb_device = false;
1358 assert_spin_locked(&device_domain_lock);
1360 list_for_each_entry(info, &domain->devices, link) {
1361 struct pci_dev *pdev;
1363 if (!info->dev || !dev_is_pci(info->dev))
1366 pdev = to_pci_dev(info->dev);
1367 if (pdev->ats_enabled) {
1368 has_iotlb_device = true;
1373 domain->has_iotlb_device = has_iotlb_device;
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1378 struct pci_dev *pdev;
1380 assert_spin_locked(&device_domain_lock);
1382 if (!info || !dev_is_pci(info->dev))
1385 pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign
	 * the PFSID to the invalidation descriptors of a VF so that IOMMU HW
	 * can gauge queue depth at the PF level. If DIT is not supported, the
	 * PFSID field is treated as reserved and should be set to 0.
	 */
1391 if (!ecap_dit(info->iommu->ecap))
1394 struct pci_dev *pf_pdev;
		/* pci_physfn() returns pdev itself if the device is not a VF */
1397 pf_pdev = pci_physfn(pdev);
1398 info->pfsid = pci_dev_id(pf_pdev);
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device is undefined if you enable PASID support after ATS
	   support. So always enable PASID support on devices which have
	   it, even if we can't yet know if we're ever going to use it. */
1407 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408 info->pasid_enabled = 1;
1410 if (info->pri_supported &&
1411 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1412 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413 info->pri_enabled = 1;
1415 if (!pdev->untrusted && info->ats_supported &&
1416 pci_ats_page_aligned(pdev) &&
1417 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418 info->ats_enabled = 1;
1419 domain_update_iotlb(info->domain);
1420 info->ats_qdep = pci_ats_queue_depth(pdev);
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1426 struct pci_dev *pdev;
1428 assert_spin_locked(&device_domain_lock);
1430 if (!dev_is_pci(info->dev))
1433 pdev = to_pci_dev(info->dev);
1435 if (info->ats_enabled) {
1436 pci_disable_ats(pdev);
1437 info->ats_enabled = 0;
1438 domain_update_iotlb(info->domain);
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441 if (info->pri_enabled) {
1442 pci_disable_pri(pdev);
1443 info->pri_enabled = 0;
1445 if (info->pasid_enabled) {
1446 pci_disable_pasid(pdev);
1447 info->pasid_enabled = 0;
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453 u64 addr, unsigned mask)
1456 unsigned long flags;
1457 struct device_domain_info *info;
1459 if (!domain->has_iotlb_device)
1462 spin_lock_irqsave(&device_domain_lock, flags);
1463 list_for_each_entry(info, &domain->devices, link) {
1464 if (!info->ats_enabled)
1467 sid = info->bus << 8 | info->devfn;
1468 qdep = info->ats_qdep;
1469 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1472 spin_unlock_irqrestore(&device_domain_lock, flags);
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476 struct dmar_domain *domain,
1477 unsigned long pfn, unsigned int pages,
1480 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482 u16 did = domain->iommu_did[iommu->seq_id];
	/*
	 * Fall back to domain-selective flush if no PSI support or the size
	 * is too big.
	 * PSI requires the page size to be 2 ^ x, and the base address to be
	 * naturally aligned to the size.
	 */
1494 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1498 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1502 * In caching mode, changes of pages from non-present to present require
1503 * flush. However, device IOTLB doesn't need to be flushed in this case.
1505 if (!cap_caching_mode(iommu->cap) || !map)
1506 iommu_flush_dev_iotlb(domain, addr, mask);
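/*
 * Worked example (editor's sketch): for pages == 9, mask becomes
 * ilog2(__roundup_pow_of_two(9)) == ilog2(16) == 4, so the PSI above
 * invalidates a naturally aligned 16-page (64KiB) region; if mask
 * exceeded cap_max_amask_val(), the DSI fallback is taken instead.
 */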
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511 struct dmar_domain *domain,
1512 unsigned long pfn, unsigned int pages)
1514 /* It's a non-present to present mapping. Only flush if caching mode */
1515 if (cap_caching_mode(iommu->cap))
1516 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1518 iommu_flush_write_buffer(iommu);
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1523 struct dmar_domain *domain;
1526 domain = container_of(iovad, struct dmar_domain, iovad);
1528 for_each_domain_iommu(idx, domain) {
1529 struct intel_iommu *iommu = g_iommus[idx];
1530 u16 did = domain->iommu_did[iommu->seq_id];
1532 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1534 if (!cap_caching_mode(iommu->cap))
1535 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536 0, MAX_AGAW_PFN_WIDTH);
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1543 unsigned long flags;
1545 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1548 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550 pmen &= ~DMA_PMEN_EPM;
1551 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1553 /* wait for the protected region status bit to clear */
1554 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555 readl, !(pmen & DMA_PMEN_PRS), pmen);
1557 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1563 unsigned long flags;
1565 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566 iommu->gcmd |= DMA_GCMD_TE;
1567 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure the hardware completes it */
1570 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571 readl, (sts & DMA_GSTS_TES), sts);
1573 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1581 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582 iommu->gcmd &= ~DMA_GCMD_TE;
1583 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
	/* Make sure the hardware completes it */
1586 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587 readl, (!(sts & DMA_GSTS_TES)), sts);
1589 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1594 u32 ndomains, nlongs;
1597 ndomains = cap_ndoms(iommu->cap);
1598 pr_debug("%s: Number of Domains supported <%d>\n",
1599 iommu->name, ndomains);
1600 nlongs = BITS_TO_LONGS(ndomains);
1602 spin_lock_init(&iommu->lock);
1604 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605 if (!iommu->domain_ids) {
1606 pr_err("%s: Allocating domain id array failed\n",
1611 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612 iommu->domains = kzalloc(size, GFP_KERNEL);
1614 if (iommu->domains) {
1615 size = 256 * sizeof(struct dmar_domain *);
1616 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1619 if (!iommu->domains || !iommu->domains[0]) {
1620 pr_err("%s: Allocating domain array failed\n",
1622 kfree(iommu->domain_ids);
1623 kfree(iommu->domains);
1624 iommu->domain_ids = NULL;
1625 iommu->domains = NULL;
1630 * If Caching mode is set, then invalid translations are tagged
1631 * with domain-id 0, hence we need to pre-allocate it. We also
1632 * use domain-id 0 as a marker for non-allocated domain-id, so
1633 * make sure it is not used for a real domain.
1635 set_bit(0, iommu->domain_ids);
	/*
	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this special case.
	 */
1644 if (sm_supported(iommu))
1645 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1652 struct device_domain_info *info, *tmp;
1653 unsigned long flags;
1655 if (!iommu->domains || !iommu->domain_ids)
1658 spin_lock_irqsave(&device_domain_lock, flags);
1659 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660 if (info->iommu != iommu)
1663 if (!info->dev || !info->domain)
1666 __dmar_remove_one_dev_info(info);
1668 spin_unlock_irqrestore(&device_domain_lock, flags);
1670 if (iommu->gcmd & DMA_GCMD_TE)
1671 iommu_disable_translation(iommu);
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1676 if ((iommu->domains) && (iommu->domain_ids)) {
1677 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1680 for (i = 0; i < elems; i++)
1681 kfree(iommu->domains[i]);
1682 kfree(iommu->domains);
1683 kfree(iommu->domain_ids);
1684 iommu->domains = NULL;
1685 iommu->domain_ids = NULL;
1688 g_iommus[iommu->seq_id] = NULL;
1690 /* free context mapping */
1691 free_context_table(iommu);
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694 if (pasid_supported(iommu)) {
1695 if (ecap_prs(iommu->ecap))
1696 intel_svm_finish_prq(iommu);
1701 static struct dmar_domain *alloc_domain(int flags)
1703 struct dmar_domain *domain;
1705 domain = alloc_domain_mem();
1709 memset(domain, 0, sizeof(*domain));
1710 domain->nid = NUMA_NO_NODE;
1711 domain->flags = flags;
1712 domain->has_iotlb_device = false;
1713 INIT_LIST_HEAD(&domain->devices);
1718 /* Must be called with iommu->lock */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720 struct intel_iommu *iommu)
1722 unsigned long ndomains;
1725 assert_spin_locked(&device_domain_lock);
1726 assert_spin_locked(&iommu->lock);
1728 domain->iommu_refcnt[iommu->seq_id] += 1;
1729 domain->iommu_count += 1;
1730 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731 ndomains = cap_ndoms(iommu->cap);
1732 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1734 if (num >= ndomains) {
1735 pr_err("%s: No free domain ids\n", iommu->name);
1736 domain->iommu_refcnt[iommu->seq_id] -= 1;
1737 domain->iommu_count -= 1;
1741 set_bit(num, iommu->domain_ids);
1742 set_iommu_domain(iommu, num, domain);
1744 domain->iommu_did[iommu->seq_id] = num;
1745 domain->nid = iommu->node;
1747 domain_update_iommu_cap(domain);
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754 struct intel_iommu *iommu)
1758 assert_spin_locked(&device_domain_lock);
1759 assert_spin_locked(&iommu->lock);
1761 domain->iommu_refcnt[iommu->seq_id] -= 1;
1762 count = --domain->iommu_count;
1763 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764 num = domain->iommu_did[iommu->seq_id];
1765 clear_bit(num, iommu->domain_ids);
1766 set_iommu_domain(iommu, num, NULL);
1768 domain_update_iommu_cap(domain);
1769 domain->iommu_did[iommu->seq_id] = 0;
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1778 static int dmar_init_reserved_ranges(void)
1780 struct pci_dev *pdev = NULL;
1784 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1786 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787 &reserved_rbtree_key);
1789 /* IOAPIC ranges shouldn't be accessed by DMA */
1790 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791 IOVA_PFN(IOAPIC_RANGE_END));
1793 pr_err("Reserve IOAPIC range failed\n");
1797 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798 for_each_pci_dev(pdev) {
1801 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802 r = &pdev->resource[i];
1803 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1805 iova = reserve_iova(&reserved_iova_list,
1809 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1819 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1825 int r = (gaw - 12) % 9;
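/*
 * Editor's sketch, assuming the elided handling rounds the width up to
 * the next whole number of 9-bit levels above the 12-bit page offset:
 *
 *	guestwidth_to_adjustwidth(48) == 48	(r == 0, already aligned)
 *	guestwidth_to_adjustwidth(40) == 48	(r == 1, rounded up by 8)
 */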
1836 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1839 int adjust_width, agaw;
1840 unsigned long sagaw;
1843 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1845 err = init_iova_flush_queue(&domain->iovad,
1846 iommu_flush_iova, iova_entry_free);
1850 domain_reserve_special_ranges(domain);
1852 /* calculate AGAW */
1853 if (guest_width > cap_mgaw(iommu->cap))
1854 guest_width = cap_mgaw(iommu->cap);
1855 domain->gaw = guest_width;
1856 adjust_width = guestwidth_to_adjustwidth(guest_width);
1857 agaw = width_to_agaw(adjust_width);
1858 sagaw = cap_sagaw(iommu->cap);
1859 if (!test_bit(agaw, &sagaw)) {
1860 /* hardware doesn't support it, choose a bigger one */
1861 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1862 agaw = find_next_bit(&sagaw, 5, agaw);
1866 domain->agaw = agaw;
1868 if (ecap_coherent(iommu->ecap))
1869 domain->iommu_coherency = 1;
1871 domain->iommu_coherency = 0;
1873 if (ecap_sc_support(iommu->ecap))
1874 domain->iommu_snooping = 1;
1876 domain->iommu_snooping = 0;
1878 if (intel_iommu_superpage)
1879 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1881 domain->iommu_superpage = 0;
1883 domain->nid = iommu->node;
1885 /* always allocate the top pgd */
1886 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1889 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1893 static void domain_exit(struct dmar_domain *domain)
1896 /* Remove associated devices and clear attached or cached domains */
1897 domain_remove_dev_info(domain);
1900 put_iova_domain(&domain->iovad);
1903 struct page *freelist;
1905 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1906 dma_free_pagelist(freelist);
1909 free_domain_mem(domain);
/*
 * Get the PASID directory size for a scalable mode context entry.
 * A value of X in the PDTS field of a scalable mode context entry
 * indicates a PASID directory with 2^(X + 7) entries.
 */
1917 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1921 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1922 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field when
 * translating DMA requests without a PASID.
 */
1935 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1937 context->hi |= pasid & ((1 << 20) - 1);
1938 context->hi |= (1 << 20);
1942 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1945 static inline void context_set_sm_dte(struct context_entry *context)
1947 context->lo |= (1 << 2);
1951 * Set the PRE(Page Request Enable) field of a scalable mode context
1954 static inline void context_set_sm_pre(struct context_entry *context)
1956 context->lo |= (1 << 4);
1959 /* Convert value to context PASID directory size field coding. */
1960 #define context_pdts(pds) (((pds) & 0x7) << 9)
1962 static int domain_context_mapping_one(struct dmar_domain *domain,
1963 struct intel_iommu *iommu,
1964 struct pasid_table *table,
1967 u16 did = domain->iommu_did[iommu->seq_id];
1968 int translation = CONTEXT_TT_MULTI_LEVEL;
1969 struct device_domain_info *info = NULL;
1970 struct context_entry *context;
1971 unsigned long flags;
1976 if (hw_pass_through && domain_type_is_si(domain))
1977 translation = CONTEXT_TT_PASS_THROUGH;
1979 pr_debug("Set context mapping for %02x:%02x.%d\n",
1980 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1982 BUG_ON(!domain->pgd);
1984 spin_lock_irqsave(&device_domain_lock, flags);
1985 spin_lock(&iommu->lock);
1988 context = iommu_context_addr(iommu, bus, devfn, 1);
1993 if (context_present(context))
	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to have finished reset at its driver probe stage,
	 * so no in-flight DMA will exist, and we don't need to worry
	 * about it hereafter.
	 */
2005 if (context_copied(context)) {
2006 u16 did_old = context_domain_id(context);
2008 if (did_old < cap_ndoms(iommu->cap)) {
2009 iommu->flush.flush_context(iommu, did_old,
2010 (((u16)bus) << 8) | devfn,
2011 DMA_CCMD_MASK_NOBIT,
2012 DMA_CCMD_DEVICE_INVL);
2013 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2018 context_clear_entry(context);
2020 if (sm_supported(iommu)) {
2025 /* Setup the PASID DIR pointer: */
2026 pds = context_get_sm_pds(table);
2027 context->lo = (u64)virt_to_phys(table->table) |
2030 /* Setup the RID_PASID field: */
2031 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2034 * Setup the Device-TLB enable bit and Page request
2037 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2038 if (info && info->ats_supported)
2039 context_set_sm_dte(context);
2040 if (info && info->pri_supported)
2041 context_set_sm_pre(context);
2043 struct dma_pte *pgd = domain->pgd;
2046 context_set_domain_id(context, did);
2048 if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip the top levels of the page tables for an iommu
			 * which has a smaller agaw than the default.
			 * Unnecessary for PT mode.
			 */
2053 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2055 pgd = phys_to_virt(dma_pte_addr(pgd));
2056 if (!dma_pte_present(pgd))
2060 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2061 if (info && info->ats_supported)
2062 translation = CONTEXT_TT_DEV_IOTLB;
2064 translation = CONTEXT_TT_MULTI_LEVEL;
2066 context_set_address_root(context, virt_to_phys(pgd));
2067 context_set_address_width(context, agaw);
2070 * In pass through mode, AW must be programmed to
2071 * indicate the largest AGAW value supported by
2072 * hardware. And ASR is ignored by hardware.
2074 context_set_address_width(context, iommu->msagaw);
2077 context_set_translation_type(context, translation);
2080 context_set_fault_enable(context);
2081 context_set_present(context);
2082 domain_flush_cache(domain, context, sizeof(*context));
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries, we only need to flush the write-buffer. If
	 * it _does_ cache non-present entries, then it does so in the
	 * special domain #0, which we have to flush:
	 */
2090 if (cap_caching_mode(iommu->cap)) {
2091 iommu->flush.flush_context(iommu, 0,
2092 (((u16)bus) << 8) | devfn,
2093 DMA_CCMD_MASK_NOBIT,
2094 DMA_CCMD_DEVICE_INVL);
2095 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2097 iommu_flush_write_buffer(iommu);
2099 iommu_enable_dev_iotlb(info);
2104 spin_unlock(&iommu->lock);
2105 spin_unlock_irqrestore(&device_domain_lock, flags);
2110 struct domain_context_mapping_data {
2111 struct dmar_domain *domain;
2112 struct intel_iommu *iommu;
2113 struct pasid_table *table;
2116 static int domain_context_mapping_cb(struct pci_dev *pdev,
2117 u16 alias, void *opaque)
2119 struct domain_context_mapping_data *data = opaque;
2121 return domain_context_mapping_one(data->domain, data->iommu,
2122 data->table, PCI_BUS_NUM(alias),
2127 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2129 struct domain_context_mapping_data data;
2130 struct pasid_table *table;
2131 struct intel_iommu *iommu;
2134 iommu = device_to_iommu(dev, &bus, &devfn);
2138 table = intel_pasid_get_table(dev);
2140 if (!dev_is_pci(dev))
2141 return domain_context_mapping_one(domain, iommu, table,
2144 data.domain = domain;
2148 return pci_for_each_dma_alias(to_pci_dev(dev),
2149 &domain_context_mapping_cb, &data);
2152 static int domain_context_mapped_cb(struct pci_dev *pdev,
2153 u16 alias, void *opaque)
2155 struct intel_iommu *iommu = opaque;
2157 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2160 static int domain_context_mapped(struct device *dev)
2162 struct intel_iommu *iommu;
2165 iommu = device_to_iommu(dev, &bus, &devfn);
2169 if (!dev_is_pci(dev))
2170 return device_context_mapped(iommu, bus, devfn);
2172 return !pci_for_each_dma_alias(to_pci_dev(dev),
2173 domain_context_mapped_cb, iommu);
/* Returns the number of VT-d pages, aligned to the MM page size */
2177 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2180 host_addr &= ~PAGE_MASK;
2181 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
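/*
 * Worked example (editor's sketch) with 4KiB pages: host_addr == 0x1234
 * and size == 0x2000 leave an in-page offset of 0x234, so
 * PAGE_ALIGN(0x234 + 0x2000) >> 12 == 3 VT-d pages are required.
 */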
2184 /* Return largest possible superpage level for a given mapping */
2185 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186 unsigned long iov_pfn,
2187 unsigned long phy_pfn,
2188 unsigned long pages)
2190 int support, level = 1;
2191 unsigned long pfnmerge;
2193 support = domain->iommu_superpage;
2195 /* To use a large page, the virtual *and* physical addresses
2196 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197 of them will mean we have to use smaller pages. So just
2198 merge them and check both at once. */
2199 pfnmerge = iov_pfn | phy_pfn;
2201 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202 pages >>= VTD_STRIDE_SHIFT;
2205 pfnmerge >>= VTD_STRIDE_SHIFT;
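/*
 * Editor's sketch: with iov_pfn == 0x200, phy_pfn == 0x1400 and
 * pages == 512, pfnmerge == 0x1600 has its low nine bits clear, so one
 * pass through the loop succeeds and a 2MiB superpage (level 2) can be
 * used; a misaligned pair such as phy_pfn == 0x1401 stays at level 1.
 */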
2212 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213 struct scatterlist *sg, unsigned long phys_pfn,
2214 unsigned long nr_pages, int prot)
2216 struct dma_pte *first_pte = NULL, *pte = NULL;
2217 phys_addr_t uninitialized_var(pteval);
2218 unsigned long sg_res = 0;
2219 unsigned int largepage_lvl = 0;
2220 unsigned long lvl_pages = 0;
2222 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2224 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2227 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2231 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2234 while (nr_pages > 0) {
2238 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2240 sg_res = aligned_nrpages(sg->offset, sg->length);
2241 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242 sg->dma_length = sg->length;
2243 pteval = (sg_phys(sg) - pgoff) | prot;
2244 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2248 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2250 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
		/* It is a large page */
2254 if (largepage_lvl > 1) {
2255 unsigned long nr_superpages, end_pfn;
2257 pteval |= DMA_PTE_LARGE_PAGE;
2258 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2260 nr_superpages = sg_res / lvl_pages;
2261 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2264 * Ensure that old small page tables are
2265 * removed to make room for superpage(s).
2266 * We're adding new large pages, so make sure
2267 * we don't remove their parent tables.
2269 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2272 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2276 /* We don't need lock here, nobody else
2277 * touches the iova range
2279 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2281 static int dumps = 5;
2282 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283 iov_pfn, tmp, (unsigned long long)pteval);
2286 debug_dma_dump_mappings(NULL);
2291 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2293 BUG_ON(nr_pages < lvl_pages);
2294 BUG_ON(sg_res < lvl_pages);
2296 nr_pages -= lvl_pages;
2297 iov_pfn += lvl_pages;
2298 phys_pfn += lvl_pages;
2299 pteval += lvl_pages * VTD_PAGE_SIZE;
2300 sg_res -= lvl_pages;
2302 /* If the next PTE would be the first in a new page, then we
2303 need to flush the cache on the entries we've just written.
2304 And then we'll need to recalculate 'pte', so clear it and
2305 let it get set again in the if (!pte) block above.
2307 If we're done (!nr_pages) we need to flush the cache too.
2309 Also if we've been setting superpages, we may need to
2310 recalculate 'pte' and switch back to smaller pages for the
2311 end of the mapping, if the trailing size is not enough to
2312 use another superpage (i.e. sg_res < lvl_pages). */
2314 if (!nr_pages || first_pte_in_page(pte) ||
2315 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316 domain_flush_cache(domain, first_pte,
2317 (void *)pte - (void *)first_pte);
2321 if (!sg_res && nr_pages)
2327 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328 struct scatterlist *sg, unsigned long phys_pfn,
2329 unsigned long nr_pages, int prot)
2332 struct intel_iommu *iommu;
2334 /* Do the real mapping first */
2335 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2339 for_each_domain_iommu(iommu_id, domain) {
2340 iommu = g_iommus[iommu_id];
2341 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2347 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2348 struct scatterlist *sg, unsigned long nr_pages,
2351 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2354 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2355 unsigned long phys_pfn, unsigned long nr_pages,
2358 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2361 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2363 unsigned long flags;
2364 struct context_entry *context;
2370 spin_lock_irqsave(&iommu->lock, flags);
2371 context = iommu_context_addr(iommu, bus, devfn, 0);
2373 spin_unlock_irqrestore(&iommu->lock, flags);
2376 did_old = context_domain_id(context);
2377 context_clear_entry(context);
2378 __iommu_flush_cache(iommu, context, sizeof(*context));
2379 spin_unlock_irqrestore(&iommu->lock, flags);
2380 iommu->flush.flush_context(iommu,
2382 (((u16)bus) << 8) | devfn,
2383 DMA_CCMD_MASK_NOBIT,
2384 DMA_CCMD_DEVICE_INVL);
2385 iommu->flush.flush_iotlb(iommu,
2392 static inline void unlink_domain_info(struct device_domain_info *info)
2394 assert_spin_locked(&device_domain_lock);
2395 list_del(&info->link);
2396 list_del(&info->global);
2398 info->dev->archdata.iommu = NULL;
2401 static void domain_remove_dev_info(struct dmar_domain *domain)
2403 struct device_domain_info *info, *tmp;
2404 unsigned long flags;
2406 spin_lock_irqsave(&device_domain_lock, flags);
2407 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2408 __dmar_remove_one_dev_info(info);
2409 spin_unlock_irqrestore(&device_domain_lock, flags);
2414 * Note: struct device->archdata.iommu stores the device domain info
2416 static struct dmar_domain *find_domain(struct device *dev)
2418 struct device_domain_info *info;
2420 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2421 struct iommu_domain *domain;
2423 dev->archdata.iommu = NULL;
2424 domain = iommu_get_domain_for_dev(dev);
2426 intel_iommu_attach_device(domain, dev);
2429 /* No lock here, assumes no domain exit in normal case */
2430 info = dev->archdata.iommu;
2433 return info->domain;
2437 static inline struct device_domain_info *
2438 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2440 struct device_domain_info *info;
2442 list_for_each_entry(info, &device_domain_list, global)
2443 if (info->iommu->segment == segment && info->bus == bus &&
2444 info->devfn == devfn)
2450 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2453 struct dmar_domain *domain)
2455 struct dmar_domain *found = NULL;
2456 struct device_domain_info *info;
2457 unsigned long flags;
2460 info = alloc_devinfo_mem();
2465 info->devfn = devfn;
2466 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2467 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2470 info->domain = domain;
2471 info->iommu = iommu;
2472 info->pasid_table = NULL;
2473 info->auxd_enabled = 0;
2474 INIT_LIST_HEAD(&info->auxiliary_domains);
2476 if (dev && dev_is_pci(dev)) {
2477 struct pci_dev *pdev = to_pci_dev(info->dev);
2479 if (!pdev->untrusted &&
2480 !pci_ats_disabled() &&
2481 ecap_dev_iotlb_support(iommu->ecap) &&
2482 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2483 dmar_find_matched_atsr_unit(pdev))
2484 info->ats_supported = 1;
2486 if (sm_supported(iommu)) {
2487 if (pasid_supported(iommu)) {
2488 int features = pci_pasid_features(pdev);
2490 info->pasid_supported = features | 1;
2493 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2494 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2495 info->pri_supported = 1;
2499 spin_lock_irqsave(&device_domain_lock, flags);
2501 found = find_domain(dev);
2504 struct device_domain_info *info2;
2505 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2507 found = info2->domain;
2513 spin_unlock_irqrestore(&device_domain_lock, flags);
2514 free_devinfo_mem(info);
2515 /* Caller must free the original domain */
2519 spin_lock(&iommu->lock);
2520 ret = domain_attach_iommu(domain, iommu);
2521 spin_unlock(&iommu->lock);
2524 spin_unlock_irqrestore(&device_domain_lock, flags);
2525 free_devinfo_mem(info);
2529 list_add(&info->link, &domain->devices);
2530 list_add(&info->global, &device_domain_list);
2532 dev->archdata.iommu = info;
2533 spin_unlock_irqrestore(&device_domain_lock, flags);
2535 /* PASID table is mandatory for a PCI device in scalable mode. */
2536 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2537 ret = intel_pasid_alloc_table(dev);
2539 dev_err(dev, "PASID table allocation failed\n");
2540 dmar_remove_one_dev_info(dev);
2544 /* Setup the PASID entry for requests without PASID: */
2545 spin_lock(&iommu->lock);
2546 if (hw_pass_through && domain_type_is_si(domain))
2547 ret = intel_pasid_setup_pass_through(iommu, domain,
2548 dev, PASID_RID2PASID);
2550 ret = intel_pasid_setup_second_level(iommu, domain,
2551 dev, PASID_RID2PASID);
2552 spin_unlock(&iommu->lock);
2554 dev_err(dev, "Setup RID2PASID failed\n");
2555 dmar_remove_one_dev_info(dev);
2560 if (dev && domain_context_mapping(domain, dev)) {
2561 dev_err(dev, "Domain context map failed\n");
2562 dmar_remove_one_dev_info(dev);
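/*
 * pci_for_each_dma_alias() callback: record every alias we are handed,
 * so that after the walk *opaque holds the last (topmost) alias.
 */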
2569 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2571 *(u16 *)opaque = alias;
2575 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2577 struct device_domain_info *info;
2578 struct dmar_domain *domain = NULL;
2579 struct intel_iommu *iommu;
2581 unsigned long flags;
2584 iommu = device_to_iommu(dev, &bus, &devfn);
2588 if (dev_is_pci(dev)) {
2589 struct pci_dev *pdev = to_pci_dev(dev);
2591 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2593 spin_lock_irqsave(&device_domain_lock, flags);
2594 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2595 PCI_BUS_NUM(dma_alias),
2598 iommu = info->iommu;
2599 domain = info->domain;
2601 spin_unlock_irqrestore(&device_domain_lock, flags);
2603 /* DMA alias already has a domain, use it */
2608 /* Allocate and initialize new domain for the device */
2609 domain = alloc_domain(0);
2612 if (domain_init(domain, iommu, gaw)) {
2613 domain_exit(domain);
2621 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2622 struct dmar_domain *domain)
2624 struct intel_iommu *iommu;
2625 struct dmar_domain *tmp;
2626 u16 req_id, dma_alias;
2629 iommu = device_to_iommu(dev, &bus, &devfn);
2633 req_id = ((u16)bus << 8) | devfn;
2635 if (dev_is_pci(dev)) {
2636 struct pci_dev *pdev = to_pci_dev(dev);
2638 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2640 /* register PCI DMA alias device */
2641 if (req_id != dma_alias) {
2642 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2643 dma_alias & 0xff, NULL, domain);
2645 if (!tmp || tmp != domain)
2650 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2651 if (!tmp || tmp != domain)
2657 static int iommu_domain_identity_map(struct dmar_domain *domain,
2658 unsigned long long start,
2659 unsigned long long end)
2661 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2662 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2664 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2665 dma_to_mm_pfn(last_vpfn))) {
2666 pr_err("Reserving iova failed\n");
2670 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2672 * RMRR range might overlap with a physical memory range, so clear it first
2675 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2677 return __domain_mapping(domain, first_vpfn, NULL,
2678 first_vpfn, last_vpfn - first_vpfn + 1,
2679 DMA_PTE_READ|DMA_PTE_WRITE);
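/*
 * Sanity-check an RMRR range (ordering, address width) before handing
 * it to iommu_domain_identity_map().
 */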
2682 static int domain_prepare_identity_map(struct device *dev,
2683 struct dmar_domain *domain,
2684 unsigned long long start,
2685 unsigned long long end)
2687 /* For _hardware_ passthrough, don't bother. But for software
2688 passthrough, we do it anyway -- it may indicate a memory
2689 range which is reserved in E820, and which therefore didn't get set
2690 up to start with in si_domain */
2691 if (domain == si_domain && hw_pass_through) {
2692 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2697 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2700 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2701 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2702 dmi_get_system_info(DMI_BIOS_VENDOR),
2703 dmi_get_system_info(DMI_BIOS_VERSION),
2704 dmi_get_system_info(DMI_PRODUCT_VERSION));
2708 if (end >> agaw_to_width(domain->agaw)) {
2709 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2710 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2711 agaw_to_width(domain->agaw),
2712 dmi_get_system_info(DMI_BIOS_VENDOR),
2713 dmi_get_system_info(DMI_BIOS_VERSION),
2714 dmi_get_system_info(DMI_PRODUCT_VERSION));
2718 return iommu_domain_identity_map(domain, start, end);
2721 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2723 static int __init si_domain_init(int hw)
2725 struct dmar_rmrr_unit *rmrr;
2729 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2733 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734 domain_exit(si_domain);
2741 for_each_online_node(nid) {
2742 unsigned long start_pfn, end_pfn;
2745 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2746 ret = iommu_domain_identity_map(si_domain,
2747 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2754 * Normally we use DMA domains for devices which have RMRRs. But we
2755 * lose this requirement for graphics and USB devices. Identity map
2756 * the RMRRs for graphics and USB devices so that they can use the
2759 for_each_rmrr_units(rmrr) {
2760 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2762 unsigned long long start = rmrr->base_address;
2763 unsigned long long end = rmrr->end_address;
2765 if (device_is_rmrr_locked(dev))
2768 if (WARN_ON(end < start ||
2769 end >> agaw_to_width(si_domain->agaw)))
2772 ret = iommu_domain_identity_map(si_domain, start, end);
2781 static int identity_mapping(struct device *dev)
2783 struct device_domain_info *info;
2785 info = dev->archdata.iommu;
2786 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2787 return (info->domain == si_domain);
2792 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2794 struct dmar_domain *ndomain;
2795 struct intel_iommu *iommu;
2798 iommu = device_to_iommu(dev, &bus, &devfn);
2802 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2803 if (ndomain != domain)
2809 static bool device_has_rmrr(struct device *dev)
2811 struct dmar_rmrr_unit *rmrr;
2816 for_each_rmrr_units(rmrr) {
2818 * Return TRUE if this RMRR contains the device that you are asking about.
2821 for_each_active_dev_scope(rmrr->devices,
2822 rmrr->devices_cnt, i, tmp)
2824 is_downstream_to_pci_bridge(dev, tmp)) {
2834 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2835 * is relaxable (i.e., it is allowed to be left unenforced under some conditions)
2836 * @dev: device handle
2838 * We assume that PCI USB devices with RMRRs have them largely
2839 * for historical reasons and that the RMRR space is not actively used post
2840 * boot. This exclusion may change if vendors begin to abuse it.
2842 * The same exception is made for graphics devices, with the requirement that
2843 * any use of the RMRR regions will be torn down before assigning the device
2846 * Return: true if the RMRR is relaxable, false otherwise
2848 static bool device_rmrr_is_relaxable(struct device *dev)
2850 struct pci_dev *pdev;
2852 if (!dev_is_pci(dev))
2855 pdev = to_pci_dev(dev);
2856 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2863 * There are a couple of cases where we need to restrict the functionality of
2864 * devices associated with RMRRs. The first is when evaluating a device for
2865 * identity mapping because problems exist when devices are moved in and out
2866 * of domains and their respective RMRR information is lost. This means that
2867 * a device with associated RMRRs will never be in a "passthrough" domain.
2868 * The second is use of the device through the IOMMU API. This interface
2869 * expects to have full control of the IOVA space for the device. We cannot
2870 * satisfy both the requirement that RMRR access is maintained and have an
2871 * unencumbered IOVA space. We also have no ability to quiesce the device's
2872 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2873 * We therefore prevent devices associated with an RMRR from participating in
2874 * the IOMMU API, which eliminates them from device assignment.
2876 * In both cases, devices which have relaxable RMRRs are not concerned by this
2877 * restriction. See device_rmrr_is_relaxable comment.
2879 static bool device_is_rmrr_locked(struct device *dev)
2881 if (!device_has_rmrr(dev))
2884 if (device_rmrr_is_relaxable(dev))
2891 * Return the required default domain type for a specific device.
2893 * @dev: the device in query
2897 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2898 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2899 * - 0: both identity and dynamic domains work for this device
2901 static int device_def_domain_type(struct device *dev)
2903 if (dev_is_pci(dev)) {
2904 struct pci_dev *pdev = to_pci_dev(dev);
2906 if (device_is_rmrr_locked(dev))
2907 return IOMMU_DOMAIN_DMA;
2910 * Prevent any device marked as untrusted from getting
2911 * placed into the statically identity mapping domain.
2913 if (pdev->untrusted)
2914 return IOMMU_DOMAIN_DMA;
2916 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2917 return IOMMU_DOMAIN_IDENTITY;
2919 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2920 return IOMMU_DOMAIN_IDENTITY;
2923 * We want to start off with all devices in the 1:1 domain, and
2924 * take them out later if we find they can't access all of memory.
2926 * However, we can't do this for PCI devices behind bridges,
2927 * because all PCI devices behind the same bridge will end up
2928 * with the same source-id on their transactions.
2930 * Practically speaking, we can't change things around for these
2931 * devices at run-time, because we can't be sure there'll be no
2932 * DMA transactions in flight for any of their siblings.
2934 * So PCI devices (unless they're on the root bus) as well as
2935 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2936 * the 1:1 domain, just in _case_ one of their siblings turns out
2937 * not to be able to map all of memory.
2939 if (!pci_is_pcie(pdev)) {
2940 if (!pci_is_root_bus(pdev->bus))
2941 return IOMMU_DOMAIN_DMA;
2942 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2943 return IOMMU_DOMAIN_DMA;
2944 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2945 return IOMMU_DOMAIN_DMA;
2947 if (device_has_rmrr(dev))
2948 return IOMMU_DOMAIN_DMA;
2951 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2952 IOMMU_DOMAIN_IDENTITY : 0;
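/*
 * Prefer queued invalidation; if it cannot be enabled, fall back to the
 * register-based invalidation interface.
 */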
2955 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2958 * Start from a sane IOMMU hardware state.
2959 * If queued invalidation was already initialized by us
2960 * (for example, while enabling interrupt remapping) then
2961 * things are already rolling from a sane state.
2965 * Clear any previous faults.
2967 dmar_fault(-1, iommu);
2969 * Disable queued invalidation if supported and already enabled
2970 * before OS handover.
2972 dmar_disable_qi(iommu);
2975 if (dmar_enable_qi(iommu)) {
2977 * Queued invalidation not enabled, use register-based invalidation
2979 iommu->flush.flush_context = __iommu_flush_context;
2980 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2981 pr_info("%s: Using Register based invalidation\n",
2984 iommu->flush.flush_context = qi_flush_context;
2985 iommu->flush.flush_iotlb = qi_flush_iotlb;
2986 pr_info("%s: Using Queued invalidation\n", iommu->name);
2990 static int copy_context_table(struct intel_iommu *iommu,
2991 struct root_entry *old_re,
2992 struct context_entry **tbl,
2995 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2996 struct context_entry *new_ce = NULL, ce;
2997 struct context_entry *old_ce = NULL;
2998 struct root_entry re;
2999 phys_addr_t old_ce_phys;
3001 tbl_idx = ext ? bus * 2 : bus;
3002 memcpy(&re, old_re, sizeof(re));
3004 for (devfn = 0; devfn < 256; devfn++) {
3005 /* First calculate the correct index */
3006 idx = (ext ? devfn * 2 : devfn) % 256;
3009 /* First save what we may have and clean up */
3011 tbl[tbl_idx] = new_ce;
3012 __iommu_flush_cache(iommu, new_ce,
3022 old_ce_phys = root_entry_lctp(&re);
3024 old_ce_phys = root_entry_uctp(&re);
3027 if (ext && devfn == 0) {
3028 /* No LCTP, try UCTP */
3037 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3042 new_ce = alloc_pgtable_page(iommu->node);
3049 /* Now copy the context entry */
3050 memcpy(&ce, old_ce + idx, sizeof(ce));
3052 if (!__context_present(&ce))
3055 did = context_domain_id(&ce);
3056 if (did >= 0 && did < cap_ndoms(iommu->cap))
3057 set_bit(did, iommu->domain_ids);
3060 * We need a marker for copied context entries. This
3061 * marker needs to work for the old format as well as
3062 * for extended context entries.
3064 * Bit 67 of the context entry is used. In the old
3065 * format this bit is available to software, in the
3066 * extended format it is the PGE bit, but PGE is ignored
3067 * by HW if PASIDs are disabled (and thus still available).
3070 * So disable PASIDs first and then mark the entry
3071 * copied. This means that we don't copy PASID
3072 * translations from the old kernel, but this is fine as
3073 * faults there are not fatal.
3075 context_clear_pasid_enable(&ce);
3076 context_set_copied(&ce);
3081 tbl[tbl_idx + pos] = new_ce;
3083 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
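/*
 * Copy the whole root table from the previous kernel, bus by bus, so a
 * kdump kernel can keep the old DMA mappings live instead of faulting
 * on in-flight DMA.
 */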
3092 static int copy_translation_tables(struct intel_iommu *iommu)
3094 struct context_entry **ctxt_tbls;
3095 struct root_entry *old_rt;
3096 phys_addr_t old_rt_phys;
3097 int ctxt_table_entries;
3098 unsigned long flags;
3103 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3104 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3105 new_ext = !!ecap_ecs(iommu->ecap);
3108 * The RTT bit can only be changed when translation is disabled,
3109 * but disabling translation opens a window for data
3110 * corruption. So bail out and don't copy anything if we would
3111 * have to change the bit.
3116 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3120 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3124 /* This is too big for the stack - allocate it from slab */
3125 ctxt_table_entries = ext ? 512 : 256;
3127 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3131 for (bus = 0; bus < 256; bus++) {
3132 ret = copy_context_table(iommu, &old_rt[bus],
3133 ctxt_tbls, bus, ext);
3135 pr_err("%s: Failed to copy context table for bus %d\n",
3141 spin_lock_irqsave(&iommu->lock, flags);
3143 /* Context tables are copied, now write them to the root_entry table */
3144 for (bus = 0; bus < 256; bus++) {
3145 int idx = ext ? bus * 2 : bus;
3148 if (ctxt_tbls[idx]) {
3149 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3150 iommu->root_entry[bus].lo = val;
3153 if (!ext || !ctxt_tbls[idx + 1])
3156 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3157 iommu->root_entry[bus].hi = val;
3160 spin_unlock_irqrestore(&iommu->lock, flags);
3164 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
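/*
 * One-time boot initialization: allocate the global IOMMU array, set up
 * domains, root entries and invalidation for every DRHD unit, and build
 * the static identity (si) domain.
 */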
3174 static int __init init_dmars(void)
3176 struct dmar_drhd_unit *drhd;
3177 struct intel_iommu *iommu;
3183 * initialize and program root entry to not present
3186 for_each_drhd_unit(drhd) {
3188 * lock not needed as this is only incremented in the single-threaded
3189 * kernel __init code path; all other accesses are read-only
3192 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3196 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3199 /* Preallocate enough resources for IOMMU hot-addition */
3200 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3201 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3203 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3206 pr_err("Allocating global iommu array failed\n");
3211 for_each_iommu(iommu, drhd) {
3212 if (drhd->ignored) {
3213 iommu_disable_translation(iommu);
3218 * Find the max PASID size of all IOMMUs in the system.
3219 * We need to ensure the system PASID table is no bigger
3220 * than the smallest supported size.
3222 if (pasid_supported(iommu)) {
3223 u32 temp = 2 << ecap_pss(iommu->ecap);
3225 intel_pasid_max_id = min_t(u32, temp,
3226 intel_pasid_max_id);
3229 g_iommus[iommu->seq_id] = iommu;
3231 intel_iommu_init_qi(iommu);
3233 ret = iommu_init_domains(iommu);
3237 init_translation_status(iommu);
3239 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240 iommu_disable_translation(iommu);
3241 clear_translation_pre_enabled(iommu);
3242 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3248 * we could share the same root & context tables
3249 * among all IOMMUs; need to split it later.
3251 ret = iommu_alloc_root_entry(iommu);
3255 if (translation_pre_enabled(iommu)) {
3256 pr_info("Translation already enabled - trying to copy translation structures\n");
3258 ret = copy_translation_tables(iommu);
3261 * We found the IOMMU with translation
3262 * enabled - but failed to copy over the
3263 * old root-entry table. Try to proceed
3264 * by disabling translation now and
3265 * allocating a clean root-entry table.
3266 * This might cause DMAR faults, but
3267 * probably the dump will still succeed.
3269 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3271 iommu_disable_translation(iommu);
3272 clear_translation_pre_enabled(iommu);
3274 pr_info("Copied translation tables from previous kernel for %s\n",
3279 if (!ecap_pass_through(iommu->ecap))
3280 hw_pass_through = 0;
3281 #ifdef CONFIG_INTEL_IOMMU_SVM
3282 if (pasid_supported(iommu))
3283 intel_svm_init(iommu);
3288 * Now that qi is enabled on all iommus, set the root entry and flush
3289 * caches. This is required on some Intel X58 chipsets, otherwise the
3290 * flush_context function will loop forever and the boot hangs.
3292 for_each_active_iommu(iommu, drhd) {
3293 iommu_flush_write_buffer(iommu);
3294 iommu_set_root_entry(iommu);
3295 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3296 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3299 if (iommu_pass_through)
3300 iommu_identity_mapping |= IDENTMAP_ALL;
3302 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3307 iommu_identity_mapping |= IDENTMAP_GFX;
3309 check_tylersburg_isoch();
3311 ret = si_domain_init(hw_pass_through);
3318 * global invalidate context cache
3319 * global invalidate iotlb
3320 * enable translation
3322 for_each_iommu(iommu, drhd) {
3323 if (drhd->ignored) {
3325 * we always have to disable PMRs or DMA may fail on this device
3329 iommu_disable_protect_mem_regions(iommu);
3333 iommu_flush_write_buffer(iommu);
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3338 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3339 * could cause a lock race condition.
3341 up_write(&dmar_global_lock);
3342 ret = intel_svm_enable_prq(iommu);
3343 down_write(&dmar_global_lock);
3348 ret = dmar_set_interrupt(iommu);
3356 for_each_active_iommu(iommu, drhd) {
3357 disable_dmar_iommu(iommu);
3358 free_dmar_iommu(iommu);
3367 /* This takes a number of _MM_ pages, not VTD pages */
3368 static unsigned long intel_alloc_iova(struct device *dev,
3369 struct dmar_domain *domain,
3370 unsigned long nrpages, uint64_t dma_mask)
3372 unsigned long iova_pfn;
3374 /* Restrict dma_mask to the width that the iommu can handle */
3375 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3376 /* Ensure we reserve the whole size-aligned region */
3377 nrpages = __roundup_pow_of_two(nrpages);
3379 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3381 * First try to allocate an io virtual address in
3382 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range.
3385 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3386 IOVA_PFN(DMA_BIT_MASK(32)), false);
3390 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3391 IOVA_PFN(dma_mask), true);
3392 if (unlikely(!iova_pfn)) {
3393 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3400 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3402 struct dmar_domain *domain, *tmp;
3403 struct dmar_rmrr_unit *rmrr;
3404 struct device *i_dev;
3407 /* Device shouldn't already be attached to any domain. */
3408 domain = find_domain(dev);
3412 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3416 /* We have a new domain - set up possible RMRRs for the device */
3418 for_each_rmrr_units(rmrr) {
3419 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3424 ret = domain_prepare_identity_map(dev, domain,
3428 dev_err(dev, "Mapping reserved region failed\n");
3433 tmp = set_domain_for_dev(dev, domain);
3434 if (!tmp || domain != tmp) {
3435 domain_exit(domain);
3441 dev_err(dev, "Allocating domain failed\n");
3443 domain->domain.type = IOMMU_DOMAIN_DMA;
3448 /* Check if the dev needs to go through the non-identity map and unmap process. */
3449 static bool iommu_need_mapping(struct device *dev)
3453 if (iommu_dummy(dev))
3456 ret = identity_mapping(dev);
3458 u64 dma_mask = *dev->dma_mask;
3460 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3461 dma_mask = dev->coherent_dma_mask;
3463 if (dma_mask >= dma_get_required_mask(dev))
3467 * 32 bit DMA is removed from si_domain and falls back to
3468 * non-identity mapping.
3470 dmar_remove_one_dev_info(dev);
3471 ret = iommu_request_dma_domain_for_dev(dev);
3473 struct iommu_domain *domain;
3474 struct dmar_domain *dmar_domain;
3476 domain = iommu_get_domain_for_dev(dev);
3478 dmar_domain = to_dmar_domain(domain);
3479 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3481 dmar_remove_one_dev_info(dev);
3482 get_private_domain_for_dev(dev);
3485 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3491 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3492 size_t size, int dir, u64 dma_mask)
3494 struct dmar_domain *domain;
3495 phys_addr_t start_paddr;
3496 unsigned long iova_pfn;
3499 struct intel_iommu *iommu;
3500 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3502 BUG_ON(dir == DMA_NONE);
3504 domain = find_domain(dev);
3506 return DMA_MAPPING_ERROR;
3508 iommu = domain_get_iommu(domain);
3509 size = aligned_nrpages(paddr, size);
3511 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3516 * Check if DMAR supports zero-length reads on write-only mappings.
3519 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3520 !cap_zlr(iommu->cap))
3521 prot |= DMA_PTE_READ;
3522 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3523 prot |= DMA_PTE_WRITE;
3525 * paddr to (paddr + size) might span a partial page, so we map the whole
3526 * page. Note: if two parts of one page are mapped separately, we
3527 * might have two guest addresses mapping to the same host paddr, but this
3528 * is not a big problem
3530 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3531 mm_to_dma_pfn(paddr_pfn), size, prot);
3535 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3536 start_paddr += paddr & ~PAGE_MASK;
3541 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3542 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3543 size, (unsigned long long)paddr, dir);
3544 return DMA_MAPPING_ERROR;
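/*
 * dma_map_ops entry points. Identity-mapped devices bypass the IOMMU
 * through the dma-direct path; everything else goes through
 * __intel_map_single() above.
 */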
3547 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3548 unsigned long offset, size_t size,
3549 enum dma_data_direction dir,
3550 unsigned long attrs)
3552 if (iommu_need_mapping(dev))
3553 return __intel_map_single(dev, page_to_phys(page) + offset,
3554 size, dir, *dev->dma_mask);
3555 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3558 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3559 size_t size, enum dma_data_direction dir,
3560 unsigned long attrs)
3562 if (iommu_need_mapping(dev))
3563 return __intel_map_single(dev, phys_addr, size, dir,
3565 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3568 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3570 struct dmar_domain *domain;
3571 unsigned long start_pfn, last_pfn;
3572 unsigned long nrpages;
3573 unsigned long iova_pfn;
3574 struct intel_iommu *iommu;
3575 struct page *freelist;
3576 struct pci_dev *pdev = NULL;
3578 domain = find_domain(dev);
3581 iommu = domain_get_iommu(domain);
3583 iova_pfn = IOVA_PFN(dev_addr);
3585 nrpages = aligned_nrpages(dev_addr, size);
3586 start_pfn = mm_to_dma_pfn(iova_pfn);
3587 last_pfn = start_pfn + nrpages - 1;
3589 if (dev_is_pci(dev))
3590 pdev = to_pci_dev(dev);
3592 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3594 freelist = domain_unmap(domain, start_pfn, last_pfn);
3596 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3597 !has_iova_flush_queue(&domain->iovad)) {
3598 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3599 nrpages, !freelist, 0);
3601 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3602 dma_free_pagelist(freelist);
3604 queue_iova(&domain->iovad, iova_pfn, nrpages,
3605 (unsigned long)freelist);
3607 * queue up the release of the unmap to save the 1/6th of the
3608 * cpu used up by the iotlb flush operation...
3613 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3614 size_t size, enum dma_data_direction dir,
3615 unsigned long attrs)
3617 if (iommu_need_mapping(dev))
3618 intel_unmap(dev, dev_addr, size);
3620 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3623 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3624 size_t size, enum dma_data_direction dir, unsigned long attrs)
3626 if (iommu_need_mapping(dev))
3627 intel_unmap(dev, dev_addr, size);
3630 static void *intel_alloc_coherent(struct device *dev, size_t size,
3631 dma_addr_t *dma_handle, gfp_t flags,
3632 unsigned long attrs)
3634 struct page *page = NULL;
3637 if (!iommu_need_mapping(dev))
3638 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3640 size = PAGE_ALIGN(size);
3641 order = get_order(size);
3643 if (gfpflags_allow_blocking(flags)) {
3644 unsigned int count = size >> PAGE_SHIFT;
3646 page = dma_alloc_from_contiguous(dev, count, order,
3647 flags & __GFP_NOWARN);
3651 page = alloc_pages(flags, order);
3654 memset(page_address(page), 0, size);
3656 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3658 dev->coherent_dma_mask);
3659 if (*dma_handle != DMA_MAPPING_ERROR)
3660 return page_address(page);
3661 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3662 __free_pages(page, order);
3667 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3668 dma_addr_t dma_handle, unsigned long attrs)
3671 struct page *page = virt_to_page(vaddr);
3673 if (!iommu_need_mapping(dev))
3674 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3676 size = PAGE_ALIGN(size);
3677 order = get_order(size);
3679 intel_unmap(dev, dma_handle, size);
3680 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3681 __free_pages(page, order);
3684 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3685 int nelems, enum dma_data_direction dir,
3686 unsigned long attrs)
3688 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3689 unsigned long nrpages = 0;
3690 struct scatterlist *sg;
3693 if (!iommu_need_mapping(dev))
3694 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3696 for_each_sg(sglist, sg, nelems, i) {
3697 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3700 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3703 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3704 enum dma_data_direction dir, unsigned long attrs)
3707 struct dmar_domain *domain;
3710 unsigned long iova_pfn;
3712 struct scatterlist *sg;
3713 unsigned long start_vpfn;
3714 struct intel_iommu *iommu;
3716 BUG_ON(dir == DMA_NONE);
3717 if (!iommu_need_mapping(dev))
3718 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3720 domain = find_domain(dev);
3724 iommu = domain_get_iommu(domain);
3726 for_each_sg(sglist, sg, nelems, i)
3727 size += aligned_nrpages(sg->offset, sg->length);
3729 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3732 sglist->dma_length = 0;
3737 * Check if DMAR supports zero-length reads on write-only mappings.
3740 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3741 !cap_zlr(iommu->cap))
3742 prot |= DMA_PTE_READ;
3743 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3744 prot |= DMA_PTE_WRITE;
3746 start_vpfn = mm_to_dma_pfn(iova_pfn);
3748 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3749 if (unlikely(ret)) {
3750 dma_pte_free_pagetable(domain, start_vpfn,
3751 start_vpfn + size - 1,
3752 agaw_to_level(domain->agaw) + 1);
3753 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
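/*
 * These callbacks are installed as the global dma_ops below. As an
 * illustration (not part of this file), a driver calling, e.g.:
 *
 *	dma_addr_t h = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *
 * lands in intel_map_page() when the device is not identity mapped.
 */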
3760 static const struct dma_map_ops intel_dma_ops = {
3761 .alloc = intel_alloc_coherent,
3762 .free = intel_free_coherent,
3763 .map_sg = intel_map_sg,
3764 .unmap_sg = intel_unmap_sg,
3765 .map_page = intel_map_page,
3766 .unmap_page = intel_unmap_page,
3767 .map_resource = intel_map_resource,
3768 .unmap_resource = intel_unmap_resource,
3769 .dma_supported = dma_direct_supported,
3772 static inline int iommu_domain_cache_init(void)
3776 iommu_domain_cache = kmem_cache_create("iommu_domain",
3777 sizeof(struct dmar_domain),
3782 if (!iommu_domain_cache) {
3783 pr_err("Couldn't create iommu_domain cache\n");
3790 static inline int iommu_devinfo_cache_init(void)
3794 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3795 sizeof(struct device_domain_info),
3799 if (!iommu_devinfo_cache) {
3800 pr_err("Couldn't create devinfo cache\n");
3807 static int __init iommu_init_mempool(void)
3810 ret = iova_cache_get();
3814 ret = iommu_domain_cache_init();
3818 ret = iommu_devinfo_cache_init();
3822 kmem_cache_destroy(iommu_domain_cache);
3829 static void __init iommu_exit_mempool(void)
3831 kmem_cache_destroy(iommu_devinfo_cache);
3832 kmem_cache_destroy(iommu_domain_cache);
3836 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3838 struct dmar_drhd_unit *drhd;
3842 /* We know that this device on this chipset has its own IOMMU.
3843 * If we find it under a different IOMMU, then the BIOS is lying
3844 * to us. Hope that the IOMMU for this device is actually
3845 * disabled, and it needs no translation...
3847 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3849 /* "can't" happen */
3850 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3853 vtbar &= 0xffff0000;
3855 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3856 drhd = dmar_find_matched_drhd_unit(pdev);
3857 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3858 TAINT_FIRMWARE_WORKAROUND,
3859 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3860 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3862 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3864 static void __init init_no_remapping_devices(void)
3866 struct dmar_drhd_unit *drhd;
3870 for_each_drhd_unit(drhd) {
3871 if (!drhd->include_all) {
3872 for_each_active_dev_scope(drhd->devices,
3873 drhd->devices_cnt, i, dev)
3875 /* ignore DMAR unit if no devices exist */
3876 if (i == drhd->devices_cnt)
3881 for_each_active_drhd_unit(drhd) {
3882 if (drhd->include_all)
3885 for_each_active_dev_scope(drhd->devices,
3886 drhd->devices_cnt, i, dev)
3887 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3889 if (i < drhd->devices_cnt)
3892 /* This IOMMU has *only* gfx devices. Either bypass it or
3893 set the gfx_mapped flag, as appropriate */
3894 if (!dmar_map_gfx) {
3896 for_each_active_dev_scope(drhd->devices,
3897 drhd->devices_cnt, i, dev)
3898 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3903 #ifdef CONFIG_SUSPEND
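/*
 * Suspend/resume support: save the fault-event registers of every
 * active IOMMU on suspend and restore them on resume, re-enabling
 * translation via init_iommu_hw().
 */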
3904 static int init_iommu_hw(void)
3906 struct dmar_drhd_unit *drhd;
3907 struct intel_iommu *iommu = NULL;
3909 for_each_active_iommu(iommu, drhd)
3911 dmar_reenable_qi(iommu);
3913 for_each_iommu(iommu, drhd) {
3914 if (drhd->ignored) {
3916 * we always have to disable PMRs or DMA may fail on this device
3920 iommu_disable_protect_mem_regions(iommu);
3924 iommu_flush_write_buffer(iommu);
3926 iommu_set_root_entry(iommu);
3928 iommu->flush.flush_context(iommu, 0, 0, 0,
3929 DMA_CCMD_GLOBAL_INVL);
3930 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3931 iommu_enable_translation(iommu);
3932 iommu_disable_protect_mem_regions(iommu);
3938 static void iommu_flush_all(void)
3940 struct dmar_drhd_unit *drhd;
3941 struct intel_iommu *iommu;
3943 for_each_active_iommu(iommu, drhd) {
3944 iommu->flush.flush_context(iommu, 0, 0, 0,
3945 DMA_CCMD_GLOBAL_INVL);
3946 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3947 DMA_TLB_GLOBAL_FLUSH);
3951 static int iommu_suspend(void)
3953 struct dmar_drhd_unit *drhd;
3954 struct intel_iommu *iommu = NULL;
3957 for_each_active_iommu(iommu, drhd) {
3958 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3960 if (!iommu->iommu_state)
3966 for_each_active_iommu(iommu, drhd) {
3967 iommu_disable_translation(iommu);
3969 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3971 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3972 readl(iommu->reg + DMAR_FECTL_REG);
3973 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3974 readl(iommu->reg + DMAR_FEDATA_REG);
3975 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3976 readl(iommu->reg + DMAR_FEADDR_REG);
3977 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3978 readl(iommu->reg + DMAR_FEUADDR_REG);
3980 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3985 for_each_active_iommu(iommu, drhd)
3986 kfree(iommu->iommu_state);
3991 static void iommu_resume(void)
3993 struct dmar_drhd_unit *drhd;
3994 struct intel_iommu *iommu = NULL;
3997 if (init_iommu_hw()) {
3999 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4001 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4005 for_each_active_iommu(iommu, drhd) {
4007 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4009 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4010 iommu->reg + DMAR_FECTL_REG);
4011 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4012 iommu->reg + DMAR_FEDATA_REG);
4013 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4014 iommu->reg + DMAR_FEADDR_REG);
4015 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4016 iommu->reg + DMAR_FEUADDR_REG);
4018 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4021 for_each_active_iommu(iommu, drhd)
4022 kfree(iommu->iommu_state);
4025 static struct syscore_ops iommu_syscore_ops = {
4026 .resume = iommu_resume,
4027 .suspend = iommu_suspend,
4030 static void __init init_iommu_pm_ops(void)
4032 register_syscore_ops(&iommu_syscore_ops);
4036 static inline void init_iommu_pm_ops(void) {}
4037 #endif /* CONFIG_PM */
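/*
 * DMAR table parsing: one dmar_rmrr_unit is allocated per RMRR entry
 * and kept on dmar_rmrr_units for later device-scope matching.
 */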
4039 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4041 struct acpi_dmar_reserved_memory *rmrr;
4042 struct dmar_rmrr_unit *rmrru;
4044 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4048 rmrru->hdr = header;
4049 rmrr = (struct acpi_dmar_reserved_memory *)header;
4050 rmrru->base_address = rmrr->base_address;
4051 rmrru->end_address = rmrr->end_address;
4053 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4054 ((void *)rmrr) + rmrr->header.length,
4055 &rmrru->devices_cnt);
4056 if (rmrru->devices_cnt && rmrru->devices == NULL)
4059 list_add(&rmrru->list, &dmar_rmrr_units);
4068 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4070 struct dmar_atsr_unit *atsru;
4071 struct acpi_dmar_atsr *tmp;
4073 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4074 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4075 if (atsr->segment != tmp->segment)
4077 if (atsr->header.length != tmp->header.length)
4079 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4086 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4088 struct acpi_dmar_atsr *atsr;
4089 struct dmar_atsr_unit *atsru;
4091 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4094 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4095 atsru = dmar_find_atsr(atsr);
4099 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4104 * If memory is allocated from slab by an ACPI _DSM method, we need to
4105 * copy the memory content because the memory buffer will be freed on release.
4108 atsru->hdr = (void *)(atsru + 1);
4109 memcpy(atsru->hdr, hdr, hdr->length);
4110 atsru->include_all = atsr->flags & 0x1;
4111 if (!atsru->include_all) {
4112 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4113 (void *)atsr + atsr->header.length,
4114 &atsru->devices_cnt);
4115 if (atsru->devices_cnt && atsru->devices == NULL) {
4121 list_add_rcu(&atsru->list, &dmar_atsr_units);
4126 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4128 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4132 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4134 struct acpi_dmar_atsr *atsr;
4135 struct dmar_atsr_unit *atsru;
4137 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4138 atsru = dmar_find_atsr(atsr);
4140 list_del_rcu(&atsru->list);
4142 intel_iommu_free_atsr(atsru);
4148 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4152 struct acpi_dmar_atsr *atsr;
4153 struct dmar_atsr_unit *atsru;
4155 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4156 atsru = dmar_find_atsr(atsr);
4160 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4161 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4169 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4172 struct intel_iommu *iommu = dmaru->iommu;
4174 if (g_iommus[iommu->seq_id])
4177 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4178 pr_warn("%s: Doesn't support hardware pass through.\n",
4182 if (!ecap_sc_support(iommu->ecap) &&
4183 domain_update_iommu_snooping(iommu)) {
4184 pr_warn("%s: Doesn't support snooping.\n",
4188 sp = domain_update_iommu_superpage(iommu) - 1;
4189 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4190 pr_warn("%s: Doesn't support large page.\n",
4196 * Disable translation if already enabled prior to OS handover.
4198 if (iommu->gcmd & DMA_GCMD_TE)
4199 iommu_disable_translation(iommu);
4201 g_iommus[iommu->seq_id] = iommu;
4202 ret = iommu_init_domains(iommu);
4204 ret = iommu_alloc_root_entry(iommu);
4208 #ifdef CONFIG_INTEL_IOMMU_SVM
4209 if (pasid_supported(iommu))
4210 intel_svm_init(iommu);
4213 if (dmaru->ignored) {
4215 * we always have to disable PMRs or DMA may fail on this device
4218 iommu_disable_protect_mem_regions(iommu);
4222 intel_iommu_init_qi(iommu);
4223 iommu_flush_write_buffer(iommu);
4225 #ifdef CONFIG_INTEL_IOMMU_SVM
4226 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4227 ret = intel_svm_enable_prq(iommu);
4232 ret = dmar_set_interrupt(iommu);
4236 iommu_set_root_entry(iommu);
4237 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4238 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4239 iommu_enable_translation(iommu);
4241 iommu_disable_protect_mem_regions(iommu);
4245 disable_dmar_iommu(iommu);
4247 free_dmar_iommu(iommu);
4251 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4254 struct intel_iommu *iommu = dmaru->iommu;
4256 if (!intel_iommu_enabled)
4262 ret = intel_iommu_add(dmaru);
4264 disable_dmar_iommu(iommu);
4265 free_dmar_iommu(iommu);
4271 static void intel_iommu_free_dmars(void)
4273 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4274 struct dmar_atsr_unit *atsru, *atsr_n;
4276 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4277 list_del(&rmrru->list);
4278 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4282 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4283 list_del(&atsru->list);
4284 intel_iommu_free_atsr(atsru);
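/*
 * Walk up from the device to its root port and check whether that port
 * appears in an ATSR unit for the same PCI segment; integrated (root
 * bus) devices are always allowed to use ATS.
 */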
4288 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4291 struct pci_bus *bus;
4292 struct pci_dev *bridge = NULL;
4294 struct acpi_dmar_atsr *atsr;
4295 struct dmar_atsr_unit *atsru;
4297 dev = pci_physfn(dev);
4298 for (bus = dev->bus; bus; bus = bus->parent) {
4300 /* If it's an integrated device, allow ATS */
4303 /* Connected via non-PCIe: no ATS */
4304 if (!pci_is_pcie(bridge) ||
4305 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4307 /* If we found the root port, look it up in the ATSR */
4308 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4313 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4314 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4315 if (atsr->segment != pci_domain_nr(dev->bus))
4318 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4319 if (tmp == &bridge->dev)
4322 if (atsru->include_all)
4332 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4335 struct dmar_rmrr_unit *rmrru;
4336 struct dmar_atsr_unit *atsru;
4337 struct acpi_dmar_atsr *atsr;
4338 struct acpi_dmar_reserved_memory *rmrr;
4340 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4343 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4344 rmrr = container_of(rmrru->hdr,
4345 struct acpi_dmar_reserved_memory, header);
4346 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4347 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4348 ((void *)rmrr) + rmrr->header.length,
4349 rmrr->segment, rmrru->devices,
4350 rmrru->devices_cnt);
4353 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4354 dmar_remove_dev_scope(info, rmrr->segment,
4355 rmrru->devices, rmrru->devices_cnt);
4359 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4360 if (atsru->include_all)
4363 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4364 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4365 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4366 (void *)atsr + atsr->header.length,
4367 atsr->segment, atsru->devices,
4368 atsru->devices_cnt);
4373 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4374 if (dmar_remove_dev_scope(info, atsr->segment,
4375 atsru->devices, atsru->devices_cnt))
4383 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4384 unsigned long val, void *v)
4386 struct memory_notify *mhp = v;
4387 unsigned long long start, end;
4388 unsigned long start_vpfn, last_vpfn;
4391 case MEM_GOING_ONLINE:
4392 start = mhp->start_pfn << PAGE_SHIFT;
4393 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4394 if (iommu_domain_identity_map(si_domain, start, end)) {
4395 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4402 case MEM_CANCEL_ONLINE:
4403 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4404 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4405 while (start_vpfn <= last_vpfn) {
4407 struct dmar_drhd_unit *drhd;
4408 struct intel_iommu *iommu;
4409 struct page *freelist;
4411 iova = find_iova(&si_domain->iovad, start_vpfn);
4413 pr_debug("Failed get IOVA for PFN %lx\n",
4418 iova = split_and_remove_iova(&si_domain->iovad, iova,
4419 start_vpfn, last_vpfn);
4421 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4422 start_vpfn, last_vpfn);
4426 freelist = domain_unmap(si_domain, iova->pfn_lo,
4430 for_each_active_iommu(iommu, drhd)
4431 iommu_flush_iotlb_psi(iommu, si_domain,
4432 iova->pfn_lo, iova_size(iova),
4435 dma_free_pagelist(freelist);
4437 start_vpfn = iova->pfn_hi + 1;
4438 free_iova_mem(iova);
4446 static struct notifier_block intel_iommu_memory_nb = {
4447 .notifier_call = intel_iommu_memory_notifier,
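/*
 * CPU hotplug callback: when a CPU dies, drop its per-CPU IOVA caches
 * for every domain on every IOMMU so the ranges can be reused.
 */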
4451 static void free_all_cpu_cached_iovas(unsigned int cpu)
4455 for (i = 0; i < g_num_of_iommus; i++) {
4456 struct intel_iommu *iommu = g_iommus[i];
4457 struct dmar_domain *domain;
4463 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4464 domain = get_iommu_domain(iommu, (u16)did);
4468 free_cpu_cached_iovas(cpu, &domain->iovad);
4473 static int intel_iommu_cpu_dead(unsigned int cpu)
4475 free_all_cpu_cached_iovas(cpu);
4479 static void intel_disable_iommus(void)
4481 struct intel_iommu *iommu = NULL;
4482 struct dmar_drhd_unit *drhd;
4484 for_each_iommu(iommu, drhd)
4485 iommu_disable_translation(iommu);
4488 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4490 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4492 return container_of(iommu_dev, struct intel_iommu, iommu);
4495 static ssize_t intel_iommu_show_version(struct device *dev,
4496 struct device_attribute *attr,
4499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4500 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4501 return sprintf(buf, "%d:%d\n",
4502 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4504 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4506 static ssize_t intel_iommu_show_address(struct device *dev,
4507 struct device_attribute *attr,
4510 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4511 return sprintf(buf, "%llx\n", iommu->reg_phys);
4513 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4515 static ssize_t intel_iommu_show_cap(struct device *dev,
4516 struct device_attribute *attr,
4519 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4520 return sprintf(buf, "%llx\n", iommu->cap);
4522 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4524 static ssize_t intel_iommu_show_ecap(struct device *dev,
4525 struct device_attribute *attr,
4528 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4529 return sprintf(buf, "%llx\n", iommu->ecap);
4531 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4533 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4534 struct device_attribute *attr,
4537 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4538 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4540 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4542 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4543 struct device_attribute *attr,
4546 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4547 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4548 cap_ndoms(iommu->cap)));
4550 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4552 static struct attribute *intel_iommu_attrs[] = {
4553 &dev_attr_version.attr,
4554 &dev_attr_address.attr,
4556 &dev_attr_ecap.attr,
4557 &dev_attr_domains_supported.attr,
4558 &dev_attr_domains_used.attr,
4562 static struct attribute_group intel_iommu_group = {
4563 .name = "intel-iommu",
4564 .attrs = intel_iommu_attrs,
4567 const struct attribute_group *intel_iommu_groups[] = {
4572 static int __init platform_optin_force_iommu(void)
4574 struct pci_dev *pdev = NULL;
4575 bool has_untrusted_dev = false;
4577 if (!dmar_platform_optin() || no_platform_optin)
4580 for_each_pci_dev(pdev) {
4581 if (pdev->untrusted) {
4582 has_untrusted_dev = true;
4587 if (!has_untrusted_dev)
4590 if (no_iommu || dmar_disabled)
4591 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4594 * If Intel-IOMMU is disabled by default, we will apply identity
4595 * map for all devices except those marked as being untrusted.
4598 iommu_identity_mapping |= IDENTMAP_ALL;
4601 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4609 static int __init probe_acpi_namespace_devices(void)
4611 struct dmar_drhd_unit *drhd;
4612 /* To avoid a -Wunused-but-set-variable warning. */
4613 struct intel_iommu *iommu __maybe_unused;
4617 for_each_active_iommu(iommu, drhd) {
4618 for_each_active_dev_scope(drhd->devices,
4619 drhd->devices_cnt, i, dev) {
4620 struct acpi_device_physical_node *pn;
4621 struct iommu_group *group;
4622 struct acpi_device *adev;
4624 if (dev->bus != &acpi_bus_type)
4627 adev = to_acpi_device(dev);
4628 mutex_lock(&adev->physical_node_lock);
4629 list_for_each_entry(pn,
4630 &adev->physical_node_list, node) {
4631 group = iommu_group_get(pn->dev);
4633 iommu_group_put(group);
4637 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4638 ret = iommu_probe_device(pn->dev);
4642 mutex_unlock(&adev->physical_node_lock);
4652 int __init intel_iommu_init(void)
4655 struct dmar_drhd_unit *drhd;
4656 struct intel_iommu *iommu;
4659 * Intel IOMMU is required for a TXT/tboot launch or platform
4660 * opt in, so enforce that.
4662 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4664 if (iommu_init_mempool()) {
4666 panic("tboot: Failed to initialize iommu memory\n");
4670 down_write(&dmar_global_lock);
4671 if (dmar_table_init()) {
4673 panic("tboot: Failed to initialize DMAR table\n");
4677 if (dmar_dev_scope_init() < 0) {
4679 panic("tboot: Failed to initialize DMAR device scope\n");
4683 up_write(&dmar_global_lock);
4686 * The bus notifier takes the dmar_global_lock, so lockdep will
4687 * complain later when we register it under the lock.
4689 dmar_register_bus_notifier();
4691 down_write(&dmar_global_lock);
4693 if (no_iommu || dmar_disabled) {
4695 * We exit the function here to ensure the IOMMUs' remapping and
4696 * mempool aren't set up, which means that the IOMMUs' PMRs
4697 * won't be disabled via the call to init_dmars(). So disable
4698 * them explicitly here. The PMRs were set up by tboot prior to
4699 * calling SENTER, but the kernel is expected to reset/tear them down.
4702 if (intel_iommu_tboot_noforce) {
4703 for_each_iommu(iommu, drhd)
4704 iommu_disable_protect_mem_regions(iommu);
4708 * Make sure the IOMMUs are switched off, even when we
4709 * boot into a kexec kernel and the previous kernel left them enabled.
4712 intel_disable_iommus();
4716 if (list_empty(&dmar_rmrr_units))
4717 pr_info("No RMRR found\n");
4719 if (list_empty(&dmar_atsr_units))
4720 pr_info("No ATSR found\n");
4722 if (dmar_init_reserved_ranges()) {
4724 panic("tboot: Failed to reserve iommu ranges\n");
4725 goto out_free_reserved_range;
4729 intel_iommu_gfx_mapped = 1;
4731 init_no_remapping_devices();
4736 panic("tboot: Failed to initialize DMARs\n");
4737 pr_err("Initialization failed\n");
4738 goto out_free_reserved_range;
4740 up_write(&dmar_global_lock);
4742 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4745 dma_ops = &intel_dma_ops;
4747 init_iommu_pm_ops();
4749 for_each_active_iommu(iommu, drhd) {
4750 iommu_device_sysfs_add(&iommu->iommu, NULL,
4753 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4754 iommu_device_register(&iommu->iommu);
4757 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4758 if (si_domain && !hw_pass_through)
4759 register_memory_notifier(&intel_iommu_memory_nb);
4760 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4761 intel_iommu_cpu_dead);
4763 down_read(&dmar_global_lock);
4764 if (probe_acpi_namespace_devices())
4765 pr_warn("ACPI name space devices didn't probe correctly\n");
4766 up_read(&dmar_global_lock);
4768 /* Finally, we enable the DMA remapping hardware. */
4769 for_each_iommu(iommu, drhd) {
4770 if (!drhd->ignored && !translation_pre_enabled(iommu))
4771 iommu_enable_translation(iommu);
4773 iommu_disable_protect_mem_regions(iommu);
4775 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4777 intel_iommu_enabled = 1;
4778 intel_iommu_debugfs_init();
4782 out_free_reserved_range:
4783 put_iova_domain(&reserved_iova_list);
4785 intel_iommu_free_dmars();
4786 up_write(&dmar_global_lock);
4787 iommu_exit_mempool();
4791 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4793 struct intel_iommu *iommu = opaque;
4795 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4800 * NB - intel-iommu lacks any sort of reference counting for the users of
4801 * dependent devices. If multiple endpoints have intersecting dependent
4802 * devices, unbinding the driver from any one of them will possibly leave
4803 * the others unable to operate.
4805 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4807 if (!iommu || !dev || !dev_is_pci(dev))
4810 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4813 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4815 struct dmar_domain *domain;
4816 struct intel_iommu *iommu;
4817 unsigned long flags;
4819 assert_spin_locked(&device_domain_lock);
4824 iommu = info->iommu;
4825 domain = info->domain;
4828 if (dev_is_pci(info->dev) && sm_supported(iommu))
4829 intel_pasid_tear_down_entry(iommu, info->dev,
4832 iommu_disable_dev_iotlb(info);
4833 domain_context_clear(iommu, info->dev);
4834 intel_pasid_free_table(info->dev);
4837 unlink_domain_info(info);
4839 spin_lock_irqsave(&iommu->lock, flags);
4840 domain_detach_iommu(domain, iommu);
4841 spin_unlock_irqrestore(&iommu->lock, flags);
4843 /* free the private domain */
4844 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4845 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4846 list_empty(&domain->devices))
4847 domain_exit(info->domain);
4849 free_devinfo_mem(info);
4852 static void dmar_remove_one_dev_info(struct device *dev)
4854 struct device_domain_info *info;
4855 unsigned long flags;
4857 spin_lock_irqsave(&device_domain_lock, flags);
4858 info = dev->archdata.iommu;
4860 __dmar_remove_one_dev_info(info);
4861 spin_unlock_irqrestore(&device_domain_lock, flags);
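/*
 * Initialize a domain created through the IOMMU API: reserve the
 * special ranges and allocate a top-level page directory for the
 * requested guest address width.
 */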
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	switch (type) {
	case IOMMU_DOMAIN_DMA:
	/* fallthrough */
	case IOMMU_DOMAIN_UNMANAGED:
		dmar_domain = alloc_domain(0);
		if (!dmar_domain) {
			pr_err("Can't allocate dmar_domain\n");
			return NULL;
		}
		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
			pr_err("Domain initialization failed\n");
			domain_exit(dmar_domain);
			return NULL;
		}

		if (type == IOMMU_DOMAIN_DMA &&
		    init_iova_flush_queue(&dmar_domain->iovad,
					  iommu_flush_iova, iova_entry_free)) {
			pr_warn("iova flush queue initialization failed\n");
			intel_iommu_strict = 1;
		}

		domain_update_iommu_cap(dmar_domain);

		domain = &dmar_domain->domain;
		domain->geometry.aperture_start = 0;
		domain->geometry.aperture_end   =
				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
		domain->geometry.force_aperture = true;

		break;
	case IOMMU_DOMAIN_IDENTITY:
		return &si_domain->domain;
	default:
		return NULL;
	}

	return domain;
}
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}
/*
 * Check whether a @domain could be attached to the @dev through the
 * aux-domain attach/detach APIs.
 */
static inline bool
is_aux_domain(struct device *dev, struct iommu_domain *domain)
{
	struct device_domain_info *info = dev->archdata.iommu;

	return info && info->auxd_enabled &&
			domain->type == IOMMU_DOMAIN_UNMANAGED;
}
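
/*
 * Link/unlink a domain to/from a device's list of auxiliary domains,
 * keeping the domain's auxiliary reference count in sync.  Both helpers
 * run under device_domain_lock.
 */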
static void auxiliary_link_device(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info = dev->archdata.iommu;

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	domain->auxd_refcnt++;
	list_add(&domain->auxd, &info->auxiliary_domains);
}
static void auxiliary_unlink_device(struct dmar_domain *domain,
				    struct device *dev)
{
	struct device_domain_info *info = dev->archdata.iommu;

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	list_del(&domain->auxd);
	domain->auxd_refcnt--;

	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		intel_pasid_free_id(domain->default_pasid);
}
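
/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to the device's IOMMU
 * under iommu->lock, and install a second-level PASID-table entry for
 * mediated devices before linking the domain to the device.
 */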
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	u8 bus, devfn;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		int pasid;

		pasid = intel_pasid_alloc_id(domain, PASID_MIN,
					     pci_max_pasids(to_pci_dev(dev)),
					     GFP_KERNEL);
		if (pasid <= 0) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	ret = intel_pasid_setup_second_level(iommu, domain, dev,
					     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		intel_pasid_free_id(domain->default_pasid);

	return ret;
}
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	iommu = info->iommu;

	auxiliary_unlink_device(domain, dev);

	spin_lock(&iommu->lock);
	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
	domain_detach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
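
/*
 * Common checks before (aux-)attaching @dev to @domain: make sure the
 * IOMMU's address width can hold everything already mapped in the
 * domain, then shrink the domain's page-table depth to what this
 * hardware unit supports.
 */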
static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	if (is_aux_domain(dev, domain))
		return -EPERM;

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
					 struct device *dev)
{
	int ret;

	if (!is_aux_domain(dev, domain))
		return -EPERM;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return aux_domain_add_dev(to_dmar_domain(domain), dev);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
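
/*
 * IOMMU-API map callback: translate IOMMU_READ/WRITE/CACHE into VT-d PTE
 * bits, grow the domain's max_addr bound (failing if the guest address
 * width cannot cover it), and install the page-table entries.
 */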
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return 0;

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
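
/*
 * The two helpers below answer "do *all* active IOMMUs support X?" --
 * a single unit lacking scalable mode or PASID support disables the
 * corresponding feature system-wide.
 */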
static inline bool scalable_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static inline bool iommu_pasid_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!pasid_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	if (translation_pre_enabled(iommu))
		dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;

	group = iommu_group_get_for_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);

	domain = iommu_get_domain_for_dev(dev);
	dmar_domain = to_dmar_domain(domain);
	if (domain->type == IOMMU_DOMAIN_DMA) {
		if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
			ret = iommu_request_dm_for_dev(dev);
			if (ret) {
				dmar_remove_one_dev_info(dev);
				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
				domain_add_dev_info(si_domain, dev);
				dev_info(dev,
					 "Device uses a private identity domain.\n");
			}
		}
	} else {
		if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
			ret = iommu_request_dma_domain_for_dev(dev);
			if (ret) {
				dmar_remove_one_dev_info(dev);
				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
				if (!get_private_domain_for_dev(dev)) {
					dev_warn(dev,
						 "Failed to get a private domain.\n");
					return -ENOMEM;
				}

				dev_info(dev,
					 "Device uses a private dma domain.\n");
			}
		}
	}

	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	dmar_remove_one_dev_info(dev);

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
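
/*
 * Report this device's reserved regions: any RMRRs that cover it (direct
 * mappings the firmware requires), the low 16MiB ISA range when the
 * floppy workaround is configured, and the IOAPIC MSI window.
 */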
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
						IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list)
		kfree(entry);
}
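
/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if the entry changed) and turn on
 * the device's PASID/ATS capabilities via iommu_enable_dev_iotlb().
 */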
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
static void intel_iommu_apply_resv_region(struct device *dev,
					  struct iommu_domain *domain,
					  struct iommu_resv_region *region)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end   = IOVA_PFN(region->start + region->length - 1);

	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
}
#ifdef CONFIG_INTEL_IOMMU_SVM
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
/*
 * A PCI express designated vendor specific extended capability is defined
 * in the section 3.7 of Intel scalable I/O virtualization technical spec
 * for system software and tools to detect endpoint devices supporting the
 * Intel scalable IO virtualization without host driver dependency.
 *
 * Returns the address of the matching extended capability structure within
 * the device's PCI configuration space or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}
static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	return false;
}
static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	return -ENODEV;
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}
static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = dev->archdata.iommu;

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}
static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}
static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
}
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= pci_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
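
/*
 * This table is registered for the PCI bus from intel_iommu_init() above
 * via bus_set_iommu(&pci_bus_type, &intel_iommu_ops); the generic IOMMU
 * layer then dispatches iommu_map()/iommu_unmap() and friends here.
 */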
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",