// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */
#define pr_fmt(fmt)     "DMAR: " fmt
#define dev_fmt(fmt)    pr_fmt(fmt)

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
#define ROOT_SIZE		VTD_PAGE_SIZE
#define CONTEXT_SIZE		VTD_PAGE_SIZE

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57

#define MAX_AGAW_WIDTH 64
#define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)

#define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
   to match. That way, we can use 'unsigned long' for PFNs with impunity. */
#define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
#define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
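
/*
 * Worked example (editor's illustration, not from the original source):
 * with gaw = 48 and VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) is
 * (1ULL << 36) - 1 and __DOMAIN_MAX_ADDR(48) is (1ULL << 48) - 1. On a
 * 32-bit kernel, DOMAIN_MAX_PFN(48) clamps to ULONG_MAX, so PFN
 * arithmetic stays within 'unsigned long' as the comment above intends.
 */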
/* IO virtual address start page frame number */
#define IOVA_START_PFN		(1)

#define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
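
/*
 * Illustration (editor's note, not part of the original source): ~0xFFFUL
 * clears bits 0-11 and sets bits 12 and up, i.e. it advertises 4KiB,
 * 8KiB, 16KiB, ... every power-of-two multiple of 4KiB, rather than only
 * the sizes (4KiB/2MiB/1GiB) the hardware can encode in a single PTE.
 */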
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
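
/*
 * Illustration (editor's note): with 4KiB MM pages, PAGE_SHIFT equals
 * VTD_PAGE_SHIFT and both conversions are no-op shifts by 0. On e.g. a
 * 64KiB-page kernel they shift by 4, so one MM PFN covers 16 DMA PFNs.
 */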
/* global iommu list, set NULL for ignored DMAR units */
static struct intel_iommu **g_iommus;

static void __init check_tylersburg_isoch(void);
static int rwbf_quirk;

/*
 * set to 1 to panic kernel if can't successfully enable VT-d
 * (used when kernel is launched w/ TXT)
 */
static int force_on = 0;
int intel_iommu_tboot_noforce;
static int no_platform_optin;

#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}

static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return (c->hi >> 8) & 0xffff;
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY		BIT(0)

/*
 * This is a DMA domain allocated through the iommu domain allocation
 * interface. But one or more devices belonging to this domain have
 * been chosen to use a private domain. We should avoid using the
 * map/unmap/iova_to_phys APIs on it.
 */
#define DOMAIN_FLAG_LOSE_CHILDREN		BIT(1)
#define for_each_domain_iommu(idx, domain)			\
	for (idx = 0; idx < g_num_of_iommus; idx++)		\
		if (domain->iommu_refcnt[idx])

struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};

static LIST_HEAD(dmar_atsr_units);
static LIST_HEAD(dmar_rmrr_units);

#define for_each_rmrr_units(rmrr) \
	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
/* number of registered IOMMUs; used to index and size per-IOMMU arrays */
static int g_num_of_iommus;

static void domain_exit(struct dmar_domain *domain);
static void domain_remove_dev_info(struct dmar_domain *domain);
static void dmar_remove_one_dev_info(struct device *dev);
static void __dmar_remove_one_dev_info(struct device_domain_info *info);
static void domain_context_clear(struct intel_iommu *iommu,
				 struct device *dev);
static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu);
static bool device_is_rmrr_locked(struct device *dev);
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;
static int dmar_forcedac;
static int intel_iommu_strict;
static int intel_iommu_superpage = 1;
static int iommu_identity_mapping;
static int intel_iommu_sm;

#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
#define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);
/*
 * Iterate over elements in device_domain_list and call the specified
 * callback @fn against each element.
 */
int for_each_device_domain(int (*fn)(struct device_domain_info *info,
				     void *data), void *data)
{
	int ret = 0;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &device_domain_list, global) {
		ret = fn(info, data);
		if (ret) {
			spin_unlock_irqrestore(&device_domain_lock, flags);
			return ret;
		}
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
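
#if 0
/*
 * Usage sketch (editor's illustration, not part of the driver): a
 * hypothetical callback that counts the registered entries. A non-zero
 * return from the callback stops the iteration early, as the loop
 * above shows; returning 0 keeps iterating.
 */
static int count_dev_domain_info(struct device_domain_info *info, void *data)
{
	int *count = data;

	(*count)++;
	return 0;	/* 0 means keep iterating */
}
/* ... int count = 0; for_each_device_domain(count_dev_domain_info, &count); ... */
#endif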
const struct iommu_ops intel_iommu_ops;

static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Convert a generic struct iommu_domain to the private struct dmar_domain */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			pr_info("IOMMU enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			no_platform_optin = 1;
			pr_info("IOMMU disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			pr_info("Disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			pr_info("Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			pr_info("Disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super page\n");
			intel_iommu_superpage = 0;
		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Intel-IOMMU: scalable mode supported\n");
			intel_iommu_sm = 1;
		} else if (!strncmp(str, "tboot_noforce", 13)) {
			printk(KERN_INFO
				"Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
			intel_iommu_tboot_noforce = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;

static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	domains = iommu->domains[idx];
	if (!domains)
		return NULL;

	return domains[did & 0xff];
}

static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
			     struct dmar_domain *domain)
{
	struct dmar_domain **domains;
	int idx = did >> 8;

	if (!iommu->domains[idx]) {
		size_t size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
	}

	domains = iommu->domains[idx];
	if (WARN_ON(!domains))
		return;
	else
		domains[did & 0xff] = domain;
}
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void *alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_pfn_supported(struct dmar_domain *domain,
					unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, so use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
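
/*
 * Worked example (editor's note): with LEVEL_STRIDE = 9, a 48-bit
 * address width gives width_to_agaw(48) = DIV_ROUND_UP(18, 9) = 2,
 * i.e. AGAW 2 (a 4-level table, since agaw_to_level(2) = 4), and
 * agaw_to_width(2) = 30 + 18 = 48 round-trips back.
 */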
/* This function only returns the single iommu in a domain */
struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
		return NULL;

	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}

static int domain_update_iommu_snooping(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int ret = 1;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			if (!ecap_sc_support(iommu->ecap)) {
				ret = 0;
				break;
			}
		}
	}
	rcu_read_unlock();

	return ret;
}

static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			mask &= cap_super_page_val(iommu->cap);
			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}

/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (sm_supported(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}

static int iommu_dummy(struct device *dev)
{
	return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
}
/**
 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
 *				 sub-hierarchy of a candidate PCI-PCI bridge
 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
 * @bridge: the candidate PCI-PCI bridge
 *
 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
 */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
{
	struct pci_dev *pdev, *pbridge;

	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
		return false;

	pdev = to_pci_dev(dev);
	pbridge = to_pci_dev(bridge);

	if (pbridge->subordinate &&
	    pbridge->subordinate->number <= pdev->bus->number &&
	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
		return true;

	return false;
}
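
/*
 * Example (editor's illustration): if @bridge's subordinate bus range
 * is [0x03, 0x07] and @dev sits on bus 0x05, the range check above
 * succeeds and @dev is considered part of @bridge's sub-hierarchy.
 */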
static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	struct intel_iommu *iommu;
	struct device *tmp;
	struct pci_dev *pdev = NULL;
	u16 segment = 0;
	int i;

	if (iommu_dummy(dev))
		return NULL;

	if (dev_is_pci(dev)) {
		struct pci_dev *pf_pdev;

		pdev = to_pci_dev(dev);

#ifdef CONFIG_X86
		/* VMD child devices currently cannot be handled individually */
		if (is_vmd(pdev->bus))
			return NULL;
#endif

		/* VFs aren't listed in scope tables; we need to look up
		 * the PF instead to find the IOMMU. */
		pf_pdev = pci_physfn(pdev);
		dev = &pf_pdev->dev;
		segment = pci_domain_nr(pdev->bus);
	} else if (has_acpi_companion(dev))
		dev = &ACPI_COMPANION(dev)->dev;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (pdev && segment != drhd->segment)
			continue;

		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, tmp) {
			if (tmp == dev) {
				/* For a VF use its original BDF# not that of the PF
				 * which we used for the IOMMU lookup. Strictly speaking
				 * we could do this for all PCI devices; we only need to
				 * get the BDF# from the scope table for ACPI matches. */
				if (pdev && pdev->is_virtfn)
					goto got_pdev;

				*bus = drhd->devices[i].bus;
				*devfn = drhd->devices[i].devfn;
				goto out;
			}

			if (is_downstream_to_pci_bridge(dev, tmp))
				goto got_pdev;
		}

		if (pdev && drhd->include_all) {
		got_pdev:
			*bus = pdev->bus->number;
			*devfn = pdev->devfn;
			goto out;
		}
	}
	iommu = NULL;
 out:
	rcu_read_unlock();

	return iommu;
}
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct context_entry *context;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	context = iommu_context_addr(iommu, bus, devfn, 0);
	if (context)
		ret = context_present(context);
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void free_context_table(struct intel_iommu *iommu)
{
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry) {
		goto out;
	}
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		context = iommu_context_addr(iommu, i, 0, 0);
		if (context)
			free_pgtable_page(context);

		if (!sm_supported(iommu))
			continue;

		context = iommu_context_addr(iommu, i, 0x80, 0);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
{
	struct dma_pte *parent, *pte;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);

	if (!domain_pfn_supported(domain, pfn))
		/* Address beyond IOMMU's addressing capabilities. */
		return NULL;

	parent = domain->pgd;

	while (1) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == *target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);
			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval))
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			else
				domain_flush_cache(domain, pte, sizeof(*pte));
		}
		if (level == 1)
			break;

		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	if (!*target_level)
		*target_level = level;

	return pte;
}
/* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (dma_pte_superpage(pte)) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
/* clear last level pte; must be followed by a TLB flush */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	unsigned int large_page;
	struct dma_pte *first_pte, *pte;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);
}
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
{
	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;
		struct dma_pte *level_pte;

		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
			goto next;

		level_pfn = pfn & level_mask(level);
		level_pte = phys_to_virt(dma_pte_addr(pte));

		if (level > 2) {
			dma_pte_free_level(domain, level - 1, retain_level,
					   level_pte, level_pfn, start_pfn,
					   last_pfn);
		}

		/*
		 * Free the page table if we're below the level we want to
		 * retain and the range covers the entire table.
		 */
		if (level < retain_level && !(start_pfn > level_pfn ||
		      last_pfn < level_pfn + level_size(level) - 1)) {
			dma_clear_pte(pte);
			domain_flush_cache(domain, pte, sizeof(*pte));
			free_pgtable_page(level_pte);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
}

/*
 * clear last level (leaf) ptes and free page table pages below the
 * level we wish to keep intact.
 */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
{
	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
			   domain->pgd, 0, start_pfn, last_pfn);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
					    int level, struct dma_pte *pte,
					    struct page *freelist)
{
	struct page *pg;

	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
	pg->freelist = freelist;
	freelist = pg;

	if (level == 1)
		return freelist;

	pte = page_address(pg);
	do {
		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
			freelist = dma_pte_list_pagetables(domain, level - 1,
							   pte, freelist);
		pte++;
	} while (!first_pte_in_page(pte));

	return freelist;
}
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
					struct dma_pte *pte, unsigned long pfn,
					unsigned long start_pfn,
					unsigned long last_pfn,
					struct page *freelist)
{
	struct dma_pte *first_pte = NULL, *last_pte = NULL;

	pfn = max(start_pfn, pfn);
	pte = &pte[pfn_level_offset(pfn, level)];

	do {
		unsigned long level_pfn;

		if (!dma_pte_present(pte))
			goto next;

		level_pfn = pfn & level_mask(level);

		/* If range covers entire pagetable, free it */
		if (start_pfn <= level_pfn &&
		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
			if (level > 1 && !dma_pte_superpage(pte))
				freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

			dma_clear_pte(pte);
			if (!first_pte)
				first_pte = pte;
			last_pte = pte;
		} else if (level > 1) {
			/* Recurse down into a level that isn't *entirely* obsolete */
			freelist = dma_pte_clear_level(domain, level - 1,
						       phys_to_virt(dma_pte_addr(pte)),
						       level_pfn, start_pfn, last_pfn,
						       freelist);
		}
next:
		pfn += level_size(level);
	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);

	if (first_pte)
		domain_flush_cache(domain, first_pte,
				   (void *)++last_pte - (void *)first_pte);

	return freelist;
}
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
				 unsigned long start_pfn,
				 unsigned long last_pfn)
{
	struct page *freelist;

	BUG_ON(!domain_pfn_supported(domain, start_pfn));
	BUG_ON(!domain_pfn_supported(domain, last_pfn));
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
				       domain->pgd, 0, start_pfn, last_pfn, NULL);

	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		struct page *pgd_page = virt_to_page(domain->pgd);
		pgd_page->freelist = freelist;
		freelist = pgd_page;

		domain->pgd = NULL;
	}

	return freelist;
}

static void dma_free_pagelist(struct page *freelist)
{
	struct page *pg;

	while ((pg = freelist)) {
		freelist = pg->freelist;
		free_pgtable_page(page_address(pg));
	}
}

static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
	if (!root) {
		pr_err("Allocating root entry for %s failed\n",
			iommu->name);
		return -ENOMEM;
	}

	__iommu_flush_cache(iommu, root, ROOT_SIZE);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	u64 addr;
	u32 sts;
	unsigned long flag;

	addr = virt_to_phys(iommu->root_entry);
	if (sm_supported(iommu))
		addr |= DMA_RTADDR_SMT;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}

/* return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		      dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
/* return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* a global flush doesn't need to set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* IH bit is passed in as part of address */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably meant to be extra secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		      dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		pr_err("Flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("TLB flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
static struct device_domain_info *
iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
			u8 bus, u8 devfn)
{
	struct device_domain_info *info;

	assert_spin_locked(&device_domain_lock);

	if (!iommu->qi)
		return NULL;

	list_for_each_entry(info, &domain->devices, link)
		if (info->iommu == iommu && info->bus == bus &&
		    info->devfn == devfn) {
			if (info->ats_supported && info->dev)
				return info;
			break;
		}

	return NULL;
}

static void domain_update_iotlb(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	bool has_iotlb_device = false;

	assert_spin_locked(&device_domain_lock);

	list_for_each_entry(info, &domain->devices, link) {
		struct pci_dev *pdev;

		if (!info->dev || !dev_is_pci(info->dev))
			continue;

		pdev = to_pci_dev(info->dev);
		if (pdev->ats_enabled) {
			has_iotlb_device = true;
			break;
		}
	}

	domain->has_iotlb_device = has_iotlb_device;
}
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!info || !dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);
	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
	if (!ecap_dit(info->iommu->ecap))
		info->pfsid = 0;
	else {
		struct pci_dev *pf_pdev;

		/* pdev will be returned if device is not a vf */
		pf_pdev = pci_physfn(pdev);
		info->pfsid = pci_dev_id(pf_pdev);
	}

#ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
		info->pasid_enabled = 1;

	if (info->pri_supported &&
	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
		info->pri_enabled = 1;
#endif
	if (!pdev->untrusted && info->ats_supported &&
	    pci_ats_page_aligned(pdev) &&
	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
		info->ats_enabled = 1;
		domain_update_iotlb(info->domain);
		info->ats_qdep = pci_ats_queue_depth(pdev);
	}
}
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
{
	struct pci_dev *pdev;

	assert_spin_locked(&device_domain_lock);

	if (!dev_is_pci(info->dev))
		return;

	pdev = to_pci_dev(info->dev);

	if (info->ats_enabled) {
		pci_disable_ats(pdev);
		info->ats_enabled = 0;
		domain_update_iotlb(info->domain);
	}
#ifdef CONFIG_INTEL_IOMMU_SVM
	if (info->pri_enabled) {
		pci_disable_pri(pdev);
		info->pri_enabled = 0;
	}
	if (info->pasid_enabled) {
		pci_disable_pasid(pdev);
		info->pasid_enabled = 0;
	}
#endif
}

static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	if (!domain->has_iotlb_device)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->ats_enabled)
			continue;

		sid = info->bus << 8 | info->devfn;
		qdep = info->ats_qdep;
		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
				qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
{
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
	u16 did = domain->iommu_did[iommu->seq_id];

	BUG_ON(pages == 0);

	if (ih)
		ih = 1 << 6;
	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size.
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(domain, addr, mask);
}
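
/*
 * Worked example (editor's note): flushing pages = 9 gives
 * mask = ilog2(__roundup_pow_of_two(9)) = ilog2(16) = 4, so the PSI
 * covers 2^4 = 16 VTD pages naturally aligned around @pfn; the extra
 * pages are over-invalidated, which is safe but slightly coarser.
 */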
/* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
{
	/* It's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
	else
		iommu_flush_write_buffer(iommu);
}

static void iommu_flush_iova(struct iova_domain *iovad)
{
	struct dmar_domain *domain;
	int idx;

	domain = container_of(iovad, struct dmar_domain, iovad);

	for_each_domain_iommu(idx, domain) {
		struct intel_iommu *iommu = g_iommus[idx];
		u16 did = domain->iommu_did[iommu->seq_id];

		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);

		if (!cap_caching_mode(iommu->cap))
			iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
					      0, MAX_AGAW_PFN_WIDTH);
	}
}
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}

static void iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware completes it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
static int iommu_init_domains(struct intel_iommu *iommu)
{
	u32 ndomains, nlongs;
	size_t size;

	ndomains = cap_ndoms(iommu->cap);
	pr_debug("%s: Number of Domains supported <%d>\n",
		 iommu->name, ndomains);
	nlongs = BITS_TO_LONGS(ndomains);

	spin_lock_init(&iommu->lock);

	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
	if (!iommu->domain_ids) {
		pr_err("%s: Allocating domain id array failed\n",
		       iommu->name);
		return -ENOMEM;
	}

	size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
	iommu->domains = kzalloc(size, GFP_KERNEL);

	if (iommu->domains) {
		size = 256 * sizeof(struct dmar_domain *);
		iommu->domains[0] = kzalloc(size, GFP_KERNEL);
	}

	if (!iommu->domains || !iommu->domains[0]) {
		pr_err("%s: Allocating domain array failed\n",
		       iommu->name);
		kfree(iommu->domain_ids);
		kfree(iommu->domains);
		iommu->domain_ids = NULL;
		iommu->domains    = NULL;
		return -ENOMEM;
	}

	/*
	 * If Caching mode is set, then invalid translations are tagged
	 * with domain-id 0, hence we need to pre-allocate it. We also
	 * use domain-id 0 as a marker for non-allocated domain-id, so
	 * make sure it is not used for a real domain.
	 */
	set_bit(0, iommu->domain_ids);

	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
	if (sm_supported(iommu))
		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);

	return 0;
}
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	if (!iommu->domains || !iommu->domain_ids)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
		if (info->iommu != iommu)
			continue;

		if (!info->dev || !info->domain)
			continue;

		__dmar_remove_one_dev_info(info);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);
}

static void free_dmar_iommu(struct intel_iommu *iommu)
{
	if ((iommu->domains) && (iommu->domain_ids)) {
		int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
		int i;

		for (i = 0; i < elems; i++)
			kfree(iommu->domains[i]);
		kfree(iommu->domains);
		kfree(iommu->domain_ids);
		iommu->domains = NULL;
		iommu->domain_ids = NULL;
	}

	g_iommus[iommu->seq_id] = NULL;

	/* free context mapping */
	free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_supported(iommu)) {
		if (ecap_prs(iommu->ecap))
			intel_svm_finish_prq(iommu);
	}
#endif
}
static struct dmar_domain *alloc_domain(int flags)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	memset(domain, 0, sizeof(*domain));
	domain->nid = NUMA_NO_NODE;
	domain->flags = flags;
	domain->has_iotlb_device = false;
	INIT_LIST_HEAD(&domain->devices);

	return domain;
}

/* Must be called with iommu->lock */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num      = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid			 = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}

static int domain_detach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num, count;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] -= 1;
	count = --domain->iommu_count;
	if (domain->iommu_refcnt[iommu->seq_id] == 0) {
		num = domain->iommu_did[iommu->seq_id];
		clear_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, NULL);

		domain_update_iommu_cap(domain);
		domain->iommu_did[iommu->seq_id] = 0;
	}

	return count;
}
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;

static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);

	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		pr_err("Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				pci_err(pdev, "Reserve iova for %pR failed\n", r);
				return -ENODEV;
			}
		}
	}
	return 0;
}

static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}

static inline int guestwidth_to_adjustwidth(int gaw)
{
	int agaw;
	int r = (gaw - 12) % 9;

	if (r == 0)
		agaw = gaw;
	else
		agaw = gaw + 9 - r;
	if (agaw > 64)
		agaw = 64;
	return agaw;
}
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
		       int guest_width)
{
	int adjust_width, agaw;
	unsigned long sagaw;
	int err;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);

	err = init_iova_flush_queue(&domain->iovad,
				    iommu_flush_iova, iova_entry_free);
	if (err)
		return err;

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("Hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}

static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist;

	/* Remove associated devices and clear attached or cached domains */
	domain_remove_dev_info(domain);

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
/*
 * Get the PASID directory size for scalable mode context entry.
 * Value of X in the PDTS field of a scalable mode context entry
 * indicates PASID directory with 2^(X + 7) entries.
 */
static inline unsigned long context_get_sm_pds(struct pasid_table *table)
{
	int pds, max_pde;

	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
	pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
	if (pds < 7)
		return 0;

	return pds - 7;
}
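
/*
 * Worked example (editor's note): for max_pasid = 1 << 20 and
 * PASID_PDE_SHIFT = 6, max_pde = 1 << 14, so pds = 14 and the value
 * returned is 14 - 7 = 7, i.e. the PDTS coding for a directory of
 * 2^(7 + 7) = 2^14 PASID directory entries.
 */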
/*
 * Set the RID_PASID field of a scalable mode context entry. The
 * IOMMU hardware will use the PASID value set in this field for
 * DMA translations of DMA requests without PASID.
 */
static inline void
context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
{
	context->hi |= pasid & ((1 << 20) - 1);
	context->hi |= (1 << 20);
}

/*
 * Set the DTE(Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE(Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}

/* Convert value to context PASID directory size field coding. */
#define context_pdts(pds)	(((pds) & 0x7) << 9)
static int domain_context_mapping_one(struct dmar_domain *domain,
				      struct intel_iommu *iommu,
				      struct pasid_table *table,
				      u8 bus, u8 devfn)
{
	u16 did = domain->iommu_did[iommu->seq_id];
	int translation = CONTEXT_TT_MULTI_LEVEL;
	struct device_domain_info *info = NULL;
	struct context_entry *context;
	unsigned long flags;
	int ret;

	WARN_ON(did == 0);

	if (hw_pass_through && domain_type_is_si(domain))
		translation = CONTEXT_TT_PASS_THROUGH;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -ENOMEM;
	context = iommu_context_addr(iommu, bus, devfn, 1);
	if (!context)
		goto out_unlock;

	ret = 0;
	if (context_present(context))
		goto out_unlock;

	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
	if (context_copied(context)) {
		u16 did_old = context_domain_id(context);

		if (did_old < cap_ndoms(iommu->cap)) {
			iommu->flush.flush_context(iommu, did_old,
						   (((u16)bus) << 8) | devfn,
						   DMA_CCMD_MASK_NOBIT,
						   DMA_CCMD_DEVICE_INVL);
			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
						 DMA_TLB_DSI_FLUSH);
		}
	}

	context_clear_entry(context);

	if (sm_supported(iommu)) {
		unsigned long pds;

		WARN_ON(!table);

		/* Setup the PASID DIR pointer: */
		pds = context_get_sm_pds(table);
		context->lo = (u64)virt_to_phys(table->table) |
				context_pdts(pds);

		/* Setup the RID_PASID field: */
		context_set_sm_rid2pasid(context, PASID_RID2PASID);

		/*
		 * Setup the Device-TLB enable bit and Page request
		 * Enable bit:
		 */
		info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
		if (info && info->ats_supported)
			context_set_sm_dte(context);
		if (info && info->pri_supported)
			context_set_sm_pre(context);
	} else {
		struct dma_pte *pgd = domain->pgd;
		int agaw;

		context_set_domain_id(context, did);

		if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for iommu which has
			 * less agaw than default. Unnecessary for PT mode.
			 */
			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
				ret = -ENOMEM;
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd))
					goto out_unlock;
			}

			info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
			if (info && info->ats_supported)
				translation = CONTEXT_TT_DEV_IOTLB;
			else
				translation = CONTEXT_TT_MULTI_LEVEL;

			context_set_address_root(context, virt_to_phys(pgd));
			context_set_address_width(context, agaw);
		} else {
			/*
			 * In pass through mode, AW must be programmed to
			 * indicate the largest AGAW value supported by
			 * hardware. And ASR is ignored by hardware.
			 */
			context_set_address_width(context, iommu->msagaw);
		}

		context_set_translation_type(context, translation);
	}

	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);

	ret = 0;

out_unlock:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
	struct pasid_table *table;
};

static int domain_context_mapping_cb(struct pci_dev *pdev,
				     u16 alias, void *opaque)
{
	struct domain_context_mapping_data *data = opaque;

	return domain_context_mapping_one(data->domain, data->iommu,
					  data->table, PCI_BUS_NUM(alias),
					  alias & 0xff);
}

static int
domain_context_mapping(struct dmar_domain *domain, struct device *dev)
{
	struct domain_context_mapping_data data;
	struct pasid_table *table;
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	table = intel_pasid_get_table(dev);

	if (!dev_is_pci(dev))
		return domain_context_mapping_one(domain, iommu, table,
						  bus, devfn);

	data.domain = domain;
	data.iommu = iommu;
	data.table = table;

	return pci_for_each_dma_alias(to_pci_dev(dev),
				      &domain_context_mapping_cb, &data);
}

static int domain_context_mapped_cb(struct pci_dev *pdev,
				    u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
}

static int domain_context_mapped(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (!dev_is_pci(dev))
		return device_context_mapped(iommu, bus, devfn);

	return !pci_for_each_dma_alias(to_pci_dev(dev),
				       domain_context_mapped_cb, iommu);
}
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
{
	host_addr &= ~PAGE_MASK;
	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
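
/*
 * Worked example (editor's note): with 4KiB pages, a host_addr offset
 * of 0x200 and size 0x1000 give PAGE_ALIGN(0x1200) = 0x2000, i.e. 2
 * VTD pages; the unaligned head and tail each consume part of a page.
 */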
/* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
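
/*
 * Example (editor's note): if both iov_pfn and phy_pfn are multiples
 * of 512 (2MiB-aligned with 4KiB pages), at least 512 pages are being
 * mapped, and the domain supports one superpage level, the loop above
 * exits with level = 2, i.e. a 2MiB superpage can be used.
 */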
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			unsigned int pgoff = sg->offset & ~PAGE_MASK;

			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
			sg->dma_length = sg->length;
			pteval = (sg_phys(sg) - pgoff) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 * We're adding new large pages, so make sure
				 * we don't remove their parent tables.
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
						       largepage_lvl + 1);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			  struct scatterlist *sg, unsigned long phys_pfn,
			  unsigned long nr_pages, int prot)
{
	int iommu_id, ret;
	struct intel_iommu *iommu;

	/* Do the real mapping first */
	ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
	if (ret)
		return ret;

	for_each_domain_iommu(iommu_id, domain) {
		iommu = g_iommus[iommu_id];
		__mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
	}

	return 0;
}

static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}

static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2359 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2361 unsigned long flags;
2362 struct context_entry *context;
2368 spin_lock_irqsave(&iommu->lock, flags);
2369 context = iommu_context_addr(iommu, bus, devfn, 0);
2371 spin_unlock_irqrestore(&iommu->lock, flags);
2374 did_old = context_domain_id(context);
2375 context_clear_entry(context);
2376 __iommu_flush_cache(iommu, context, sizeof(*context));
2377 spin_unlock_irqrestore(&iommu->lock, flags);
2378 iommu->flush.flush_context(iommu,
2380 (((u16)bus) << 8) | devfn,
2381 DMA_CCMD_MASK_NOBIT,
2382 DMA_CCMD_DEVICE_INVL);
2383 iommu->flush.flush_iotlb(iommu,
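/*
 * Illustrative sketch (not part of the driver): the source-id passed
 * to the context-cache flush above is simply the PCI bus and devfn
 * packed into 16 bits, bus in the high byte.
 */
static inline u16 example_source_id(u8 bus, u8 devfn)
{
	return ((u16)bus << 8) | devfn;
}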
2390 static inline void unlink_domain_info(struct device_domain_info *info)
2392 assert_spin_locked(&device_domain_lock);
2393 list_del(&info->link);
2394 list_del(&info->global);
2395 if (info->dev)
2396 info->dev->archdata.iommu = NULL;
2399 static void domain_remove_dev_info(struct dmar_domain *domain)
2401 struct device_domain_info *info, *tmp;
2402 unsigned long flags;
2404 spin_lock_irqsave(&device_domain_lock, flags);
2405 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2406 __dmar_remove_one_dev_info(info);
2407 spin_unlock_irqrestore(&device_domain_lock, flags);
2412 * Note: struct device->archdata.iommu stores the per-device info
2414 static struct dmar_domain *find_domain(struct device *dev)
2416 struct device_domain_info *info;
2418 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2419 struct iommu_domain *domain;
2421 dev->archdata.iommu = NULL;
2422 domain = iommu_get_domain_for_dev(dev);
2424 intel_iommu_attach_device(domain, dev);
2427 /* No lock here, assumes no domain exit in normal case */
2428 info = dev->archdata.iommu;
2431 return info->domain;
2435 static inline struct device_domain_info *
2436 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2438 struct device_domain_info *info;
2440 list_for_each_entry(info, &device_domain_list, global)
2441 if (info->iommu->segment == segment && info->bus == bus &&
2442 info->devfn == devfn)
2448 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451 struct dmar_domain *domain)
2453 struct dmar_domain *found = NULL;
2454 struct device_domain_info *info;
2455 unsigned long flags;
2458 info = alloc_devinfo_mem();
2463 info->devfn = devfn;
2464 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2465 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468 info->domain = domain;
2469 info->iommu = iommu;
2470 info->pasid_table = NULL;
2471 info->auxd_enabled = 0;
2472 INIT_LIST_HEAD(&info->auxiliary_domains);
2474 if (dev && dev_is_pci(dev)) {
2475 struct pci_dev *pdev = to_pci_dev(info->dev);
2477 if (!pdev->untrusted &&
2478 !pci_ats_disabled() &&
2479 ecap_dev_iotlb_support(iommu->ecap) &&
2480 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2481 dmar_find_matched_atsr_unit(pdev))
2482 info->ats_supported = 1;
2484 if (sm_supported(iommu)) {
2485 if (pasid_supported(iommu)) {
2486 int features = pci_pasid_features(pdev);
2488 info->pasid_supported = features | 1;
2491 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2492 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2493 info->pri_supported = 1;
2497 spin_lock_irqsave(&device_domain_lock, flags);
2499 found = find_domain(dev);
2502 struct device_domain_info *info2;
2503 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2505 found = info2->domain;
2511 spin_unlock_irqrestore(&device_domain_lock, flags);
2512 free_devinfo_mem(info);
2513 /* Caller must free the original domain */
2517 spin_lock(&iommu->lock);
2518 ret = domain_attach_iommu(domain, iommu);
2519 spin_unlock(&iommu->lock);
2522 spin_unlock_irqrestore(&device_domain_lock, flags);
2523 free_devinfo_mem(info);
2527 list_add(&info->link, &domain->devices);
2528 list_add(&info->global, &device_domain_list);
2530 dev->archdata.iommu = info;
2531 spin_unlock_irqrestore(&device_domain_lock, flags);
2533 /* PASID table is mandatory for a PCI device in scalable mode. */
2534 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2535 ret = intel_pasid_alloc_table(dev);
2537 dev_err(dev, "PASID table allocation failed\n");
2538 dmar_remove_one_dev_info(dev);
2542 /* Setup the PASID entry for requests without PASID: */
2543 spin_lock(&iommu->lock);
2544 if (hw_pass_through && domain_type_is_si(domain))
2545 ret = intel_pasid_setup_pass_through(iommu, domain,
2546 dev, PASID_RID2PASID);
2548 ret = intel_pasid_setup_second_level(iommu, domain,
2549 dev, PASID_RID2PASID);
2550 spin_unlock(&iommu->lock);
2552 dev_err(dev, "Setup RID2PASID failed\n");
2553 dmar_remove_one_dev_info(dev);
2558 if (dev && domain_context_mapping(domain, dev)) {
2559 dev_err(dev, "Domain context map failed\n");
2560 dmar_remove_one_dev_info(dev);
2567 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2569 *(u16 *)opaque = alias;
2573 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2575 struct device_domain_info *info;
2576 struct dmar_domain *domain = NULL;
2577 struct intel_iommu *iommu;
2579 unsigned long flags;
2582 iommu = device_to_iommu(dev, &bus, &devfn);
2586 if (dev_is_pci(dev)) {
2587 struct pci_dev *pdev = to_pci_dev(dev);
2589 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2591 spin_lock_irqsave(&device_domain_lock, flags);
2592 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2593 PCI_BUS_NUM(dma_alias),
2596 iommu = info->iommu;
2597 domain = info->domain;
2599 spin_unlock_irqrestore(&device_domain_lock, flags);
2601 /* DMA alias already has a domain, use it */
2606 /* Allocate and initialize new domain for the device */
2607 domain = alloc_domain(0);
2610 if (domain_init(domain, iommu, gaw)) {
2611 domain_exit(domain);
2619 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2620 struct dmar_domain *domain)
2622 struct intel_iommu *iommu;
2623 struct dmar_domain *tmp;
2624 u16 req_id, dma_alias;
2627 iommu = device_to_iommu(dev, &bus, &devfn);
2631 req_id = ((u16)bus << 8) | devfn;
2633 if (dev_is_pci(dev)) {
2634 struct pci_dev *pdev = to_pci_dev(dev);
2636 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2638 /* register PCI DMA alias device */
2639 if (req_id != dma_alias) {
2640 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2641 dma_alias & 0xff, NULL, domain);
2643 if (!tmp || tmp != domain)
2648 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2649 if (!tmp || tmp != domain)
2655 static int iommu_domain_identity_map(struct dmar_domain *domain,
2656 unsigned long long start,
2657 unsigned long long end)
2659 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2660 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2662 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2663 dma_to_mm_pfn(last_vpfn))) {
2664 pr_err("Reserving iova failed\n");
2668 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2670 * RMRR range might overlap with the physical memory range; clear it first.
2673 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2675 return __domain_mapping(domain, first_vpfn, NULL,
2676 first_vpfn, last_vpfn - first_vpfn + 1,
2677 DMA_PTE_READ|DMA_PTE_WRITE);
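/*
 * Illustrative sketch (not part of the driver): an identity mapping of
 * [start, end] covers the inclusive VT-d PFN range below; the page
 * count is what gets handed to __domain_mapping() above.
 */
static inline unsigned long example_identity_map_nrpages(u64 start, u64 end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	return last_vpfn - first_vpfn + 1;
}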
2680 static int domain_prepare_identity_map(struct device *dev,
2681 struct dmar_domain *domain,
2682 unsigned long long start,
2683 unsigned long long end)
2685 /* For _hardware_ passthrough, don't bother. But for software
2686 passthrough, we do it anyway -- it may indicate a memory
2687 range which is reserved in E820, and so didn't get set
2688 up to start with in si_domain */
2689 if (domain == si_domain && hw_pass_through) {
2690 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2695 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2698 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2699 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2700 dmi_get_system_info(DMI_BIOS_VENDOR),
2701 dmi_get_system_info(DMI_BIOS_VERSION),
2702 dmi_get_system_info(DMI_PRODUCT_VERSION));
2706 if (end >> agaw_to_width(domain->agaw)) {
2707 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2708 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2709 agaw_to_width(domain->agaw),
2710 dmi_get_system_info(DMI_BIOS_VENDOR),
2711 dmi_get_system_info(DMI_BIOS_VERSION),
2712 dmi_get_system_info(DMI_PRODUCT_VERSION));
2716 return iommu_domain_identity_map(domain, start, end);
2719 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2721 static int __init si_domain_init(int hw)
2723 struct dmar_rmrr_unit *rmrr;
2727 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2731 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2732 domain_exit(si_domain);
2739 for_each_online_node(nid) {
2740 unsigned long start_pfn, end_pfn;
2743 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2744 ret = iommu_domain_identity_map(si_domain,
2745 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2752 * Normally we use DMA domains for devices which have RMRRs. But we
2753 * lose this requirement for graphics and USB devices. Identity-map
2754 * the RMRRs for graphics and USB devices so that they can use the
2757 for_each_rmrr_units(rmrr) {
2758 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2760 unsigned long long start = rmrr->base_address;
2761 unsigned long long end = rmrr->end_address;
2763 if (device_is_rmrr_locked(dev))
2766 if (WARN_ON(end < start ||
2767 end >> agaw_to_width(si_domain->agaw)))
2770 ret = iommu_domain_identity_map(si_domain, start, end);
2779 static int identity_mapping(struct device *dev)
2781 struct device_domain_info *info;
2783 info = dev->archdata.iommu;
2784 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2785 return (info->domain == si_domain);
2790 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2792 struct dmar_domain *ndomain;
2793 struct intel_iommu *iommu;
2796 iommu = device_to_iommu(dev, &bus, &devfn);
2800 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2801 if (ndomain != domain)
2807 static bool device_has_rmrr(struct device *dev)
2809 struct dmar_rmrr_unit *rmrr;
2814 for_each_rmrr_units(rmrr) {
2816 * Return TRUE if this RMRR contains the device that is passed in.
2819 for_each_active_dev_scope(rmrr->devices,
2820 rmrr->devices_cnt, i, tmp)
2822 is_downstream_to_pci_bridge(dev, tmp)) {
2832 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2833 * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2834 * @dev: device handle
2836 * We assume that PCI USB devices with RMRRs have them largely
2837 * for historical reasons and that the RMRR space is not actively used post
2838 * boot. This exclusion may change if vendors begin to abuse it.
2840 * The same exception is made for graphics devices, with the requirement that
2841 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2844 * Return: true if the RMRR is relaxable, false otherwise
2846 static bool device_rmrr_is_relaxable(struct device *dev)
2848 struct pci_dev *pdev;
2850 if (!dev_is_pci(dev))
2853 pdev = to_pci_dev(dev);
2854 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2861 * There are a couple of cases where we need to restrict the functionality of
2862 * devices associated with RMRRs. The first is when evaluating a device for
2863 * identity mapping because problems exist when devices are moved in and out
2864 * of domains and their respective RMRR information is lost. This means that
2865 * a device with associated RMRRs will never be in a "passthrough" domain.
2866 * The second is use of the device through the IOMMU API. This interface
2867 * expects to have full control of the IOVA space for the device. We cannot
2868 * satisfy both the requirement that RMRR access is maintained and have an
2869 * unencumbered IOVA space. We also have no ability to quiesce the device's
2870 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2871 * We therefore prevent devices associated with an RMRR from participating in
2872 * the IOMMU API, which eliminates them from device assignment.
2874 * In both cases, devices which have relaxable RMRRs are not concerned by this
2875 * restriction. See device_rmrr_is_relaxable comment.
2877 static bool device_is_rmrr_locked(struct device *dev)
2879 if (!device_has_rmrr(dev))
2882 if (device_rmrr_is_relaxable(dev))
2889 * Return the required default domain type for a specific device.
2891 * @dev: the device in query
2895 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2896 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2897 * - 0: both identity and dynamic domains work for this device
2899 static int device_def_domain_type(struct device *dev)
2901 if (dev_is_pci(dev)) {
2902 struct pci_dev *pdev = to_pci_dev(dev);
2904 if (device_is_rmrr_locked(dev))
2905 return IOMMU_DOMAIN_DMA;
2908 * Prevent any device marked as untrusted from getting
2909 * placed into the static identity mapping domain.
2911 if (pdev->untrusted)
2912 return IOMMU_DOMAIN_DMA;
2914 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2915 return IOMMU_DOMAIN_IDENTITY;
2917 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2918 return IOMMU_DOMAIN_IDENTITY;
2921 * We want to start off with all devices in the 1:1 domain, and
2922 * take them out later if we find they can't access all of memory.
2924 * However, we can't do this for PCI devices behind bridges,
2925 * because all PCI devices behind the same bridge will end up
2926 * with the same source-id on their transactions.
2928 * Practically speaking, we can't change things around for these
2929 * devices at run-time, because we can't be sure there'll be no
2930 * DMA transactions in flight for any of their siblings.
2932 * So PCI devices (unless they're on the root bus) as well as
2933 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2934 * the 1:1 domain, just in _case_ one of their siblings turns out
2935 * not to be able to map all of memory.
2937 if (!pci_is_pcie(pdev)) {
2938 if (!pci_is_root_bus(pdev->bus))
2939 return IOMMU_DOMAIN_DMA;
2940 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2941 return IOMMU_DOMAIN_DMA;
2942 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2943 return IOMMU_DOMAIN_DMA;
2945 if (device_has_rmrr(dev))
2946 return IOMMU_DOMAIN_DMA;
2949 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2950 IOMMU_DOMAIN_IDENTITY : 0;
2953 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2956 * Start from a sane IOMMU hardware state.
2957 * If queued invalidation was already initialized by us
2958 * (for example, while enabling interrupt remapping) then
2959 * things are already rolling from a sane state.
2963 * Clear any previous faults.
2965 dmar_fault(-1, iommu);
2967 * Disable queued invalidation if supported and already enabled
2968 * before OS handover.
2970 dmar_disable_qi(iommu);
2973 if (dmar_enable_qi(iommu)) {
2975 * Queued invalidation is not enabled; fall back to register-based invalidation.
2977 iommu->flush.flush_context = __iommu_flush_context;
2978 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2979 pr_info("%s: Using Register based invalidation\n",
2982 iommu->flush.flush_context = qi_flush_context;
2983 iommu->flush.flush_iotlb = qi_flush_iotlb;
2984 pr_info("%s: Using Queued invalidation\n", iommu->name);
2988 static int copy_context_table(struct intel_iommu *iommu,
2989 struct root_entry *old_re,
2990 struct context_entry **tbl,
2993 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2994 struct context_entry *new_ce = NULL, ce;
2995 struct context_entry *old_ce = NULL;
2996 struct root_entry re;
2997 phys_addr_t old_ce_phys;
2999 tbl_idx = ext ? bus * 2 : bus;
3000 memcpy(&re, old_re, sizeof(re));
3002 for (devfn = 0; devfn < 256; devfn++) {
3003 /* First calculate the correct index */
3004 idx = (ext ? devfn * 2 : devfn) % 256;
3007 /* First save what we may have and clean up */
3009 tbl[tbl_idx] = new_ce;
3010 __iommu_flush_cache(iommu, new_ce,
3020 old_ce_phys = root_entry_lctp(&re);
3022 old_ce_phys = root_entry_uctp(&re);
3025 if (ext && devfn == 0) {
3026 /* No LCTP, try UCTP */
3035 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3040 new_ce = alloc_pgtable_page(iommu->node);
3047 /* Now copy the context entry */
3048 memcpy(&ce, old_ce + idx, sizeof(ce));
3050 if (!__context_present(&ce))
3053 did = context_domain_id(&ce);
3054 if (did >= 0 && did < cap_ndoms(iommu->cap))
3055 set_bit(did, iommu->domain_ids);
3058 * We need a marker for copied context entries. This
3059 * marker needs to work for the old format as well as
3060 * for extended context entries.
3062 * Bit 67 of the context entry is used. In the old
3063 * format this bit is available to software, in the
3064 * extended format it is the PGE bit, but PGE is ignored
3065 * by HW if PASIDs are disabled (and thus still available).
3068 * So disable PASIDs first and then mark the entry
3069 * copied. This means that we don't copy PASID
3070 * translations from the old kernel, but this is fine as
3071 * faults there are not fatal.
3073 context_clear_pasid_enable(&ce);
3074 context_set_copied(&ce);
3079 tbl[tbl_idx + pos] = new_ce;
3081 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
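/*
 * Illustrative sketch (not part of the driver): in extended mode each
 * bus owns two context tables, and devfns 128-255 land in the second
 * one -- which is why copy_context_table() doubles the index and lets
 * it wrap. Example-local helper, assuming the same layout.
 */
static inline int example_ctxt_tbl_slot(u8 bus, u8 devfn, bool ext)
{
	int tbl_idx = ext ? bus * 2 : bus;	/* per-bus base slot */
	int pos = ext ? (devfn >> 7) : 0;	/* upper half of devfn space */

	return tbl_idx + pos;
}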
3090 static int copy_translation_tables(struct intel_iommu *iommu)
3092 struct context_entry **ctxt_tbls;
3093 struct root_entry *old_rt;
3094 phys_addr_t old_rt_phys;
3095 int ctxt_table_entries;
3096 unsigned long flags;
3101 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3102 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3103 new_ext = !!ecap_ecs(iommu->ecap);
3106 * The RTT bit can only be changed when translation is disabled,
3107 * but disabling translation means to open a window for data
3108 * corruption. So bail out and don't copy anything if we would
3109 * have to change the bit.
3114 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3118 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3122 /* This is too big for the stack - allocate it from slab */
3123 ctxt_table_entries = ext ? 512 : 256;
3125 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3129 for (bus = 0; bus < 256; bus++) {
3130 ret = copy_context_table(iommu, &old_rt[bus],
3131 ctxt_tbls, bus, ext);
3133 pr_err("%s: Failed to copy context table for bus %d\n",
3139 spin_lock_irqsave(&iommu->lock, flags);
3141 /* Context tables are copied, now write them to the root_entry table */
3142 for (bus = 0; bus < 256; bus++) {
3143 int idx = ext ? bus * 2 : bus;
3146 if (ctxt_tbls[idx]) {
3147 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3148 iommu->root_entry[bus].lo = val;
3151 if (!ext || !ctxt_tbls[idx + 1])
3154 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3155 iommu->root_entry[bus].hi = val;
3158 spin_unlock_irqrestore(&iommu->lock, flags);
3162 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
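/*
 * Illustrative sketch (not part of the driver): bit 0 of each
 * root-entry half is the present bit, so installing a copied context
 * table is just its physical address OR'd with 1, as done above.
 */
static inline u64 example_root_entry_val(phys_addr_t ctx_table_pa)
{
	return (u64)ctx_table_pa | 1;	/* present */
}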
3172 static int __init init_dmars(void)
3174 struct dmar_drhd_unit *drhd;
3175 struct intel_iommu *iommu;
3181 * initialize and program root entry to not present
3184 for_each_drhd_unit(drhd) {
3186 * lock not needed as this is only incremented in the single
3187 * threaded kernel __init code path; all other accesses are reads
3190 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3194 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3197 /* Preallocate enough resources for IOMMU hot-addition */
3198 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3199 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3201 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3204 pr_err("Allocating global iommu array failed\n");
3209 for_each_iommu(iommu, drhd) {
3210 if (drhd->ignored) {
3211 iommu_disable_translation(iommu);
3216 * Find the max pasid size of all IOMMUs in the system.
3217 * We need to ensure the system pasid table is no bigger
3218 * than the smallest supported.
3220 if (pasid_supported(iommu)) {
3221 u32 temp = 2 << ecap_pss(iommu->ecap);
3223 intel_pasid_max_id = min_t(u32, temp,
3224 intel_pasid_max_id);
3227 g_iommus[iommu->seq_id] = iommu;
3229 intel_iommu_init_qi(iommu);
3231 ret = iommu_init_domains(iommu);
3235 init_translation_status(iommu);
3237 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3238 iommu_disable_translation(iommu);
3239 clear_translation_pre_enabled(iommu);
3240 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3246 * we could share the same root & context tables
3247 * among all IOMMUs; need to split them later if necessary.
3249 ret = iommu_alloc_root_entry(iommu);
3253 if (translation_pre_enabled(iommu)) {
3254 pr_info("Translation already enabled - trying to copy translation structures\n");
3256 ret = copy_translation_tables(iommu);
3259 * We found the IOMMU with translation
3260 * enabled - but failed to copy over the
3261 * old root-entry table. Try to proceed
3262 * by disabling translation now and
3263 * allocating a clean root-entry table.
3264 * This might cause DMAR faults, but
3265 * probably the dump will still succeed.
3267 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3269 iommu_disable_translation(iommu);
3270 clear_translation_pre_enabled(iommu);
3272 pr_info("Copied translation tables from previous kernel for %s\n",
3277 if (!ecap_pass_through(iommu->ecap))
3278 hw_pass_through = 0;
3279 #ifdef CONFIG_INTEL_IOMMU_SVM
3280 if (pasid_supported(iommu))
3281 intel_svm_init(iommu);
3286 * Now that qi is enabled on all iommus, set the root entry and flush
3287 * caches. This is required on some Intel X58 chipsets, otherwise the
3288 * flush_context function will loop forever and the boot hangs.
3290 for_each_active_iommu(iommu, drhd) {
3291 iommu_flush_write_buffer(iommu);
3292 iommu_set_root_entry(iommu);
3293 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3294 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3297 if (iommu_pass_through)
3298 iommu_identity_mapping |= IDENTMAP_ALL;
3300 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3305 iommu_identity_mapping |= IDENTMAP_GFX;
3307 check_tylersburg_isoch();
3309 ret = si_domain_init(hw_pass_through);
3316 * global invalidate context cache
3317 * global invalidate iotlb
3318 * enable translation
3320 for_each_iommu(iommu, drhd) {
3321 if (drhd->ignored) {
3323 * we always have to disable PMRs or DMA may fail on this device.
3327 iommu_disable_protect_mem_regions(iommu);
3331 iommu_flush_write_buffer(iommu);
3333 #ifdef CONFIG_INTEL_IOMMU_SVM
3334 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3336 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3337 * could cause a lock race condition.
3339 up_write(&dmar_global_lock);
3340 ret = intel_svm_enable_prq(iommu);
3341 down_write(&dmar_global_lock);
3346 ret = dmar_set_interrupt(iommu);
3354 for_each_active_iommu(iommu, drhd) {
3355 disable_dmar_iommu(iommu);
3356 free_dmar_iommu(iommu);
3365 /* This takes a number of _MM_ pages, not VTD pages */
3366 static unsigned long intel_alloc_iova(struct device *dev,
3367 struct dmar_domain *domain,
3368 unsigned long nrpages, uint64_t dma_mask)
3370 unsigned long iova_pfn;
3372 /* Restrict dma_mask to the width that the iommu can handle */
3373 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3374 /* Ensure we reserve the whole size-aligned region */
3375 nrpages = __roundup_pow_of_two(nrpages);
3377 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3379 * First try to allocate an io virtual address in
3380 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range.
3383 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3384 IOVA_PFN(DMA_BIT_MASK(32)), false);
3388 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3389 IOVA_PFN(dma_mask), true);
3390 if (unlikely(!iova_pfn)) {
3391 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
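/*
 * Illustrative sketch (not part of the driver): the allocation is
 * rounded up to a power of two above so the IOVA allocator can return
 * a size-aligned region; e.g. a 5-page request reserves 8 pages.
 */
static inline unsigned long example_iova_nrpages(unsigned long nrpages)
{
	return __roundup_pow_of_two(nrpages);	/* 5 -> 8, 8 -> 8 */
}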
3398 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3400 struct dmar_domain *domain, *tmp;
3401 struct dmar_rmrr_unit *rmrr;
3402 struct device *i_dev;
3405 /* Device shouldn't be attached to any domain. */
3406 domain = find_domain(dev);
3410 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3414 /* We have a new domain - setup possible RMRRs for the device */
3416 for_each_rmrr_units(rmrr) {
3417 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3422 ret = domain_prepare_identity_map(dev, domain,
3426 dev_err(dev, "Mapping reserved region failed\n");
3431 tmp = set_domain_for_dev(dev, domain);
3432 if (!tmp || domain != tmp) {
3433 domain_exit(domain);
3439 dev_err(dev, "Allocating domain failed\n");
3441 domain->domain.type = IOMMU_DOMAIN_DMA;
3446 /* Check if the dev needs to go through the non-identity map and unmap process. */
3447 static bool iommu_need_mapping(struct device *dev)
3451 if (iommu_dummy(dev))
3454 ret = identity_mapping(dev);
3456 u64 dma_mask = *dev->dma_mask;
3458 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3459 dma_mask = dev->coherent_dma_mask;
3461 if (dma_mask >= dma_get_required_mask(dev))
3465 * 32-bit DMA is removed from si_domain and falls back to
3466 * non-identity mapping.
3468 dmar_remove_one_dev_info(dev);
3469 ret = iommu_request_dma_domain_for_dev(dev);
3471 struct iommu_domain *domain;
3472 struct dmar_domain *dmar_domain;
3474 domain = iommu_get_domain_for_dev(dev);
3476 dmar_domain = to_dmar_domain(domain);
3477 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3479 get_private_domain_for_dev(dev);
3482 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3488 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3489 size_t size, int dir, u64 dma_mask)
3491 struct dmar_domain *domain;
3492 phys_addr_t start_paddr;
3493 unsigned long iova_pfn;
3496 struct intel_iommu *iommu;
3497 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3499 BUG_ON(dir == DMA_NONE);
3501 domain = find_domain(dev);
3503 return DMA_MAPPING_ERROR;
3505 iommu = domain_get_iommu(domain);
3506 size = aligned_nrpages(paddr, size);
3508 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3513 * Check if DMAR supports zero-length reads on write-only mappings.
3516 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3517 !cap_zlr(iommu->cap))
3518 prot |= DMA_PTE_READ;
3519 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3520 prot |= DMA_PTE_WRITE;
3522 * paddr ~ (paddr + size) might be a partial page; we should map the whole
3523 * page. Note: if two parts of one page are separately mapped, we
3524 * might have two guest_addr mappings to the same host paddr, but this
3525 * is not a big problem
3527 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3528 mm_to_dma_pfn(paddr_pfn), size, prot);
3532 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3533 start_paddr += paddr & ~PAGE_MASK;
3538 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3539 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3540 size, (unsigned long long)paddr, dir);
3541 return DMA_MAPPING_ERROR;
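/*
 * Illustrative sketch (not part of the driver): since whole pages are
 * mapped, the DMA address returned above is the allocated IOVA page
 * plus the sub-page offset of the original physical address.
 */
static inline dma_addr_t example_dma_addr(unsigned long iova_pfn,
					  phys_addr_t paddr)
{
	return ((dma_addr_t)iova_pfn << PAGE_SHIFT) + (paddr & ~PAGE_MASK);
}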
3544 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3545 unsigned long offset, size_t size,
3546 enum dma_data_direction dir,
3547 unsigned long attrs)
3549 if (iommu_need_mapping(dev))
3550 return __intel_map_single(dev, page_to_phys(page) + offset,
3551 size, dir, *dev->dma_mask);
3552 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3555 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3556 size_t size, enum dma_data_direction dir,
3557 unsigned long attrs)
3559 if (iommu_need_mapping(dev))
3560 return __intel_map_single(dev, phys_addr, size, dir,
3562 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3565 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3567 struct dmar_domain *domain;
3568 unsigned long start_pfn, last_pfn;
3569 unsigned long nrpages;
3570 unsigned long iova_pfn;
3571 struct intel_iommu *iommu;
3572 struct page *freelist;
3573 struct pci_dev *pdev = NULL;
3575 domain = find_domain(dev);
3578 iommu = domain_get_iommu(domain);
3580 iova_pfn = IOVA_PFN(dev_addr);
3582 nrpages = aligned_nrpages(dev_addr, size);
3583 start_pfn = mm_to_dma_pfn(iova_pfn);
3584 last_pfn = start_pfn + nrpages - 1;
3586 if (dev_is_pci(dev))
3587 pdev = to_pci_dev(dev);
3589 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3591 freelist = domain_unmap(domain, start_pfn, last_pfn);
3593 if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3594 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3595 nrpages, !freelist, 0);
3597 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3598 dma_free_pagelist(freelist);
3600 queue_iova(&domain->iovad, iova_pfn, nrpages,
3601 (unsigned long)freelist);
3603 * queue up the release of the unmap to save roughly 1/6th of the
3604 * CPU time used up by the IOTLB flush operation...
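/*
 * Illustrative sketch (not part of the driver): the unmap path above
 * derives the inclusive VT-d PFN range to tear down from the DMA
 * address and size, using the same helpers as intel_unmap().
 */
static inline unsigned long example_unmap_last_pfn(dma_addr_t dev_addr,
						   size_t size)
{
	unsigned long nrpages = aligned_nrpages(dev_addr, size);

	return mm_to_dma_pfn(IOVA_PFN(dev_addr)) + nrpages - 1;
}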
3609 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3610 size_t size, enum dma_data_direction dir,
3611 unsigned long attrs)
3613 if (iommu_need_mapping(dev))
3614 intel_unmap(dev, dev_addr, size);
3616 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3619 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3620 size_t size, enum dma_data_direction dir, unsigned long attrs)
3622 if (iommu_need_mapping(dev))
3623 intel_unmap(dev, dev_addr, size);
3626 static void *intel_alloc_coherent(struct device *dev, size_t size,
3627 dma_addr_t *dma_handle, gfp_t flags,
3628 unsigned long attrs)
3630 struct page *page = NULL;
3633 if (!iommu_need_mapping(dev))
3634 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3636 size = PAGE_ALIGN(size);
3637 order = get_order(size);
3639 if (gfpflags_allow_blocking(flags)) {
3640 unsigned int count = size >> PAGE_SHIFT;
3642 page = dma_alloc_from_contiguous(dev, count, order,
3643 flags & __GFP_NOWARN);
3647 page = alloc_pages(flags, order);
3650 memset(page_address(page), 0, size);
3652 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3654 dev->coherent_dma_mask);
3655 if (*dma_handle != DMA_MAPPING_ERROR)
3656 return page_address(page);
3657 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3658 __free_pages(page, order);
3663 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3664 dma_addr_t dma_handle, unsigned long attrs)
3667 struct page *page = virt_to_page(vaddr);
3669 if (!iommu_need_mapping(dev))
3670 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3672 size = PAGE_ALIGN(size);
3673 order = get_order(size);
3675 intel_unmap(dev, dma_handle, size);
3676 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3677 __free_pages(page, order);
3680 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3681 int nelems, enum dma_data_direction dir,
3682 unsigned long attrs)
3684 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3685 unsigned long nrpages = 0;
3686 struct scatterlist *sg;
3689 if (!iommu_need_mapping(dev))
3690 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3692 for_each_sg(sglist, sg, nelems, i) {
3693 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3696 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3699 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3700 enum dma_data_direction dir, unsigned long attrs)
3703 struct dmar_domain *domain;
3706 unsigned long iova_pfn;
3708 struct scatterlist *sg;
3709 unsigned long start_vpfn;
3710 struct intel_iommu *iommu;
3712 BUG_ON(dir == DMA_NONE);
3713 if (!iommu_need_mapping(dev))
3714 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3716 domain = find_domain(dev);
3720 iommu = domain_get_iommu(domain);
3722 for_each_sg(sglist, sg, nelems, i)
3723 size += aligned_nrpages(sg->offset, sg->length);
3725 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3728 sglist->dma_length = 0;
3733 * Check if DMAR supports zero-length reads on write-only mappings.
3736 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3737 !cap_zlr(iommu->cap))
3738 prot |= DMA_PTE_READ;
3739 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3740 prot |= DMA_PTE_WRITE;
3742 start_vpfn = mm_to_dma_pfn(iova_pfn);
3744 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3745 if (unlikely(ret)) {
3746 dma_pte_free_pagetable(domain, start_vpfn,
3747 start_vpfn + size - 1,
3748 agaw_to_level(domain->agaw) + 1);
3749 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3756 static const struct dma_map_ops intel_dma_ops = {
3757 .alloc = intel_alloc_coherent,
3758 .free = intel_free_coherent,
3759 .map_sg = intel_map_sg,
3760 .unmap_sg = intel_unmap_sg,
3761 .map_page = intel_map_page,
3762 .unmap_page = intel_unmap_page,
3763 .map_resource = intel_map_resource,
3764 .unmap_resource = intel_unmap_resource,
3765 .dma_supported = dma_direct_supported,
3768 static inline int iommu_domain_cache_init(void)
3772 iommu_domain_cache = kmem_cache_create("iommu_domain",
3773 sizeof(struct dmar_domain),
3778 if (!iommu_domain_cache) {
3779 pr_err("Couldn't create iommu_domain cache\n");
3786 static inline int iommu_devinfo_cache_init(void)
3790 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3791 sizeof(struct device_domain_info),
3795 if (!iommu_devinfo_cache) {
3796 pr_err("Couldn't create devinfo cache\n");
3803 static int __init iommu_init_mempool(void)
3806 ret = iova_cache_get();
3810 ret = iommu_domain_cache_init();
3814 ret = iommu_devinfo_cache_init();
3818 kmem_cache_destroy(iommu_domain_cache);
3825 static void __init iommu_exit_mempool(void)
3827 kmem_cache_destroy(iommu_devinfo_cache);
3828 kmem_cache_destroy(iommu_domain_cache);
3832 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3834 struct dmar_drhd_unit *drhd;
3838 /* We know that this device on this chipset has its own IOMMU.
3839 * If we find it under a different IOMMU, then the BIOS is lying
3840 * to us. Hope that the IOMMU for this device is actually
3841 * disabled, and it needs no translation...
3843 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3845 /* "can't" happen */
3846 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3849 vtbar &= 0xffff0000;
3851 /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3852 drhd = dmar_find_matched_drhd_unit(pdev);
3853 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3854 TAINT_FIRMWARE_WORKAROUND,
3855 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3856 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3858 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
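/*
 * Illustrative sketch (not part of the driver): the quirk above trusts
 * the chipset over the BIOS -- the IOAT unit's IOMMU is expected to
 * sit at a fixed 0xa000 offset from the VT-d base address read back
 * from config space.
 */
static inline bool example_vtbar_matches(u64 drhd_reg_base, u32 vtbar)
{
	return drhd_reg_base - (vtbar & 0xffff0000) == 0xa000;
}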
3860 static void __init init_no_remapping_devices(void)
3862 struct dmar_drhd_unit *drhd;
3866 for_each_drhd_unit(drhd) {
3867 if (!drhd->include_all) {
3868 for_each_active_dev_scope(drhd->devices,
3869 drhd->devices_cnt, i, dev)
3871 /* ignore DMAR unit if no devices exist */
3872 if (i == drhd->devices_cnt)
3877 for_each_active_drhd_unit(drhd) {
3878 if (drhd->include_all)
3881 for_each_active_dev_scope(drhd->devices,
3882 drhd->devices_cnt, i, dev)
3883 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3885 if (i < drhd->devices_cnt)
3888 /* This IOMMU has *only* gfx devices. Either bypass it or
3889 set the gfx_mapped flag, as appropriate */
3890 if (!dmar_map_gfx) {
3892 for_each_active_dev_scope(drhd->devices,
3893 drhd->devices_cnt, i, dev)
3894 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3899 #ifdef CONFIG_SUSPEND
3900 static int init_iommu_hw(void)
3902 struct dmar_drhd_unit *drhd;
3903 struct intel_iommu *iommu = NULL;
3905 for_each_active_iommu(iommu, drhd)
3907 dmar_reenable_qi(iommu);
3909 for_each_iommu(iommu, drhd) {
3910 if (drhd->ignored) {
3912 * we always have to disable PMRs or DMA may fail on this device.
3916 iommu_disable_protect_mem_regions(iommu);
3920 iommu_flush_write_buffer(iommu);
3922 iommu_set_root_entry(iommu);
3924 iommu->flush.flush_context(iommu, 0, 0, 0,
3925 DMA_CCMD_GLOBAL_INVL);
3926 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3927 iommu_enable_translation(iommu);
3928 iommu_disable_protect_mem_regions(iommu);
3934 static void iommu_flush_all(void)
3936 struct dmar_drhd_unit *drhd;
3937 struct intel_iommu *iommu;
3939 for_each_active_iommu(iommu, drhd) {
3940 iommu->flush.flush_context(iommu, 0, 0, 0,
3941 DMA_CCMD_GLOBAL_INVL);
3942 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3943 DMA_TLB_GLOBAL_FLUSH);
3947 static int iommu_suspend(void)
3949 struct dmar_drhd_unit *drhd;
3950 struct intel_iommu *iommu = NULL;
3953 for_each_active_iommu(iommu, drhd) {
3954 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3956 if (!iommu->iommu_state)
3962 for_each_active_iommu(iommu, drhd) {
3963 iommu_disable_translation(iommu);
3965 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3967 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3968 readl(iommu->reg + DMAR_FECTL_REG);
3969 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3970 readl(iommu->reg + DMAR_FEDATA_REG);
3971 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3972 readl(iommu->reg + DMAR_FEADDR_REG);
3973 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3974 readl(iommu->reg + DMAR_FEUADDR_REG);
3976 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3981 for_each_active_iommu(iommu, drhd)
3982 kfree(iommu->iommu_state);
3987 static void iommu_resume(void)
3989 struct dmar_drhd_unit *drhd;
3990 struct intel_iommu *iommu = NULL;
3993 if (init_iommu_hw()) {
3995 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3997 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4001 for_each_active_iommu(iommu, drhd) {
4003 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4005 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4006 iommu->reg + DMAR_FECTL_REG);
4007 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4008 iommu->reg + DMAR_FEDATA_REG);
4009 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4010 iommu->reg + DMAR_FEADDR_REG);
4011 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4012 iommu->reg + DMAR_FEUADDR_REG);
4014 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4017 for_each_active_iommu(iommu, drhd)
4018 kfree(iommu->iommu_state);
4021 static struct syscore_ops iommu_syscore_ops = {
4022 .resume = iommu_resume,
4023 .suspend = iommu_suspend,
4026 static void __init init_iommu_pm_ops(void)
4028 register_syscore_ops(&iommu_syscore_ops);
4032 static inline void init_iommu_pm_ops(void) {}
4033 #endif /* CONFIG_PM */
4035 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4037 struct acpi_dmar_reserved_memory *rmrr;
4038 struct dmar_rmrr_unit *rmrru;
4040 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4044 rmrru->hdr = header;
4045 rmrr = (struct acpi_dmar_reserved_memory *)header;
4046 rmrru->base_address = rmrr->base_address;
4047 rmrru->end_address = rmrr->end_address;
4049 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4050 ((void *)rmrr) + rmrr->header.length,
4051 &rmrru->devices_cnt);
4052 if (rmrru->devices_cnt && rmrru->devices == NULL)
4055 list_add(&rmrru->list, &dmar_rmrr_units);
4064 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4066 struct dmar_atsr_unit *atsru;
4067 struct acpi_dmar_atsr *tmp;
4069 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4070 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4071 if (atsr->segment != tmp->segment)
4073 if (atsr->header.length != tmp->header.length)
4075 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4082 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4084 struct acpi_dmar_atsr *atsr;
4085 struct dmar_atsr_unit *atsru;
4087 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4090 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4091 atsru = dmar_find_atsr(atsr);
4095 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4100 * If memory is allocated from slab by ACPI _DSM method, we need to
4101 * copy the memory content because the memory buffer will be freed on return.
4104 atsru->hdr = (void *)(atsru + 1);
4105 memcpy(atsru->hdr, hdr, hdr->length);
4106 atsru->include_all = atsr->flags & 0x1;
4107 if (!atsru->include_all) {
4108 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4109 (void *)atsr + atsr->header.length,
4110 &atsru->devices_cnt);
4111 if (atsru->devices_cnt && atsru->devices == NULL) {
4117 list_add_rcu(&atsru->list, &dmar_atsr_units);
4122 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4124 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4128 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4130 struct acpi_dmar_atsr *atsr;
4131 struct dmar_atsr_unit *atsru;
4133 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4134 atsru = dmar_find_atsr(atsr);
4136 list_del_rcu(&atsru->list);
4138 intel_iommu_free_atsr(atsru);
4144 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4148 struct acpi_dmar_atsr *atsr;
4149 struct dmar_atsr_unit *atsru;
4151 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4152 atsru = dmar_find_atsr(atsr);
4156 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4157 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4165 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4168 struct intel_iommu *iommu = dmaru->iommu;
4170 if (g_iommus[iommu->seq_id])
4173 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4174 pr_warn("%s: Doesn't support hardware pass through.\n",
4178 if (!ecap_sc_support(iommu->ecap) &&
4179 domain_update_iommu_snooping(iommu)) {
4180 pr_warn("%s: Doesn't support snooping.\n",
4184 sp = domain_update_iommu_superpage(iommu) - 1;
4185 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4186 pr_warn("%s: Doesn't support large page.\n",
4192 * Disable translation if already enabled prior to OS handover.
4194 if (iommu->gcmd & DMA_GCMD_TE)
4195 iommu_disable_translation(iommu);
4197 g_iommus[iommu->seq_id] = iommu;
4198 ret = iommu_init_domains(iommu);
4200 ret = iommu_alloc_root_entry(iommu);
4204 #ifdef CONFIG_INTEL_IOMMU_SVM
4205 if (pasid_supported(iommu))
4206 intel_svm_init(iommu);
4209 if (dmaru->ignored) {
4211 * we always have to disable PMRs or DMA may fail on this device
4214 iommu_disable_protect_mem_regions(iommu);
4218 intel_iommu_init_qi(iommu);
4219 iommu_flush_write_buffer(iommu);
4221 #ifdef CONFIG_INTEL_IOMMU_SVM
4222 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4223 ret = intel_svm_enable_prq(iommu);
4228 ret = dmar_set_interrupt(iommu);
4232 iommu_set_root_entry(iommu);
4233 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4234 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4235 iommu_enable_translation(iommu);
4237 iommu_disable_protect_mem_regions(iommu);
4241 disable_dmar_iommu(iommu);
4243 free_dmar_iommu(iommu);
4247 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4250 struct intel_iommu *iommu = dmaru->iommu;
4252 if (!intel_iommu_enabled)
4258 ret = intel_iommu_add(dmaru);
4260 disable_dmar_iommu(iommu);
4261 free_dmar_iommu(iommu);
4267 static void intel_iommu_free_dmars(void)
4269 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4270 struct dmar_atsr_unit *atsru, *atsr_n;
4272 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4273 list_del(&rmrru->list);
4274 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4278 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4279 list_del(&atsru->list);
4280 intel_iommu_free_atsr(atsru);
4284 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4287 struct pci_bus *bus;
4288 struct pci_dev *bridge = NULL;
4290 struct acpi_dmar_atsr *atsr;
4291 struct dmar_atsr_unit *atsru;
4293 dev = pci_physfn(dev);
4294 for (bus = dev->bus; bus; bus = bus->parent) {
4296 /* If it's an integrated device, allow ATS */
4299 /* Connected via non-PCIe: no ATS */
4300 if (!pci_is_pcie(bridge) ||
4301 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4303 /* If we found the root port, look it up in the ATSR */
4304 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4309 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4310 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4311 if (atsr->segment != pci_domain_nr(dev->bus))
4314 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4315 if (tmp == &bridge->dev)
4318 if (atsru->include_all)
4328 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4331 struct dmar_rmrr_unit *rmrru;
4332 struct dmar_atsr_unit *atsru;
4333 struct acpi_dmar_atsr *atsr;
4334 struct acpi_dmar_reserved_memory *rmrr;
4336 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4339 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4340 rmrr = container_of(rmrru->hdr,
4341 struct acpi_dmar_reserved_memory, header);
4342 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4343 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4344 ((void *)rmrr) + rmrr->header.length,
4345 rmrr->segment, rmrru->devices,
4346 rmrru->devices_cnt);
4349 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4350 dmar_remove_dev_scope(info, rmrr->segment,
4351 rmrru->devices, rmrru->devices_cnt);
4355 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4356 if (atsru->include_all)
4359 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4360 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4361 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4362 (void *)atsr + atsr->header.length,
4363 atsr->segment, atsru->devices,
4364 atsru->devices_cnt);
4369 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4370 if (dmar_remove_dev_scope(info, atsr->segment,
4371 atsru->devices, atsru->devices_cnt))
4379 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4380 unsigned long val, void *v)
4382 struct memory_notify *mhp = v;
4383 unsigned long long start, end;
4384 unsigned long start_vpfn, last_vpfn;
4387 case MEM_GOING_ONLINE:
4388 start = mhp->start_pfn << PAGE_SHIFT;
4389 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4390 if (iommu_domain_identity_map(si_domain, start, end)) {
4391 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4398 case MEM_CANCEL_ONLINE:
4399 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4400 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4401 while (start_vpfn <= last_vpfn) {
4403 struct dmar_drhd_unit *drhd;
4404 struct intel_iommu *iommu;
4405 struct page *freelist;
4407 iova = find_iova(&si_domain->iovad, start_vpfn);
4409 pr_debug("Failed get IOVA for PFN %lx\n",
4414 iova = split_and_remove_iova(&si_domain->iovad, iova,
4415 start_vpfn, last_vpfn);
4417 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4418 start_vpfn, last_vpfn);
4422 freelist = domain_unmap(si_domain, iova->pfn_lo,
4426 for_each_active_iommu(iommu, drhd)
4427 iommu_flush_iotlb_psi(iommu, si_domain,
4428 iova->pfn_lo, iova_size(iova),
4431 dma_free_pagelist(freelist);
4433 start_vpfn = iova->pfn_hi + 1;
4434 free_iova_mem(iova);
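/*
 * Illustrative sketch (not part of the driver): the memory notifier
 * above converts a hot-plugged block (start_pfn, nr_pages from struct
 * memory_notify) into an inclusive byte range for the identity map.
 */
static inline u64 example_hotplug_end_addr(unsigned long start_pfn,
					   unsigned long nr_pages)
{
	return (((u64)start_pfn + nr_pages) << PAGE_SHIFT) - 1;
}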
4442 static struct notifier_block intel_iommu_memory_nb = {
4443 .notifier_call = intel_iommu_memory_notifier,
4447 static void free_all_cpu_cached_iovas(unsigned int cpu)
4451 for (i = 0; i < g_num_of_iommus; i++) {
4452 struct intel_iommu *iommu = g_iommus[i];
4453 struct dmar_domain *domain;
4459 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4460 domain = get_iommu_domain(iommu, (u16)did);
4464 free_cpu_cached_iovas(cpu, &domain->iovad);
4469 static int intel_iommu_cpu_dead(unsigned int cpu)
4471 free_all_cpu_cached_iovas(cpu);
4475 static void intel_disable_iommus(void)
4477 struct intel_iommu *iommu = NULL;
4478 struct dmar_drhd_unit *drhd;
4480 for_each_iommu(iommu, drhd)
4481 iommu_disable_translation(iommu);
4484 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4486 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4488 return container_of(iommu_dev, struct intel_iommu, iommu);
4491 static ssize_t intel_iommu_show_version(struct device *dev,
4492 struct device_attribute *attr,
4495 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4496 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4497 return sprintf(buf, "%d:%d\n",
4498 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4500 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
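/*
 * Illustrative sketch (not part of the driver): DMAR_VER_REG packs the
 * major version in bits 7:4 and the minor in bits 3:0, which is what
 * the DMAR_VER_MAJOR/MINOR macros used above extract.
 */
static inline void example_decode_version(u32 ver, int *major, int *minor)
{
	*major = (ver >> 4) & 0xf;
	*minor = ver & 0xf;
}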
4502 static ssize_t intel_iommu_show_address(struct device *dev,
4503 struct device_attribute *attr,
4506 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4507 return sprintf(buf, "%llx\n", iommu->reg_phys);
4509 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4511 static ssize_t intel_iommu_show_cap(struct device *dev,
4512 struct device_attribute *attr,
4515 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4516 return sprintf(buf, "%llx\n", iommu->cap);
4518 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4520 static ssize_t intel_iommu_show_ecap(struct device *dev,
4521 struct device_attribute *attr,
4524 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4525 return sprintf(buf, "%llx\n", iommu->ecap);
4527 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4529 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4530 struct device_attribute *attr,
4533 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4534 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4536 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4538 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4539 struct device_attribute *attr,
4542 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4543 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4544 cap_ndoms(iommu->cap)));
4546 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4548 static struct attribute *intel_iommu_attrs[] = {
4549 &dev_attr_version.attr,
4550 &dev_attr_address.attr,
4552 &dev_attr_ecap.attr,
4553 &dev_attr_domains_supported.attr,
4554 &dev_attr_domains_used.attr,
4558 static struct attribute_group intel_iommu_group = {
4559 .name = "intel-iommu",
4560 .attrs = intel_iommu_attrs,
4563 const struct attribute_group *intel_iommu_groups[] = {
4568 static int __init platform_optin_force_iommu(void)
4570 struct pci_dev *pdev = NULL;
4571 bool has_untrusted_dev = false;
4573 if (!dmar_platform_optin() || no_platform_optin)
4576 for_each_pci_dev(pdev) {
4577 if (pdev->untrusted) {
4578 has_untrusted_dev = true;
4583 if (!has_untrusted_dev)
4586 if (no_iommu || dmar_disabled)
4587 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4590 * If Intel-IOMMU is disabled by default, we will apply identity
4591 * map for all devices except those marked as being untrusted.
4594 iommu_identity_mapping |= IDENTMAP_ALL;
4597 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4605 static int __init probe_acpi_namespace_devices(void)
4607 struct dmar_drhd_unit *drhd;
4608 /* To avoid a -Wunused-but-set-variable warning. */
4609 struct intel_iommu *iommu __maybe_unused;
4613 for_each_active_iommu(iommu, drhd) {
4614 for_each_active_dev_scope(drhd->devices,
4615 drhd->devices_cnt, i, dev) {
4616 struct acpi_device_physical_node *pn;
4617 struct iommu_group *group;
4618 struct acpi_device *adev;
4620 if (dev->bus != &acpi_bus_type)
4623 adev = to_acpi_device(dev);
4624 mutex_lock(&adev->physical_node_lock);
4625 list_for_each_entry(pn,
4626 &adev->physical_node_list, node) {
4627 group = iommu_group_get(pn->dev);
4629 iommu_group_put(group);
4633 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4634 ret = iommu_probe_device(pn->dev);
4638 mutex_unlock(&adev->physical_node_lock);
4648 int __init intel_iommu_init(void)
4651 struct dmar_drhd_unit *drhd;
4652 struct intel_iommu *iommu;
4655 * Intel IOMMU is required for a TXT/tboot launch or platform
4656 * opt in, so enforce that.
4658 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4660 if (iommu_init_mempool()) {
4662 panic("tboot: Failed to initialize iommu memory\n");
4666 down_write(&dmar_global_lock);
4667 if (dmar_table_init()) {
4669 panic("tboot: Failed to initialize DMAR table\n");
4673 if (dmar_dev_scope_init() < 0) {
4675 panic("tboot: Failed to initialize DMAR device scope\n");
4679 up_write(&dmar_global_lock);
4682 * The bus notifier takes the dmar_global_lock, so lockdep will
4683 * complain later when we register it under the lock.
4685 dmar_register_bus_notifier();
4687 down_write(&dmar_global_lock);
4689 if (no_iommu || dmar_disabled) {
4691 * We exit the function here to ensure IOMMU's remapping and
4692 * mempool aren't setup, which means that the IOMMU's PMRs
4693 * won't be disabled via the call to init_dmars(). So disable
4694 * it explicitly here. The PMRs were setup by tboot prior to
4695 * calling SENTER, but the kernel is expected to reset/tear
4698 if (intel_iommu_tboot_noforce) {
4699 for_each_iommu(iommu, drhd)
4700 iommu_disable_protect_mem_regions(iommu);
4704 * Make sure the IOMMUs are switched off, even when we
4705 * boot into a kexec kernel and the previous kernel left
4708 intel_disable_iommus();
4712 if (list_empty(&dmar_rmrr_units))
4713 pr_info("No RMRR found\n");
4715 if (list_empty(&dmar_atsr_units))
4716 pr_info("No ATSR found\n");
4718 if (dmar_init_reserved_ranges()) {
4720 panic("tboot: Failed to reserve iommu ranges\n");
4721 goto out_free_reserved_range;
4725 intel_iommu_gfx_mapped = 1;
4727 init_no_remapping_devices();
4732 panic("tboot: Failed to initialize DMARs\n");
4733 pr_err("Initialization failed\n");
4734 goto out_free_reserved_range;
4736 up_write(&dmar_global_lock);
4738 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4741 dma_ops = &intel_dma_ops;
4743 init_iommu_pm_ops();
4745 for_each_active_iommu(iommu, drhd) {
4746 iommu_device_sysfs_add(&iommu->iommu, NULL,
4749 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4750 iommu_device_register(&iommu->iommu);
4753 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4754 if (si_domain && !hw_pass_through)
4755 register_memory_notifier(&intel_iommu_memory_nb);
4756 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4757 intel_iommu_cpu_dead);
4759 down_read(&dmar_global_lock);
4760 if (probe_acpi_namespace_devices())
4761 pr_warn("ACPI name space devices didn't probe correctly\n");
4762 up_read(&dmar_global_lock);
4764 /* Finally, we enable the DMA remapping hardware. */
4765 for_each_iommu(iommu, drhd) {
4766 if (!drhd->ignored && !translation_pre_enabled(iommu))
4767 iommu_enable_translation(iommu);
4769 iommu_disable_protect_mem_regions(iommu);
4771 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4773 intel_iommu_enabled = 1;
4774 intel_iommu_debugfs_init();
4778 out_free_reserved_range:
4779 put_iova_domain(&reserved_iova_list);
4781 intel_iommu_free_dmars();
4782 up_write(&dmar_global_lock);
4783 iommu_exit_mempool();
4787 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4789 struct intel_iommu *iommu = opaque;
4791 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4796 * NB - intel-iommu lacks any sort of reference counting for the users of
4797 * dependent devices. If multiple endpoints have intersecting dependent
4798 * devices, unbinding the driver from any one of them will possibly leave
4799 * the others unable to operate.
4801 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4803 if (!iommu || !dev || !dev_is_pci(dev))
4806 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4809 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4811 struct dmar_domain *domain;
4812 struct intel_iommu *iommu;
4813 unsigned long flags;
4815 assert_spin_locked(&device_domain_lock);
4820 iommu = info->iommu;
4821 domain = info->domain;
4824 if (dev_is_pci(info->dev) && sm_supported(iommu))
4825 intel_pasid_tear_down_entry(iommu, info->dev,
4828 iommu_disable_dev_iotlb(info);
4829 domain_context_clear(iommu, info->dev);
4830 intel_pasid_free_table(info->dev);
4833 unlink_domain_info(info);
4835 spin_lock_irqsave(&iommu->lock, flags);
4836 domain_detach_iommu(domain, iommu);
4837 spin_unlock_irqrestore(&iommu->lock, flags);
4839 /* free the private domain */
4840 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4841 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4842 domain_exit(info->domain);
4844 free_devinfo_mem(info);
4847 static void dmar_remove_one_dev_info(struct device *dev)
4849 struct device_domain_info *info;
4850 unsigned long flags;
4852 spin_lock_irqsave(&device_domain_lock, flags);
4853 info = dev->archdata.iommu;
4854 __dmar_remove_one_dev_info(info);
4855 spin_unlock_irqrestore(&device_domain_lock, flags);
4858 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4862 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4863 domain_reserve_special_ranges(domain);
4865 /* calculate AGAW */
4866 domain->gaw = guest_width;
4867 adjust_width = guestwidth_to_adjustwidth(guest_width);
4868 domain->agaw = width_to_agaw(adjust_width);
4870 domain->iommu_coherency = 0;
4871 domain->iommu_snooping = 0;
4872 domain->iommu_superpage = 0;
4873 domain->max_addr = 0;
4875 /* always allocate the top pgd */
4876 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4879 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4883 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4885 struct dmar_domain *dmar_domain;
4886 struct iommu_domain *domain;
4889 case IOMMU_DOMAIN_DMA:
4891 case IOMMU_DOMAIN_UNMANAGED:
4892 dmar_domain = alloc_domain(0);
4894 pr_err("Can't allocate dmar_domain\n");
4897 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4898 pr_err("Domain initialization failed\n");
4899 domain_exit(dmar_domain);
4903 if (type == IOMMU_DOMAIN_DMA &&
4904 init_iova_flush_queue(&dmar_domain->iovad,
4905 iommu_flush_iova, iova_entry_free)) {
4906 pr_warn("iova flush queue initialization failed\n");
4907 intel_iommu_strict = 1;
4910 domain_update_iommu_cap(dmar_domain);
4912 domain = &dmar_domain->domain;
4913 domain->geometry.aperture_start = 0;
4914 domain->geometry.aperture_end =
4915 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4916 domain->geometry.force_aperture = true;
4919 case IOMMU_DOMAIN_IDENTITY:
4920 return &si_domain->domain;
4928 static void intel_iommu_domain_free(struct iommu_domain *domain)
4930 if (domain != &si_domain->domain)
4931 domain_exit(to_dmar_domain(domain));
4935 * Check whether a @domain could be attached to the @dev through the
4936 * aux-domain attach/detach APIs.
4939 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4941 struct device_domain_info *info = dev->archdata.iommu;
4943 return info && info->auxd_enabled &&
4944 domain->type == IOMMU_DOMAIN_UNMANAGED;
4947 static void auxiliary_link_device(struct dmar_domain *domain,
4950 struct device_domain_info *info = dev->archdata.iommu;
4952 assert_spin_locked(&device_domain_lock);
4956 domain->auxd_refcnt++;
4957 list_add(&domain->auxd, &info->auxiliary_domains);
4960 static void auxiliary_unlink_device(struct dmar_domain *domain,
4963 struct device_domain_info *info = dev->archdata.iommu;
4965 assert_spin_locked(&device_domain_lock);
4969 list_del(&domain->auxd);
4970 domain->auxd_refcnt--;
4972 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4973 intel_pasid_free_id(domain->default_pasid);
4976 static int aux_domain_add_dev(struct dmar_domain *domain,
4981 unsigned long flags;
4982 struct intel_iommu *iommu;
4984 iommu = device_to_iommu(dev, &bus, &devfn);
4988 if (domain->default_pasid <= 0) {
4991 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4992 pci_max_pasids(to_pci_dev(dev)),
4995 pr_err("Can't allocate default pasid\n");
4998 domain->default_pasid = pasid;
5001 spin_lock_irqsave(&device_domain_lock, flags);
5003 * iommu->lock must be held to attach the domain to the iommu and to
5004 * set up the PASID entry for second-level translation.
5006 spin_lock(&iommu->lock);
5007 ret = domain_attach_iommu(domain, iommu);
5011 /* Set up the PASID entry for mediated devices: */
5012 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5013 domain->default_pasid);
5016 spin_unlock(&iommu->lock);
5018 auxiliary_link_device(domain, dev);
5020 spin_unlock_irqrestore(&device_domain_lock, flags);
5025 domain_detach_iommu(domain, iommu);
5027 spin_unlock(&iommu->lock);
5028 spin_unlock_irqrestore(&device_domain_lock, flags);
5029 if (!domain->auxd_refcnt && domain->default_pasid > 0)
5030 intel_pasid_free_id(domain->default_pasid);
5035 static void aux_domain_remove_dev(struct dmar_domain *domain,
5038 struct device_domain_info *info;
5039 struct intel_iommu *iommu;
5040 unsigned long flags;
5042 if (!is_aux_domain(dev, &domain->domain))
5045 spin_lock_irqsave(&device_domain_lock, flags);
5046 info = dev->archdata.iommu;
5047 iommu = info->iommu;
5049 auxiliary_unlink_device(domain, dev);
5051 spin_lock(&iommu->lock);
5052 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5053 domain_detach_iommu(domain, iommu);
5054 spin_unlock(&iommu->lock);
5056 spin_unlock_irqrestore(&device_domain_lock, flags);
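/*
 * Illustrative sketch (an assumption about callers, not code from this
 * file): the aux-domain entry points above are reached through the generic
 * IOMMU API, roughly as a mediated-device parent driver would use them;
 * error handling is trimmed for brevity.
 */
#if 0	/* example only */
static int example_aux_attach(struct iommu_domain *domain, struct device *dev)
{
	int pasid, ret;

	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	ret = iommu_aux_attach_device(domain, dev); /* -> aux_domain_add_dev() */
	if (ret)
		return ret;

	/* program the returned PASID into the parent device's context */
	pasid = iommu_aux_get_pasid(domain, dev);
	return pasid < 0 ? pasid : 0;
}
#endif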
5059 static int prepare_domain_attach_device(struct iommu_domain *domain,
5062 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5063 struct intel_iommu *iommu;
5067 iommu = device_to_iommu(dev, &bus, &devfn);
5071 /* check if this iommu agaw is sufficient for max mapped address */
5072 addr_width = agaw_to_width(iommu->agaw);
5073 if (addr_width > cap_mgaw(iommu->cap))
5074 addr_width = cap_mgaw(iommu->cap);
5076 if (dmar_domain->max_addr > (1LL << addr_width)) {
5077 dev_err(dev, "%s: iommu width (%d) is not "
5078 "sufficient for the mapped address (%llx)\n",
5079 __func__, addr_width, dmar_domain->max_addr);
5082 dmar_domain->gaw = addr_width;
5085 * Knock out extra levels of page tables if necessary
5087 while (iommu->agaw < dmar_domain->agaw) {
5088 struct dma_pte *pte;
5090 pte = dmar_domain->pgd;
5091 if (dma_pte_present(pte)) {
5092 dmar_domain->pgd = (struct dma_pte *)
5093 phys_to_virt(dma_pte_addr(pte));
5094 free_pgtable_page(pte);
5096 dmar_domain->agaw--;
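/*
 * Worked example (illustrative): a domain built with the default 57-bit
 * width has agaw 3 (a 5-level table); attaching it to hardware whose agaw
 * is 2 (4 levels, 48-bit) pops one top-level directory per iteration so
 * both sides walk the same number of levels.
 */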
5102 static int intel_iommu_attach_device(struct iommu_domain *domain,
5107 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5108 device_is_rmrr_locked(dev)) {
5109 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5113 if (is_aux_domain(dev, domain))
5116 /* normally dev is not mapped */
5117 if (unlikely(domain_context_mapped(dev))) {
5118 struct dmar_domain *old_domain;
5120 old_domain = find_domain(dev);
5122 dmar_remove_one_dev_info(dev);
5125 ret = prepare_domain_attach_device(domain, dev);
5129 return domain_add_dev_info(to_dmar_domain(domain), dev);
5132 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5137 if (!is_aux_domain(dev, domain))
5140 ret = prepare_domain_attach_device(domain, dev);
5144 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5147 static void intel_iommu_detach_device(struct iommu_domain *domain,
5150 dmar_remove_one_dev_info(dev);
5153 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5156 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5159 static int intel_iommu_map(struct iommu_domain *domain,
5160 unsigned long iova, phys_addr_t hpa,
5161 size_t size, int iommu_prot)
5163 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5168 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5171 if (iommu_prot & IOMMU_READ)
5172 prot |= DMA_PTE_READ;
5173 if (iommu_prot & IOMMU_WRITE)
5174 prot |= DMA_PTE_WRITE;
5175 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5176 prot |= DMA_PTE_SNP;
5178 max_addr = iova + size;
5179 if (dmar_domain->max_addr < max_addr) {
5182 /* check if minimum agaw is sufficient for mapped address */
5183 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5184 if (end < max_addr) {
5185 pr_err("%s: iommu width (%d) is not "
5186 "sufficient for the mapped address (%llx)\n",
5187 __func__, dmar_domain->gaw, max_addr);
5190 dmar_domain->max_addr = max_addr;
5192 /* Round size up to the next multiple of PAGE_SIZE if it plus the
5193 low bits of hpa would take us onto the next page. */
5194 size = aligned_nrpages(hpa, size);
5195 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5196 hpa >> VTD_PAGE_SHIFT, size, prot);
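/*
 * Worked example (illustrative): hpa 0x1ff0 with size 0x20 touches bytes
 * 0x1ff0..0x200f, i.e. two 4KiB pages, so aligned_nrpages() returns 2
 * even though size is far below VTD_PAGE_SIZE.
 */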
5200 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5201 unsigned long iova, size_t size)
5203 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5204 struct page *freelist = NULL;
5205 unsigned long start_pfn, last_pfn;
5206 unsigned int npages;
5207 int iommu_id, level = 0;
5209 /* Cope with the horrid API, which requires us to unmap more than the
5210 size argument asks for when the IOVA falls within a large-page mapping. */
5211 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5212 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5215 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5216 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5218 start_pfn = iova >> VTD_PAGE_SHIFT;
5219 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5221 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5223 npages = last_pfn - start_pfn + 1;
5225 for_each_domain_iommu(iommu_id, dmar_domain)
5226 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5227 start_pfn, npages, !freelist, 0);
5229 dma_free_pagelist(freelist);
5231 if (dmar_domain->max_addr == iova + size)
5232 dmar_domain->max_addr = iova;
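/*
 * Worked example (illustrative): if the IOVA lands in a 2MiB superpage,
 * pfn_to_dma_pte() reports level 2, so a 4KiB request is widened to
 * VTD_PAGE_SIZE << level_to_offset_bits(2), i.e. 2MiB, and the whole
 * superpage is torn down.
 */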
5237 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5240 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5241 struct dma_pte *pte;
5245 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5248 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5250 phys = dma_pte_addr(pte);
5255 static inline bool scalable_mode_support(void)
5257 struct dmar_drhd_unit *drhd;
5258 struct intel_iommu *iommu;
5262 for_each_active_iommu(iommu, drhd) {
5263 if (!sm_supported(iommu)) {
5273 static inline bool iommu_pasid_support(void)
5275 struct dmar_drhd_unit *drhd;
5276 struct intel_iommu *iommu;
5280 for_each_active_iommu(iommu, drhd) {
5281 if (!pasid_supported(iommu)) {
5291 static bool intel_iommu_capable(enum iommu_cap cap)
5293 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5294 return domain_update_iommu_snooping(NULL) == 1;
5295 if (cap == IOMMU_CAP_INTR_REMAP)
5296 return irq_remapping_enabled == 1;
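/*
 * Illustrative sketch (an assumption about callers, not code from this
 * file): a consumer such as VFIO queries these bits through the generic
 * helper, e.g.
 *   iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY)
 * before deciding whether IOMMU_CACHE (DMA_PTE_SNP) is usable on maps.
 */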
5301 static int intel_iommu_add_device(struct device *dev)
5303 struct dmar_domain *dmar_domain;
5304 struct iommu_domain *domain;
5305 struct intel_iommu *iommu;
5306 struct iommu_group *group;
5310 iommu = device_to_iommu(dev, &bus, &devfn);
5314 iommu_device_link(&iommu->iommu, dev);
5316 if (translation_pre_enabled(iommu))
5317 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5319 group = iommu_group_get_for_dev(dev);
5322 return PTR_ERR(group);
5324 iommu_group_put(group);
5326 domain = iommu_get_domain_for_dev(dev);
5327 dmar_domain = to_dmar_domain(domain);
5328 if (domain->type == IOMMU_DOMAIN_DMA) {
5329 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5330 ret = iommu_request_dm_for_dev(dev);
5332 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5333 domain_add_dev_info(si_domain, dev);
5335 "Device uses a private identity domain.\n");
5339 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5340 ret = iommu_request_dma_domain_for_dev(dev);
5342 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5343 if (!get_private_domain_for_dev(dev)) {
5345 "Failed to get a private domain.\n");
5350 "Device uses a private dma domain.\n");
5358 static void intel_iommu_remove_device(struct device *dev)
5360 struct intel_iommu *iommu;
5363 iommu = device_to_iommu(dev, &bus, &devfn);
5367 iommu_group_remove_device(dev);
5369 iommu_device_unlink(&iommu->iommu, dev);
5372 static void intel_iommu_get_resv_regions(struct device *device,
5373 struct list_head *head)
5375 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5376 struct iommu_resv_region *reg;
5377 struct dmar_rmrr_unit *rmrr;
5378 struct device *i_dev;
5381 down_read(&dmar_global_lock);
5382 for_each_rmrr_units(rmrr) {
5383 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5385 struct iommu_resv_region *resv;
5386 enum iommu_resv_type type;
5389 if (i_dev != device &&
5390 !is_downstream_to_pci_bridge(device, i_dev))
5393 length = rmrr->end_address - rmrr->base_address + 1;
5395 type = device_rmrr_is_relaxable(device) ?
5396 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5398 resv = iommu_alloc_resv_region(rmrr->base_address,
5399 length, prot, type);
5403 list_add_tail(&resv->list, head);
5406 up_read(&dmar_global_lock);
5408 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5409 if (dev_is_pci(device)) {
5410 struct pci_dev *pdev = to_pci_dev(device);
5412 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5413 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5416 list_add_tail(&reg->list, head);
5419 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5421 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5422 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5426 list_add_tail(&reg->list, head);
5429 static void intel_iommu_put_resv_regions(struct device *dev,
5430 struct list_head *head)
5432 struct iommu_resv_region *entry, *next;
5434 list_for_each_entry_safe(entry, next, head, list)
5438 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5440 struct device_domain_info *info;
5441 struct context_entry *context;
5442 struct dmar_domain *domain;
5443 unsigned long flags;
5447 domain = find_domain(dev);
5451 spin_lock_irqsave(&device_domain_lock, flags);
5452 spin_lock(&iommu->lock);
5455 info = dev->archdata.iommu;
5456 if (!info || !info->pasid_supported)
5459 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5460 if (WARN_ON(!context))
5463 ctx_lo = context[0].lo;
5465 if (!(ctx_lo & CONTEXT_PASIDE)) {
5466 ctx_lo |= CONTEXT_PASIDE;
5467 context[0].lo = ctx_lo;
5469 iommu->flush.flush_context(iommu,
5470 domain->iommu_did[iommu->seq_id],
5471 PCI_DEVID(info->bus, info->devfn),
5472 DMA_CCMD_MASK_NOBIT,
5473 DMA_CCMD_DEVICE_INVL);
5476 /* Enable PASID support in the device, if it wasn't already */
5477 if (!info->pasid_enabled)
5478 iommu_enable_dev_iotlb(info);
5483 spin_unlock(&iommu->lock);
5484 spin_unlock_irqrestore(&device_domain_lock, flags);
5489 static void intel_iommu_apply_resv_region(struct device *dev,
5490 struct iommu_domain *domain,
5491 struct iommu_resv_region *region)
5493 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5494 unsigned long start, end;
5496 start = IOVA_PFN(region->start);
5497 end = IOVA_PFN(region->start + region->length - 1);
5499 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
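/*
 * Worked example (illustrative): for the IOAPIC region built in
 * intel_iommu_get_resv_regions() (base 0xfee00000, length 0x100000),
 * IOVA_PFN() yields the inclusive PFN range 0xfee00..0xfeeff, which
 * reserve_iova() removes from the domain's allocator.
 */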
5502 #ifdef CONFIG_INTEL_IOMMU_SVM
5503 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5505 struct intel_iommu *iommu;
5508 if (iommu_dummy(dev)) {
5510 "No IOMMU translation for device; cannot enable SVM\n");
5514 iommu = device_to_iommu(dev, &bus, &devfn);
5516 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5522 #endif /* CONFIG_INTEL_IOMMU_SVM */
5524 static int intel_iommu_enable_auxd(struct device *dev)
5526 struct device_domain_info *info;
5527 struct intel_iommu *iommu;
5528 unsigned long flags;
5532 iommu = device_to_iommu(dev, &bus, &devfn);
5533 if (!iommu || dmar_disabled)
5536 if (!sm_supported(iommu) || !pasid_supported(iommu))
5539 ret = intel_iommu_enable_pasid(iommu, dev);
5543 spin_lock_irqsave(&device_domain_lock, flags);
5544 info = dev->archdata.iommu;
5545 info->auxd_enabled = 1;
5546 spin_unlock_irqrestore(&device_domain_lock, flags);
5551 static int intel_iommu_disable_auxd(struct device *dev)
5553 struct device_domain_info *info;
5554 unsigned long flags;
5556 spin_lock_irqsave(&device_domain_lock, flags);
5557 info = dev->archdata.iommu;
5558 if (!WARN_ON(!info))
5559 info->auxd_enabled = 0;
5560 spin_unlock_irqrestore(&device_domain_lock, flags);
5566 * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5567 * defined in section 3.7 of the Intel Scalable I/O Virtualization
5568 * technical specification so that system software and tools can detect
5569 * endpoint devices supporting Intel Scalable I/O Virtualization without
5570 * any host driver dependency.
5571 *
5572 * Returns the offset of the matching extended capability structure within
5573 * the device's PCI configuration space, or 0 if the device does not support it.
5575 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5580 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5582 pci_read_config_word(pdev, pos + 4, &vendor);
5583 pci_read_config_word(pdev, pos + 8, &id);
5584 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5587 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
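/*
 * Layout note (per the PCIe DVSEC definition; not from this file): the
 * words read above sit at fixed offsets from the capability header:
 *   +0x00  extended capability header (cap ID 0x23 = DVSEC)
 *   +0x04  DVSEC vendor ID in bits 15:0 (must be 0x8086 here)
 *   +0x08  DVSEC ID in bits 15:0 (5 identifies Scalable IOV)
 */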
5594 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5596 if (feat == IOMMU_DEV_FEAT_AUX) {
5599 if (!dev_is_pci(dev) || dmar_disabled ||
5600 !scalable_mode_support() || !iommu_pasid_support())
5603 ret = pci_pasid_features(to_pci_dev(dev));
5607 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5614 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5616 if (feat == IOMMU_DEV_FEAT_AUX)
5617 return intel_iommu_enable_auxd(dev);
5623 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5625 if (feat == IOMMU_DEV_FEAT_AUX)
5626 return intel_iommu_disable_auxd(dev);
5632 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5634 struct device_domain_info *info = dev->archdata.iommu;
5636 if (feat == IOMMU_DEV_FEAT_AUX)
5637 return scalable_mode_support() && info && info->auxd_enabled;
5643 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5645 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5647 return dmar_domain->default_pasid > 0 ?
5648 dmar_domain->default_pasid : -EINVAL;
5651 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5654 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5657 const struct iommu_ops intel_iommu_ops = {
5658 .capable = intel_iommu_capable,
5659 .domain_alloc = intel_iommu_domain_alloc,
5660 .domain_free = intel_iommu_domain_free,
5661 .attach_dev = intel_iommu_attach_device,
5662 .detach_dev = intel_iommu_detach_device,
5663 .aux_attach_dev = intel_iommu_aux_attach_device,
5664 .aux_detach_dev = intel_iommu_aux_detach_device,
5665 .aux_get_pasid = intel_iommu_aux_get_pasid,
5666 .map = intel_iommu_map,
5667 .unmap = intel_iommu_unmap,
5668 .iova_to_phys = intel_iommu_iova_to_phys,
5669 .add_device = intel_iommu_add_device,
5670 .remove_device = intel_iommu_remove_device,
5671 .get_resv_regions = intel_iommu_get_resv_regions,
5672 .put_resv_regions = intel_iommu_put_resv_regions,
5673 .apply_resv_region = intel_iommu_apply_resv_region,
5674 .device_group = pci_device_group,
5675 .dev_has_feat = intel_iommu_dev_has_feat,
5676 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5677 .dev_enable_feat = intel_iommu_dev_enable_feat,
5678 .dev_disable_feat = intel_iommu_dev_disable_feat,
5679 .is_attach_deferred = intel_iommu_is_attach_deferred,
5680 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
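/*
 * Illustrative correspondence (not exhaustive): once bus_set_iommu() has
 * registered this table, the generic IOMMU API dispatches into it:
 *   iommu_domain_alloc(&pci_bus_type)        -> intel_iommu_domain_alloc()
 *   iommu_attach_device(domain, dev)         -> intel_iommu_attach_device()
 *   iommu_map(domain, iova, paddr, sz, prot) -> intel_iommu_map()
 *   iommu_unmap(domain, iova, sz)            -> intel_iommu_unmap()
 */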
5683 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5685 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5686 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5698 static void quirk_iommu_rwbf(struct pci_dev *dev)
5701 * Mobile 4 Series Chipset neglects to set RWBF capability,
5702 * but needs it. Same seems to hold for the desktop versions.
5704 pci_info(dev, "Forcing write-buffer flush capability\n");
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5717 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5718 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5719 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5720 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5721 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5722 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5723 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5724 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
5726 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5730 if (pci_read_config_word(dev, GGC, &ggc))
5733 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5734 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5736 } else if (dmar_map_gfx) {
5737 /* we have to ensure the gfx device is idle before we flush */
5738 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5739 intel_iommu_strict = 1;
5742 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5743 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5744 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5745 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5747 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5748 ISOCH DMAR unit for the Azalia sound device, but not give it any
5749 TLB entries, which causes it to deadlock. Check for that. We do
5750 this in a function called from init_dmars(), instead of in a PCI
5751 quirk, because we don't want to print the obnoxious "BIOS broken"
5752 message if VT-d is actually disabled.
5754 static void __init check_tylersburg_isoch(void)
5756 struct pci_dev *pdev;
5757 uint32_t vtisochctrl;
5759 /* If there's no Azalia in the system anyway, forget it. */
5760 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5765 /* System Management Registers. Might be hidden, in which case
5766 we can't do the sanity check. But that's OK, because the
5767 known-broken BIOSes _don't_ actually hide it, so far. */
5768 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5772 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5779 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5780 if (vtisochctrl & 1)
5783 /* Drop all bits other than the number of TLB entries */
5784 vtisochctrl &= 0x1c;
5786 /* If we have the recommended number of TLB entries (16), fine. */
5787 if (vtisochctrl == 0x10)
5790 /* Zero TLB entries? You get to ride the short bus to school. */
5792 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5793 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5794 dmi_get_system_info(DMI_BIOS_VENDOR),
5795 dmi_get_system_info(DMI_BIOS_VERSION),
5796 dmi_get_system_info(DMI_PRODUCT_VERSION));
5797 iommu_identity_mapping |= IDENTMAP_AZALIA;
5801 pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",