// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright © 2006-2014 Intel Corporation.
 *
 * Authors: David Woodhouse <dwmw2@infradead.org>,
 *          Ashok Raj <ashok.raj@intel.com>,
 *          Shaohua Li <shaohua.li@intel.com>,
 *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
 *          Fenghua Yu <fenghua.yu@intel.com>
 *          Joerg Roedel <jroedel@suse.de>
 */

#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)
#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include <linux/memory.h>
#include <linux/cpu.h>
#include <linux/timer.h>
#include <linux/io.h>
#include <linux/iova.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/syscore_ops.h>
#include <linux/tboot.h>
#include <linux/dmi.h>
#include <linux/pci-ats.h>
#include <linux/memblock.h>
#include <linux/dma-contiguous.h>
#include <linux/dma-direct.h>
#include <linux/crash_dump.h>
#include <linux/numa.h>
#include <asm/irq_remapping.h>
#include <asm/cacheflush.h>
#include <asm/iommu.h>

#include "irq_remapping.h"
#include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
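/*
 * Worked example (illustrative, not from the original source): with gaw = 48
 * and VTD_PAGE_SHIFT = 12, __DOMAIN_MAX_PFN(48) is 2^36 - 1 and
 * __DOMAIN_MAX_ADDR(48) is 2^48 - 1. The min_t() clamp in DOMAIN_MAX_PFN()
 * only matters when the PFN would not fit in an unsigned long, e.g. on
 * 32-bit kernels, where it is limited to ULONG_MAX.
 */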
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is a power-of-two multiple of 4KiB and that
 * the mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are a power-of-two multiple of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
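/*
 * For example, ~0xFFFUL leaves every bit from bit 12 upwards set, so the
 * IOMMU core sees 4KiB, 8KiB, 16KiB, ... as supported sizes and never has
 * to split a physically contiguous region before handing it to us.
 */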
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
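/*
 * Worked example of the AGAW/level arithmetic above (assuming the usual
 * 4KiB VT-d page and 9-bit stride): a 48-bit address width gives
 * width_to_agaw(48) = 2, agaw_to_level(2) = 4 (a 4-level table) and
 * agaw_to_width(2) = 48 again. At level 2 an entry spans
 * level_size(2) = 512 4KiB pages, i.e. 2MiB, and pfn_level_offset()
 * extracts bits 9-17 of the page frame number as the index into that
 * level's table.
 */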
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
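/*
 * Note: on x86 with 4KiB pages PAGE_SHIFT == VTD_PAGE_SHIFT, so the
 * mm<->dma pfn conversions above are identity operations; they only shift
 * when the kernel page size is larger than the 4KiB VT-d page size.
 */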
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
176 * set to 1 to panic kernel if can't successfully enable VT-d
177 * (used when kernel is launched w/ TXT)
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
282 static inline void context_clear_entry(struct context_entry *context)
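/*
 * Rough summary of the legacy-mode context entry layout that the helpers
 * above manipulate: in the low 64 bits, bit 0 is Present, bit 1 disables
 * fault reporting, bits 2-3 select the translation type and bits 12-63 hold
 * the page-table root; in the high 64 bits, bits 0-2 encode the address
 * width and bits 8-23 the domain id. Bit 11 of the low word and bit 3 of
 * the high word are used by this driver to track PASID enabling and entries
 * copied from a previous (kdump) kernel.
 */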
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
static struct dmar_domain *si_domain;
static int hw_pass_through = 1;

/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY		BIT(0)
/*
 * This is a DMA domain allocated through the iommu domain allocation
 * interface. But one or more devices belonging to this domain have
 * been chosen to use a private domain. We should avoid using the
 * map/unmap/iova_to_phys APIs on it.
 */
#define DOMAIN_FLAG_LOSE_CHILDREN		BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
344 static int domain_detach_iommu(struct dmar_domain *domain,
345 struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
366 #define IDENTMAP_ALL 1
367 #define IDENTMAP_GFX 2
368 #define IDENTMAP_AZALIA 4
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
379 * Iterate over elements in device_domain_list and call the specified
380 * callback @fn against each element.
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383 void *data), void *data)
387 struct device_domain_info *info;
389 spin_lock_irqsave(&device_domain_lock, flags);
390 list_for_each_entry(info, &device_domain_list, global) {
391 ret = fn(info, data);
393 spin_unlock_irqrestore(&device_domain_lock, flags);
397 spin_unlock_irqrestore(&device_domain_lock, flags);
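/*
 * Sketch of how a caller might use the iterator above (the callback name is
 * hypothetical, not part of this file). A non-zero return value from the
 * callback stops the walk and is passed back to the caller:
 *
 *	static int count_info(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *	...
 *	int count = 0;
 *	for_each_device_domain(count_info, &count);
 */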
402 const struct iommu_ops intel_iommu_ops;
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
406 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
411 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
414 static void init_translation_status(struct intel_iommu *iommu)
418 gsts = readl(iommu->reg + DMAR_GSTS_REG);
419 if (gsts & DMA_GSTS_TES)
420 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
/* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
426 return container_of(dom, struct dmar_domain, domain);
429 static int __init intel_iommu_setup(char *str)
434 if (!strncmp(str, "on", 2)) {
436 pr_info("IOMMU enabled\n");
437 } else if (!strncmp(str, "off", 3)) {
439 no_platform_optin = 1;
440 pr_info("IOMMU disabled\n");
441 } else if (!strncmp(str, "igfx_off", 8)) {
443 pr_info("Disable GFX device mapping\n");
444 } else if (!strncmp(str, "forcedac", 8)) {
445 pr_info("Forcing DAC for PCI devices\n");
447 } else if (!strncmp(str, "strict", 6)) {
448 pr_info("Disable batched IOTLB flush\n");
449 intel_iommu_strict = 1;
450 } else if (!strncmp(str, "sp_off", 6)) {
451 pr_info("Disable supported super page\n");
452 intel_iommu_superpage = 0;
453 } else if (!strncmp(str, "sm_on", 5)) {
454 pr_info("Intel-IOMMU: scalable mode supported\n");
456 } else if (!strncmp(str, "tboot_noforce", 13)) {
458 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459 intel_iommu_tboot_noforce = 1;
462 str += strcspn(str, ",");
468 __setup("intel_iommu=", intel_iommu_setup);
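/*
 * The parser above accepts a comma-separated list, so boot command lines
 * such as "intel_iommu=on,sm_on" (enable the IOMMU and scalable mode) or
 * "intel_iommu=on,strict,sp_off" (synchronous IOTLB flushes, no superpages)
 * are all handled by this single __setup() hook.
 */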
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
473 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
475 struct dmar_domain **domains;
478 domains = iommu->domains[idx];
482 return domains[did & 0xff];
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486 struct dmar_domain *domain)
488 struct dmar_domain **domains;
491 if (!iommu->domains[idx]) {
492 size_t size = 256 * sizeof(struct dmar_domain *);
493 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
496 domains = iommu->domains[idx];
497 if (WARN_ON(!domains))
500 domains[did & 0xff] = domain;
503 void *alloc_pgtable_page(int node)
508 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
510 vaddr = page_address(page);
514 void free_pgtable_page(void *vaddr)
516 free_page((unsigned long)vaddr);
519 static inline void *alloc_domain_mem(void)
521 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
524 static void free_domain_mem(void *vaddr)
526 kmem_cache_free(iommu_domain_cache, vaddr);
529 static inline void * alloc_devinfo_mem(void)
531 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
534 static inline void free_devinfo_mem(void *vaddr)
536 kmem_cache_free(iommu_devinfo_cache, vaddr);
539 static inline int domain_type_is_si(struct dmar_domain *domain)
541 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
547 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
549 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
557 sagaw = cap_sagaw(iommu->cap);
558 for (agaw = width_to_agaw(max_gaw);
560 if (test_bit(agaw, &sagaw))
568 * Calculate max SAGAW for each iommu.
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
572 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
/*
 * Calculate the AGAW for each iommu.
 * "SAGAW" may differ across iommus; use a default agaw, and fall back to
 * a smaller supported agaw for iommus that don't support the default.
 */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
582 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
/* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
590 /* si_domain and vm domain should not get here. */
591 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
594 for_each_domain_iommu(iommu_id, domain)
597 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
600 return g_iommus[iommu_id];
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
605 struct dmar_drhd_unit *drhd;
606 struct intel_iommu *iommu;
610 domain->iommu_coherency = 1;
612 for_each_domain_iommu(i, domain) {
614 if (!ecap_coherent(g_iommus[i]->ecap)) {
615 domain->iommu_coherency = 0;
622 /* No hardware attached; use lowest common denominator */
624 for_each_active_iommu(iommu, drhd) {
625 if (!ecap_coherent(iommu->ecap)) {
626 domain->iommu_coherency = 0;
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
635 struct dmar_drhd_unit *drhd;
636 struct intel_iommu *iommu;
640 for_each_active_iommu(iommu, drhd) {
642 if (!ecap_sc_support(iommu->ecap)) {
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
655 struct dmar_drhd_unit *drhd;
656 struct intel_iommu *iommu;
659 if (!intel_iommu_superpage) {
663 /* set iommu_superpage to the smallest common denominator */
665 for_each_active_iommu(iommu, drhd) {
667 mask &= cap_super_page_val(iommu->cap);
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
680 domain_update_iommu_coherency(domain);
681 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
688 struct root_entry *root = &iommu->root_entry[bus];
689 struct context_entry *context;
693 if (sm_supported(iommu)) {
701 context = phys_to_virt(*entry & VTD_PAGE_MASK);
703 unsigned long phy_addr;
707 context = alloc_pgtable_page(iommu->node);
711 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712 phy_addr = virt_to_phys((void *)context);
713 *entry = phy_addr | 1;
714 __iommu_flush_cache(iommu, entry, sizeof(*entry));
716 return &context[devfn];
719 static int iommu_dummy(struct device *dev)
721 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
725 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726 * sub-hierarchy of a candidate PCI-PCI bridge
727 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728 * @bridge: the candidate PCI-PCI bridge
730 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
735 struct pci_dev *pdev, *pbridge;
737 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
740 pdev = to_pci_dev(dev);
741 pbridge = to_pci_dev(bridge);
743 if (pbridge->subordinate &&
744 pbridge->subordinate->number <= pdev->bus->number &&
745 pbridge->subordinate->busn_res.end >= pdev->bus->number)
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
753 struct dmar_drhd_unit *drhd = NULL;
754 struct intel_iommu *iommu;
756 struct pci_dev *pdev = NULL;
760 if (iommu_dummy(dev))
763 if (dev_is_pci(dev)) {
764 struct pci_dev *pf_pdev;
766 pdev = to_pci_dev(dev);
769 /* VMD child devices currently cannot be handled individually */
770 if (is_vmd(pdev->bus))
774 /* VFs aren't listed in scope tables; we need to look up
775 * the PF instead to find the IOMMU. */
776 pf_pdev = pci_physfn(pdev);
778 segment = pci_domain_nr(pdev->bus);
779 } else if (has_acpi_companion(dev))
780 dev = &ACPI_COMPANION(dev)->dev;
783 for_each_active_iommu(iommu, drhd) {
784 if (pdev && segment != drhd->segment)
787 for_each_active_dev_scope(drhd->devices,
788 drhd->devices_cnt, i, tmp) {
790 /* For a VF use its original BDF# not that of the PF
791 * which we used for the IOMMU lookup. Strictly speaking
792 * we could do this for all PCI devices; we only need to
793 * get the BDF# from the scope table for ACPI matches. */
794 if (pdev && pdev->is_virtfn)
797 *bus = drhd->devices[i].bus;
798 *devfn = drhd->devices[i].devfn;
802 if (is_downstream_to_pci_bridge(dev, tmp))
806 if (pdev && drhd->include_all) {
808 *bus = pdev->bus->number;
809 *devfn = pdev->devfn;
820 static void domain_flush_cache(struct dmar_domain *domain,
821 void *addr, int size)
823 if (!domain->iommu_coherency)
824 clflush_cache_range(addr, size);
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
829 struct context_entry *context;
833 spin_lock_irqsave(&iommu->lock, flags);
834 context = iommu_context_addr(iommu, bus, devfn, 0);
836 ret = context_present(context);
837 spin_unlock_irqrestore(&iommu->lock, flags);
841 static void free_context_table(struct intel_iommu *iommu)
845 struct context_entry *context;
847 spin_lock_irqsave(&iommu->lock, flags);
848 if (!iommu->root_entry) {
851 for (i = 0; i < ROOT_ENTRY_NR; i++) {
852 context = iommu_context_addr(iommu, i, 0, 0);
854 free_pgtable_page(context);
856 if (!sm_supported(iommu))
859 context = iommu_context_addr(iommu, i, 0x80, 0);
861 free_pgtable_page(context);
864 free_pgtable_page(iommu->root_entry);
865 iommu->root_entry = NULL;
867 spin_unlock_irqrestore(&iommu->lock, flags);
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871 unsigned long pfn, int *target_level)
873 struct dma_pte *parent, *pte;
874 int level = agaw_to_level(domain->agaw);
877 BUG_ON(!domain->pgd);
879 if (!domain_pfn_supported(domain, pfn))
880 /* Address beyond IOMMU's addressing capabilities. */
883 parent = domain->pgd;
888 offset = pfn_level_offset(pfn, level);
889 pte = &parent[offset];
890 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
892 if (level == *target_level)
895 if (!dma_pte_present(pte)) {
898 tmp_page = alloc_pgtable_page(domain->nid);
903 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905 if (cmpxchg64(&pte->val, 0ULL, pteval))
906 /* Someone else set it while we were thinking; use theirs. */
907 free_pgtable_page(tmp_page);
909 domain_flush_cache(domain, pte, sizeof(*pte));
914 parent = phys_to_virt(dma_pte_addr(pte));
919 *target_level = level;
924 /* return address's pte at specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
927 int level, int *large_page)
929 struct dma_pte *parent, *pte;
930 int total = agaw_to_level(domain->agaw);
933 parent = domain->pgd;
934 while (level <= total) {
935 offset = pfn_level_offset(pfn, total);
936 pte = &parent[offset];
940 if (!dma_pte_present(pte)) {
945 if (dma_pte_superpage(pte)) {
950 parent = phys_to_virt(dma_pte_addr(pte));
/* clear last level pte; a tlb flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958 unsigned long start_pfn,
959 unsigned long last_pfn)
961 unsigned int large_page;
962 struct dma_pte *first_pte, *pte;
964 BUG_ON(!domain_pfn_supported(domain, start_pfn));
965 BUG_ON(!domain_pfn_supported(domain, last_pfn));
966 BUG_ON(start_pfn > last_pfn);
968 /* we don't need lock here; nobody else touches the iova range */
971 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
973 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
978 start_pfn += lvl_to_nr_pages(large_page);
980 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
982 domain_flush_cache(domain, first_pte,
983 (void *)pte - (void *)first_pte);
985 } while (start_pfn && start_pfn <= last_pfn);
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989 int retain_level, struct dma_pte *pte,
990 unsigned long pfn, unsigned long start_pfn,
991 unsigned long last_pfn)
993 pfn = max(start_pfn, pfn);
994 pte = &pte[pfn_level_offset(pfn, level)];
997 unsigned long level_pfn;
998 struct dma_pte *level_pte;
1000 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1003 level_pfn = pfn & level_mask(level);
1004 level_pte = phys_to_virt(dma_pte_addr(pte));
1007 dma_pte_free_level(domain, level - 1, retain_level,
1008 level_pte, level_pfn, start_pfn,
1013 * Free the page table if we're below the level we want to
1014 * retain and the range covers the entire table.
1016 if (level < retain_level && !(start_pfn > level_pfn ||
1017 last_pfn < level_pfn + level_size(level) - 1)) {
1019 domain_flush_cache(domain, pte, sizeof(*pte));
1020 free_pgtable_page(level_pte);
1023 pfn += level_size(level);
1024 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1028 * clear last level (leaf) ptes and free page table pages below the
1029 * level we wish to keep intact.
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032 unsigned long start_pfn,
1033 unsigned long last_pfn,
1036 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038 BUG_ON(start_pfn > last_pfn);
1040 dma_pte_clear_range(domain, start_pfn, last_pfn);
1042 /* We don't need lock here; nobody else touches the iova range */
1043 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044 domain->pgd, 0, start_pfn, last_pfn);
1047 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048 free_pgtable_page(domain->pgd);
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060 int level, struct dma_pte *pte,
1061 struct page *freelist)
1065 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066 pg->freelist = freelist;
1072 pte = page_address(pg);
1074 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075 freelist = dma_pte_list_pagetables(domain, level - 1,
1078 } while (!first_pte_in_page(pte));
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084 struct dma_pte *pte, unsigned long pfn,
1085 unsigned long start_pfn,
1086 unsigned long last_pfn,
1087 struct page *freelist)
1089 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1091 pfn = max(start_pfn, pfn);
1092 pte = &pte[pfn_level_offset(pfn, level)];
1095 unsigned long level_pfn;
1097 if (!dma_pte_present(pte))
1100 level_pfn = pfn & level_mask(level);
1102 /* If range covers entire pagetable, free it */
1103 if (start_pfn <= level_pfn &&
1104 last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1107 if (level > 1 && !dma_pte_superpage(pte))
1108 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1114 } else if (level > 1) {
1115 /* Recurse down into a level that isn't *entirely* obsolete */
1116 freelist = dma_pte_clear_level(domain, level - 1,
1117 phys_to_virt(dma_pte_addr(pte)),
1118 level_pfn, start_pfn, last_pfn,
1122 pfn += level_size(level);
1123 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1126 domain_flush_cache(domain, first_pte,
1127 (void *)++last_pte - (void *)first_pte);
1132 /* We can't just free the pages because the IOMMU may still be walking
1133 the page tables, and may have cached the intermediate levels. The
1134 pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136 unsigned long start_pfn,
1137 unsigned long last_pfn)
1139 struct page *freelist;
1141 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143 BUG_ON(start_pfn > last_pfn);
1145 /* we don't need lock here; nobody else touches the iova range */
1146 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147 domain->pgd, 0, start_pfn, last_pfn, NULL);
1150 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151 struct page *pgd_page = virt_to_page(domain->pgd);
1152 pgd_page->freelist = freelist;
1153 freelist = pgd_page;
1161 static void dma_free_pagelist(struct page *freelist)
1165 while ((pg = freelist)) {
1166 freelist = pg->freelist;
1167 free_pgtable_page(page_address(pg));
1171 static void iova_entry_free(unsigned long data)
1173 struct page *freelist = (struct page *)data;
1175 dma_free_pagelist(freelist);
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1181 struct root_entry *root;
1182 unsigned long flags;
1184 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1186 pr_err("Allocating root entry for %s failed\n",
1191 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1193 spin_lock_irqsave(&iommu->lock, flags);
1194 iommu->root_entry = root;
1195 spin_unlock_irqrestore(&iommu->lock, flags);
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1206 addr = virt_to_phys(iommu->root_entry);
1207 if (sm_supported(iommu))
1208 addr |= DMA_RTADDR_SMT;
1210 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1213 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1215 /* Make sure hardware complete it */
1216 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217 readl, (sts & DMA_GSTS_RTPS), sts);
1219 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1227 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1230 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1233 /* Make sure hardware complete it */
1234 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235 readl, (!(val & DMA_GSTS_WBFS)), val);
1237 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242 u16 did, u16 source_id, u8 function_mask,
1249 case DMA_CCMD_GLOBAL_INVL:
1250 val = DMA_CCMD_GLOBAL_INVL;
1252 case DMA_CCMD_DOMAIN_INVL:
1253 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1255 case DMA_CCMD_DEVICE_INVL:
1256 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1262 val |= DMA_CCMD_ICC;
1264 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1267 /* Make sure hardware complete it */
1268 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1271 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* return value determines if we need a write buffer flush */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276 u64 addr, unsigned int size_order, u64 type)
1278 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279 u64 val = 0, val_iva = 0;
1283 case DMA_TLB_GLOBAL_FLUSH:
1284 /* global flush doesn't need set IVA_REG */
1285 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1287 case DMA_TLB_DSI_FLUSH:
1288 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 case DMA_TLB_PSI_FLUSH:
1291 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292 /* IH bit is passed in as part of address */
1293 val_iva = size_order | addr;
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;
1310 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311 /* Note: Only uses first TLB reg currently */
1313 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1316 /* Make sure hardware complete it */
1317 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1320 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1322 /* check IOTLB invalidation granularity */
1323 if (DMA_TLB_IAIG(val) == 0)
1324 pr_err("Flush IOTLB failed\n");
1325 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327 (unsigned long long)DMA_TLB_IIRG(type),
1328 (unsigned long long)DMA_TLB_IAIG(val));
1331 static struct device_domain_info *
1332 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1335 struct device_domain_info *info;
1337 assert_spin_locked(&device_domain_lock);
1342 list_for_each_entry(info, &domain->devices, link)
1343 if (info->iommu == iommu && info->bus == bus &&
1344 info->devfn == devfn) {
1345 if (info->ats_supported && info->dev)
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1355 struct device_domain_info *info;
1356 bool has_iotlb_device = false;
1358 assert_spin_locked(&device_domain_lock);
1360 list_for_each_entry(info, &domain->devices, link) {
1361 struct pci_dev *pdev;
1363 if (!info->dev || !dev_is_pci(info->dev))
1366 pdev = to_pci_dev(info->dev);
1367 if (pdev->ats_enabled) {
1368 has_iotlb_device = true;
1373 domain->has_iotlb_device = has_iotlb_device;
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1378 struct pci_dev *pdev;
1380 assert_spin_locked(&device_domain_lock);
1382 if (!info || !dev_is_pci(info->dev))
1385 pdev = to_pci_dev(info->dev);
	/* For an IOMMU that supports device IOTLB throttling (DIT), we assign a
	 * PFSID to the invalidation desc of a VF so that IOMMU HW can gauge
	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
	 * reserved, which should be set to 0.
	 */
1391 if (!ecap_dit(info->iommu->ecap))
1394 struct pci_dev *pf_pdev;
1396 /* pdev will be returned if device is not a vf */
1397 pf_pdev = pci_physfn(pdev);
1398 info->pfsid = pci_dev_id(pf_pdev);
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
	/* The PCIe spec, in its wisdom, declares that the behaviour of
	   the device if you enable PASID support after ATS support is
	   undefined. So always enable PASID support on devices which
	   have it, even if we can't yet know if we're ever going to
	   use it. */
1407 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408 info->pasid_enabled = 1;
1410 if (info->pri_supported &&
1411 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1412 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413 info->pri_enabled = 1;
1415 if (!pdev->untrusted && info->ats_supported &&
1416 pci_ats_page_aligned(pdev) &&
1417 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418 info->ats_enabled = 1;
1419 domain_update_iotlb(info->domain);
1420 info->ats_qdep = pci_ats_queue_depth(pdev);
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1426 struct pci_dev *pdev;
1428 assert_spin_locked(&device_domain_lock);
1430 if (!dev_is_pci(info->dev))
1433 pdev = to_pci_dev(info->dev);
1435 if (info->ats_enabled) {
1436 pci_disable_ats(pdev);
1437 info->ats_enabled = 0;
1438 domain_update_iotlb(info->domain);
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441 if (info->pri_enabled) {
1442 pci_disable_pri(pdev);
1443 info->pri_enabled = 0;
1445 if (info->pasid_enabled) {
1446 pci_disable_pasid(pdev);
1447 info->pasid_enabled = 0;
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453 u64 addr, unsigned mask)
1456 unsigned long flags;
1457 struct device_domain_info *info;
1459 if (!domain->has_iotlb_device)
1462 spin_lock_irqsave(&device_domain_lock, flags);
1463 list_for_each_entry(info, &domain->devices, link) {
1464 if (!info->ats_enabled)
1467 sid = info->bus << 8 | info->devfn;
1468 qdep = info->ats_qdep;
1469 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1472 spin_unlock_irqrestore(&device_domain_lock, flags);
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476 struct dmar_domain *domain,
1477 unsigned long pfn, unsigned int pages,
1480 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482 u16 did = domain->iommu_did[iommu->seq_id];
	/*
	 * Fall back to domain-selective flush if there is no PSI support or if
	 * the size is too big.
	 * PSI requires the page size to be 2 ^ x, and the base address to be
	 * naturally aligned to the size.
	 */
1494 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1498 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1502 * In caching mode, changes of pages from non-present to present require
1503 * flush. However, device IOTLB doesn't need to be flushed in this case.
1505 if (!cap_caching_mode(iommu->cap) || !map)
1506 iommu_flush_dev_iotlb(domain, addr, mask);
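/*
 * Worked example for the PSI path above: flushing 9 pages rounds up to 16,
 * so mask = ilog2(16) = 4 and the hardware invalidates a naturally aligned
 * 16-page (64KiB) region containing the request. If mask were to exceed
 * cap_max_amask_val(), the code falls back to a domain-selective flush
 * instead.
 */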
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511 struct dmar_domain *domain,
1512 unsigned long pfn, unsigned int pages)
1514 /* It's a non-present to present mapping. Only flush if caching mode */
1515 if (cap_caching_mode(iommu->cap))
1516 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1518 iommu_flush_write_buffer(iommu);
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1523 struct dmar_domain *domain;
1526 domain = container_of(iovad, struct dmar_domain, iovad);
1528 for_each_domain_iommu(idx, domain) {
1529 struct intel_iommu *iommu = g_iommus[idx];
1530 u16 did = domain->iommu_did[iommu->seq_id];
1532 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1534 if (!cap_caching_mode(iommu->cap))
1535 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536 0, MAX_AGAW_PFN_WIDTH);
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1543 unsigned long flags;
1545 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1548 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550 pmen &= ~DMA_PMEN_EPM;
1551 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1553 /* wait for the protected region status bit to clear */
1554 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555 readl, !(pmen & DMA_PMEN_PRS), pmen);
1557 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1563 unsigned long flags;
1565 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566 iommu->gcmd |= DMA_GCMD_TE;
1567 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1569 /* Make sure hardware complete it */
1570 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571 readl, (sts & DMA_GSTS_TES), sts);
1573 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1581 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582 iommu->gcmd &= ~DMA_GCMD_TE;
1583 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1585 /* Make sure hardware complete it */
1586 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587 readl, (!(sts & DMA_GSTS_TES)), sts);
1589 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1594 u32 ndomains, nlongs;
1597 ndomains = cap_ndoms(iommu->cap);
1598 pr_debug("%s: Number of Domains supported <%d>\n",
1599 iommu->name, ndomains);
1600 nlongs = BITS_TO_LONGS(ndomains);
1602 spin_lock_init(&iommu->lock);
1604 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605 if (!iommu->domain_ids) {
1606 pr_err("%s: Allocating domain id array failed\n",
1611 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612 iommu->domains = kzalloc(size, GFP_KERNEL);
1614 if (iommu->domains) {
1615 size = 256 * sizeof(struct dmar_domain *);
1616 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1619 if (!iommu->domains || !iommu->domains[0]) {
1620 pr_err("%s: Allocating domain array failed\n",
1622 kfree(iommu->domain_ids);
1623 kfree(iommu->domains);
1624 iommu->domain_ids = NULL;
1625 iommu->domains = NULL;
1630 * If Caching mode is set, then invalid translations are tagged
1631 * with domain-id 0, hence we need to pre-allocate it. We also
1632 * use domain-id 0 as a marker for non-allocated domain-id, so
1633 * make sure it is not used for a real domain.
1635 set_bit(0, iommu->domain_ids);
	/*
	 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
	 * entry for first-level or pass-through translation modes should
	 * be programmed with a domain id different from those used for
	 * second-level or nested translation. We reserve a domain id for
	 * this purpose.
	 */
1644 if (sm_supported(iommu))
1645 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1652 struct device_domain_info *info, *tmp;
1653 unsigned long flags;
1655 if (!iommu->domains || !iommu->domain_ids)
1658 spin_lock_irqsave(&device_domain_lock, flags);
1659 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660 if (info->iommu != iommu)
1663 if (!info->dev || !info->domain)
1666 __dmar_remove_one_dev_info(info);
1668 spin_unlock_irqrestore(&device_domain_lock, flags);
1670 if (iommu->gcmd & DMA_GCMD_TE)
1671 iommu_disable_translation(iommu);
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1676 if ((iommu->domains) && (iommu->domain_ids)) {
1677 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1680 for (i = 0; i < elems; i++)
1681 kfree(iommu->domains[i]);
1682 kfree(iommu->domains);
1683 kfree(iommu->domain_ids);
1684 iommu->domains = NULL;
1685 iommu->domain_ids = NULL;
1688 g_iommus[iommu->seq_id] = NULL;
1690 /* free context mapping */
1691 free_context_table(iommu);
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694 if (pasid_supported(iommu)) {
1695 if (ecap_prs(iommu->ecap))
1696 intel_svm_finish_prq(iommu);
1701 static struct dmar_domain *alloc_domain(int flags)
1703 struct dmar_domain *domain;
1705 domain = alloc_domain_mem();
1709 memset(domain, 0, sizeof(*domain));
1710 domain->nid = NUMA_NO_NODE;
1711 domain->flags = flags;
1712 domain->has_iotlb_device = false;
1713 INIT_LIST_HEAD(&domain->devices);
1718 /* Must be called with iommu->lock */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720 struct intel_iommu *iommu)
1722 unsigned long ndomains;
1725 assert_spin_locked(&device_domain_lock);
1726 assert_spin_locked(&iommu->lock);
1728 domain->iommu_refcnt[iommu->seq_id] += 1;
1729 domain->iommu_count += 1;
1730 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731 ndomains = cap_ndoms(iommu->cap);
1732 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1734 if (num >= ndomains) {
1735 pr_err("%s: No free domain ids\n", iommu->name);
1736 domain->iommu_refcnt[iommu->seq_id] -= 1;
1737 domain->iommu_count -= 1;
1741 set_bit(num, iommu->domain_ids);
1742 set_iommu_domain(iommu, num, domain);
1744 domain->iommu_did[iommu->seq_id] = num;
1745 domain->nid = iommu->node;
1747 domain_update_iommu_cap(domain);
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754 struct intel_iommu *iommu)
1758 assert_spin_locked(&device_domain_lock);
1759 assert_spin_locked(&iommu->lock);
1761 domain->iommu_refcnt[iommu->seq_id] -= 1;
1762 count = --domain->iommu_count;
1763 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764 num = domain->iommu_did[iommu->seq_id];
1765 clear_bit(num, iommu->domain_ids);
1766 set_iommu_domain(iommu, num, NULL);
1768 domain_update_iommu_cap(domain);
1769 domain->iommu_did[iommu->seq_id] = 0;
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1778 static int dmar_init_reserved_ranges(void)
1780 struct pci_dev *pdev = NULL;
1784 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1786 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787 &reserved_rbtree_key);
1789 /* IOAPIC ranges shouldn't be accessed by DMA */
1790 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791 IOVA_PFN(IOAPIC_RANGE_END));
1793 pr_err("Reserve IOAPIC range failed\n");
1797 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798 for_each_pci_dev(pdev) {
1801 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802 r = &pdev->resource[i];
1803 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1805 iova = reserve_iova(&reserved_iova_list,
1809 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1819 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1825 int r = (gaw - 12) % 9;
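/*
 * guestwidth_to_adjustwidth() rounds the guest address width up to the next
 * value the page-table layout can express (12 plus a multiple of the 9-bit
 * stride): e.g. a 48-bit guest width is already aligned and stays 48, while
 * a 50-bit width ((50 - 12) % 9 == 2) is rounded up to 57.
 */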
1836 static void domain_exit(struct dmar_domain *domain)
1838 struct page *freelist;
1840 /* Remove associated devices and clear attached or cached domains */
1841 domain_remove_dev_info(domain);
1844 put_iova_domain(&domain->iovad);
1846 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1848 dma_free_pagelist(freelist);
1850 free_domain_mem(domain);
1854 * Get the PASID directory size for scalable mode context entry.
1855 * Value of X in the PDTS field of a scalable mode context entry
1856 * indicates PASID directory with 2^(X + 7) entries.
1858 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1862 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1863 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1871 * Set the RID_PASID field of a scalable mode context entry. The
1872 * IOMMU hardware will use the PASID value set in this field for
1873 * DMA translations of DMA requests without PASID.
1876 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1878 context->hi |= pasid & ((1 << 20) - 1);
1879 context->hi |= (1 << 20);
/*
 * Set the DTE (Device-TLB Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_dte(struct context_entry *context)
{
	context->lo |= (1 << 2);
}

/*
 * Set the PRE (Page Request Enable) field of a scalable mode context
 * entry.
 */
static inline void context_set_sm_pre(struct context_entry *context)
{
	context->lo |= (1 << 4);
}
1900 /* Convert value to context PASID directory size field coding. */
1901 #define context_pdts(pds) (((pds) & 0x7) << 9)
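/*
 * Example of the PDTS coding above: a PASID table covering the full 20-bit
 * PASID space has max_pasid = 1 << 20, so max_pde = max_pasid >> PASID_PDE_SHIFT
 * leaves bit 14 set, context_get_sm_pds() returns 14 - 7 = 7, and
 * context_pdts(7) encodes "2^(7 + 7) directory entries" into bits 9-11 of
 * the low 64 bits of the context entry.
 */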
1903 static int domain_context_mapping_one(struct dmar_domain *domain,
1904 struct intel_iommu *iommu,
1905 struct pasid_table *table,
1908 u16 did = domain->iommu_did[iommu->seq_id];
1909 int translation = CONTEXT_TT_MULTI_LEVEL;
1910 struct device_domain_info *info = NULL;
1911 struct context_entry *context;
1912 unsigned long flags;
1917 if (hw_pass_through && domain_type_is_si(domain))
1918 translation = CONTEXT_TT_PASS_THROUGH;
1920 pr_debug("Set context mapping for %02x:%02x.%d\n",
1921 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1923 BUG_ON(!domain->pgd);
1925 spin_lock_irqsave(&device_domain_lock, flags);
1926 spin_lock(&iommu->lock);
1929 context = iommu_context_addr(iommu, bus, devfn, 1);
1934 if (context_present(context))
	/*
	 * For kdump cases, old valid entries may be cached due to the
	 * in-flight DMA and copied pgtable, but there is no unmapping
	 * behaviour for them, thus we need an explicit cache flush for
	 * the newly-mapped device. For kdump, at this point, the device
	 * is supposed to finish reset at its driver probe stage, so no
	 * in-flight DMA will exist, and we don't need to worry anymore
	 * hereafter.
	 */
1946 if (context_copied(context)) {
1947 u16 did_old = context_domain_id(context);
1949 if (did_old < cap_ndoms(iommu->cap)) {
1950 iommu->flush.flush_context(iommu, did_old,
1951 (((u16)bus) << 8) | devfn,
1952 DMA_CCMD_MASK_NOBIT,
1953 DMA_CCMD_DEVICE_INVL);
1954 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1959 context_clear_entry(context);
1961 if (sm_supported(iommu)) {
1966 /* Setup the PASID DIR pointer: */
1967 pds = context_get_sm_pds(table);
1968 context->lo = (u64)virt_to_phys(table->table) |
1971 /* Setup the RID_PASID field: */
1972 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1975 * Setup the Device-TLB enable bit and Page request
1978 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1979 if (info && info->ats_supported)
1980 context_set_sm_dte(context);
1981 if (info && info->pri_supported)
1982 context_set_sm_pre(context);
1984 struct dma_pte *pgd = domain->pgd;
1987 context_set_domain_id(context, did);
1989 if (translation != CONTEXT_TT_PASS_THROUGH) {
			/*
			 * Skip top levels of page tables for an iommu that has
			 * a smaller agaw than the default. Unnecessary for PT mode.
			 */
1994 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1996 pgd = phys_to_virt(dma_pte_addr(pgd));
1997 if (!dma_pte_present(pgd))
2001 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2002 if (info && info->ats_supported)
2003 translation = CONTEXT_TT_DEV_IOTLB;
2005 translation = CONTEXT_TT_MULTI_LEVEL;
2007 context_set_address_root(context, virt_to_phys(pgd));
2008 context_set_address_width(context, agaw);
2011 * In pass through mode, AW must be programmed to
2012 * indicate the largest AGAW value supported by
2013 * hardware. And ASR is ignored by hardware.
2015 context_set_address_width(context, iommu->msagaw);
2018 context_set_translation_type(context, translation);
2021 context_set_fault_enable(context);
2022 context_set_present(context);
2023 domain_flush_cache(domain, context, sizeof(*context));
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
2031 if (cap_caching_mode(iommu->cap)) {
2032 iommu->flush.flush_context(iommu, 0,
2033 (((u16)bus) << 8) | devfn,
2034 DMA_CCMD_MASK_NOBIT,
2035 DMA_CCMD_DEVICE_INVL);
2036 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2038 iommu_flush_write_buffer(iommu);
2040 iommu_enable_dev_iotlb(info);
2045 spin_unlock(&iommu->lock);
2046 spin_unlock_irqrestore(&device_domain_lock, flags);
2051 struct domain_context_mapping_data {
2052 struct dmar_domain *domain;
2053 struct intel_iommu *iommu;
2054 struct pasid_table *table;
2057 static int domain_context_mapping_cb(struct pci_dev *pdev,
2058 u16 alias, void *opaque)
2060 struct domain_context_mapping_data *data = opaque;
2062 return domain_context_mapping_one(data->domain, data->iommu,
2063 data->table, PCI_BUS_NUM(alias),
2068 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2070 struct domain_context_mapping_data data;
2071 struct pasid_table *table;
2072 struct intel_iommu *iommu;
2075 iommu = device_to_iommu(dev, &bus, &devfn);
2079 table = intel_pasid_get_table(dev);
2081 if (!dev_is_pci(dev))
2082 return domain_context_mapping_one(domain, iommu, table,
2085 data.domain = domain;
2089 return pci_for_each_dma_alias(to_pci_dev(dev),
2090 &domain_context_mapping_cb, &data);
2093 static int domain_context_mapped_cb(struct pci_dev *pdev,
2094 u16 alias, void *opaque)
2096 struct intel_iommu *iommu = opaque;
2098 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2101 static int domain_context_mapped(struct device *dev)
2103 struct intel_iommu *iommu;
2106 iommu = device_to_iommu(dev, &bus, &devfn);
2110 if (!dev_is_pci(dev))
2111 return device_context_mapped(iommu, bus, devfn);
2113 return !pci_for_each_dma_alias(to_pci_dev(dev),
2114 domain_context_mapped_cb, iommu);
2117 /* Returns a number of VTD pages, but aligned to MM page size */
2118 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2121 host_addr &= ~PAGE_MASK;
2122 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
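/*
 * Example (with 4KiB MM pages): aligned_nrpages(0x1234, 0x2000) masks the
 * host address down to its in-page offset (0x234), rounds 0x2234 up to the
 * 0x3000 page boundary and returns 3 VT-d pages, so a buffer straddling
 * page boundaries is always covered by whole pages.
 */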
2125 /* Return largest possible superpage level for a given mapping */
2126 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2127 unsigned long iov_pfn,
2128 unsigned long phy_pfn,
2129 unsigned long pages)
2131 int support, level = 1;
2132 unsigned long pfnmerge;
2134 support = domain->iommu_superpage;
2136 /* To use a large page, the virtual *and* physical addresses
2137 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2138 of them will mean we have to use smaller pages. So just
2139 merge them and check both at once. */
2140 pfnmerge = iov_pfn | phy_pfn;
2142 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2143 pages >>= VTD_STRIDE_SHIFT;
2146 pfnmerge >>= VTD_STRIDE_SHIFT;
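/*
 * Example: if both iov_pfn and phy_pfn are 2MiB aligned (the low 9 bits of
 * the merged pfn are clear), the mapping spans at least 512 pages and the
 * hardware advertises one level of superpage support, the loop above
 * settles on level 2 and a single 2MiB PTE is used instead of 512 4KiB
 * ones.
 */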
2153 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2154 struct scatterlist *sg, unsigned long phys_pfn,
2155 unsigned long nr_pages, int prot)
2157 struct dma_pte *first_pte = NULL, *pte = NULL;
2158 phys_addr_t uninitialized_var(pteval);
2159 unsigned long sg_res = 0;
2160 unsigned int largepage_lvl = 0;
2161 unsigned long lvl_pages = 0;
2163 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2165 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2168 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2172 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2175 while (nr_pages > 0) {
2179 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2181 sg_res = aligned_nrpages(sg->offset, sg->length);
2182 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2183 sg->dma_length = sg->length;
2184 pteval = (sg_phys(sg) - pgoff) | prot;
2185 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2189 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2191 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			/* It is a large page */
2195 if (largepage_lvl > 1) {
2196 unsigned long nr_superpages, end_pfn;
2198 pteval |= DMA_PTE_LARGE_PAGE;
2199 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2201 nr_superpages = sg_res / lvl_pages;
2202 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2205 * Ensure that old small page tables are
2206 * removed to make room for superpage(s).
2207 * We're adding new large pages, so make sure
2208 * we don't remove their parent tables.
2210 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2213 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2217 /* We don't need lock here, nobody else
2218 * touches the iova range
2220 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2222 static int dumps = 5;
2223 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2224 iov_pfn, tmp, (unsigned long long)pteval);
2227 debug_dma_dump_mappings(NULL);
2232 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2234 BUG_ON(nr_pages < lvl_pages);
2235 BUG_ON(sg_res < lvl_pages);
2237 nr_pages -= lvl_pages;
2238 iov_pfn += lvl_pages;
2239 phys_pfn += lvl_pages;
2240 pteval += lvl_pages * VTD_PAGE_SIZE;
2241 sg_res -= lvl_pages;
2243 /* If the next PTE would be the first in a new page, then we
2244 need to flush the cache on the entries we've just written.
2245 And then we'll need to recalculate 'pte', so clear it and
2246 let it get set again in the if (!pte) block above.
2248 If we're done (!nr_pages) we need to flush the cache too.
2250 Also if we've been setting superpages, we may need to
2251 recalculate 'pte' and switch back to smaller pages for the
2252 end of the mapping, if the trailing size is not enough to
2253 use another superpage (i.e. sg_res < lvl_pages). */
2255 if (!nr_pages || first_pte_in_page(pte) ||
2256 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2257 domain_flush_cache(domain, first_pte,
2258 (void *)pte - (void *)first_pte);
2262 if (!sg_res && nr_pages)
2268 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2269 struct scatterlist *sg, unsigned long phys_pfn,
2270 unsigned long nr_pages, int prot)
2273 struct intel_iommu *iommu;
2275 /* Do the real mapping first */
2276 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2280 for_each_domain_iommu(iommu_id, domain) {
2281 iommu = g_iommus[iommu_id];
2282 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2288 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2289 struct scatterlist *sg, unsigned long nr_pages,
2292 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2295 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2296 unsigned long phys_pfn, unsigned long nr_pages,
2299 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2302 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2304 unsigned long flags;
2305 struct context_entry *context;
2311 spin_lock_irqsave(&iommu->lock, flags);
2312 context = iommu_context_addr(iommu, bus, devfn, 0);
2314 spin_unlock_irqrestore(&iommu->lock, flags);
2317 did_old = context_domain_id(context);
2318 context_clear_entry(context);
2319 __iommu_flush_cache(iommu, context, sizeof(*context));
2320 spin_unlock_irqrestore(&iommu->lock, flags);
2321 iommu->flush.flush_context(iommu,
2323 (((u16)bus) << 8) | devfn,
2324 DMA_CCMD_MASK_NOBIT,
2325 DMA_CCMD_DEVICE_INVL);
2326 iommu->flush.flush_iotlb(iommu,
2333 static inline void unlink_domain_info(struct device_domain_info *info)
2335 assert_spin_locked(&device_domain_lock);
2336 list_del(&info->link);
2337 list_del(&info->global);
2339 info->dev->archdata.iommu = NULL;
2342 static void domain_remove_dev_info(struct dmar_domain *domain)
2344 struct device_domain_info *info, *tmp;
2345 unsigned long flags;
2347 spin_lock_irqsave(&device_domain_lock, flags);
2348 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2349 __dmar_remove_one_dev_info(info);
2350 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * find_domain
 * Note: struct device->archdata.iommu stores the device_domain_info.
 */
2357 static struct dmar_domain *find_domain(struct device *dev)
2359 struct device_domain_info *info;
2361 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2362 struct iommu_domain *domain;
2364 dev->archdata.iommu = NULL;
2365 domain = iommu_get_domain_for_dev(dev);
2367 intel_iommu_attach_device(domain, dev);
2370 /* No lock here, assumes no domain exit in normal case */
2371 info = dev->archdata.iommu;
2374 return info->domain;
2378 static inline struct device_domain_info *
2379 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2381 struct device_domain_info *info;
2383 list_for_each_entry(info, &device_domain_list, global)
2384 if (info->iommu->segment == segment && info->bus == bus &&
2385 info->devfn == devfn)
2391 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2394 struct dmar_domain *domain)
2396 struct dmar_domain *found = NULL;
2397 struct device_domain_info *info;
2398 unsigned long flags;
2401 info = alloc_devinfo_mem();
2406 info->devfn = devfn;
2407 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2408 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2411 info->domain = domain;
2412 info->iommu = iommu;
2413 info->pasid_table = NULL;
2414 info->auxd_enabled = 0;
2415 INIT_LIST_HEAD(&info->auxiliary_domains);
2417 if (dev && dev_is_pci(dev)) {
2418 struct pci_dev *pdev = to_pci_dev(info->dev);
2420 if (!pdev->untrusted &&
2421 !pci_ats_disabled() &&
2422 ecap_dev_iotlb_support(iommu->ecap) &&
2423 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2424 dmar_find_matched_atsr_unit(pdev))
2425 info->ats_supported = 1;
2427 if (sm_supported(iommu)) {
2428 if (pasid_supported(iommu)) {
2429 int features = pci_pasid_features(pdev);
2431 info->pasid_supported = features | 1;
2434 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2435 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2436 info->pri_supported = 1;
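/*
 * Added note: pci_pasid_features() returns the PASID capability
 * feature bits (or a negative errno), so the "| 1" above marks PASID
 * as supported even when no optional feature bits are set.
 */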
2440 spin_lock_irqsave(&device_domain_lock, flags);
2442 found = find_domain(dev);
2445 struct device_domain_info *info2;
2446 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2448 found = info2->domain;
2454 spin_unlock_irqrestore(&device_domain_lock, flags);
2455 free_devinfo_mem(info);
2456 /* Caller must free the original domain */
2460 spin_lock(&iommu->lock);
2461 ret = domain_attach_iommu(domain, iommu);
2462 spin_unlock(&iommu->lock);
2465 spin_unlock_irqrestore(&device_domain_lock, flags);
2466 free_devinfo_mem(info);
2470 list_add(&info->link, &domain->devices);
2471 list_add(&info->global, &device_domain_list);
2473 dev->archdata.iommu = info;
2474 spin_unlock_irqrestore(&device_domain_lock, flags);
2476 /* PASID table is mandatory for a PCI device in scalable mode. */
2477 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2478 ret = intel_pasid_alloc_table(dev);
2480 dev_err(dev, "PASID table allocation failed\n");
2481 dmar_remove_one_dev_info(dev);
2485 /* Setup the PASID entry for requests without PASID: */
2486 spin_lock(&iommu->lock);
2487 if (hw_pass_through && domain_type_is_si(domain))
2488 ret = intel_pasid_setup_pass_through(iommu, domain,
2489 dev, PASID_RID2PASID);
2491 ret = intel_pasid_setup_second_level(iommu, domain,
2492 dev, PASID_RID2PASID);
2493 spin_unlock(&iommu->lock);
2495 dev_err(dev, "Setup RID2PASID failed\n");
2496 dmar_remove_one_dev_info(dev);
2501 if (dev && domain_context_mapping(domain, dev)) {
2502 dev_err(dev, "Domain context map failed\n");
2503 dmar_remove_one_dev_info(dev);
2510 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2512 *(u16 *)opaque = alias;
2516 static int domain_init(struct dmar_domain *domain, int guest_width)
2520 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
2521 domain_reserve_special_ranges(domain);
2523 /* calculate AGAW */
2524 domain->gaw = guest_width;
2525 adjust_width = guestwidth_to_adjustwidth(guest_width);
2526 domain->agaw = width_to_agaw(adjust_width);
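/*
 * Added worked example (assuming the usual VT-d AGAW encoding): a
 * 48-bit guest_width adjusts to a 48-bit page-table width, i.e.
 * agaw 2 and a 4-level table; 39 bits would be agaw 1 / 3 levels.
 */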
2528 domain->iommu_coherency = 0;
2529 domain->iommu_snooping = 0;
2530 domain->iommu_superpage = 0;
2531 domain->max_addr = 0;
2533 /* always allocate the top pgd */
2534 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2537 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2541 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2543 struct device_domain_info *info;
2544 struct dmar_domain *domain = NULL;
2545 struct intel_iommu *iommu;
2547 unsigned long flags;
2550 iommu = device_to_iommu(dev, &bus, &devfn);
2554 if (dev_is_pci(dev)) {
2555 struct pci_dev *pdev = to_pci_dev(dev);
2557 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2559 spin_lock_irqsave(&device_domain_lock, flags);
2560 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2561 PCI_BUS_NUM(dma_alias),
2564 iommu = info->iommu;
2565 domain = info->domain;
2567 spin_unlock_irqrestore(&device_domain_lock, flags);
2569 /* DMA alias already has a domain, use it */
2574 /* Allocate and initialize new domain for the device */
2575 domain = alloc_domain(0);
2579 if (domain_init(domain, gaw)) {
2580 domain_exit(domain);
2584 if (init_iova_flush_queue(&domain->iovad,
2587 pr_warn("iova flush queue initialization failed\n");
2588 intel_iommu_strict = 1;
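/*
 * Added note: without a flush queue we fall back to strict mode,
 * where every unmap performs a synchronous IOTLB invalidation
 * instead of batching freed IOVAs for deferred flushing.
 */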
2595 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2596 struct dmar_domain *domain)
2598 struct intel_iommu *iommu;
2599 struct dmar_domain *tmp;
2600 u16 req_id, dma_alias;
2603 iommu = device_to_iommu(dev, &bus, &devfn);
2607 req_id = ((u16)bus << 8) | devfn;
2609 if (dev_is_pci(dev)) {
2610 struct pci_dev *pdev = to_pci_dev(dev);
2612 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2614 /* register PCI DMA alias device */
2615 if (req_id != dma_alias) {
2616 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2617 dma_alias & 0xff, NULL, domain);
2619 if (!tmp || tmp != domain)
2624 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2625 if (!tmp || tmp != domain)
2631 static int iommu_domain_identity_map(struct dmar_domain *domain,
2632 unsigned long long start,
2633 unsigned long long end)
2635 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2636 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2638 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2639 dma_to_mm_pfn(last_vpfn))) {
2640 pr_err("Reserving iova failed\n");
2644 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2646 * RMRR range might have overlap with physical memory range, clear it first.
2649 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2651 return __domain_mapping(domain, first_vpfn, NULL,
2652 first_vpfn, last_vpfn - first_vpfn + 1,
2653 DMA_PTE_READ|DMA_PTE_WRITE);
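/*
 * Added note: the call above is a true identity mapping - first_vpfn
 * is passed as both the IOVA pfn and the physical pfn, so DMA address
 * equals physical address throughout the reserved range.
 */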
2656 static int domain_prepare_identity_map(struct device *dev,
2657 struct dmar_domain *domain,
2658 unsigned long long start,
2659 unsigned long long end)
2661 /* For _hardware_ passthrough, don't bother. But for software
2662 passthrough, we do it anyway -- it may indicate a memory
2663 range which is reserved in E820, and so didn't get set
2664 up to start with in si_domain */
2665 if (domain == si_domain && hw_pass_through) {
2666 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2671 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2674 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2675 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2676 dmi_get_system_info(DMI_BIOS_VENDOR),
2677 dmi_get_system_info(DMI_BIOS_VERSION),
2678 dmi_get_system_info(DMI_PRODUCT_VERSION));
2682 if (end >> agaw_to_width(domain->agaw)) {
2683 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2684 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2685 agaw_to_width(domain->agaw),
2686 dmi_get_system_info(DMI_BIOS_VENDOR),
2687 dmi_get_system_info(DMI_BIOS_VERSION),
2688 dmi_get_system_info(DMI_PRODUCT_VERSION));
2692 return iommu_domain_identity_map(domain, start, end);
2695 static int __init si_domain_init(int hw)
2697 struct dmar_rmrr_unit *rmrr;
2701 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2705 if (domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2706 domain_exit(si_domain);
2713 for_each_online_node(nid) {
2714 unsigned long start_pfn, end_pfn;
2717 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2718 ret = iommu_domain_identity_map(si_domain,
2719 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2726 * Normally we use DMA domains for devices which have RMRRs. But we
2727 * lose this requirement for graphics and USB devices. Identity map
2728 * the RMRRs for graphics and USB devices so that they can use the si_domain.
2731 for_each_rmrr_units(rmrr) {
2732 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2734 unsigned long long start = rmrr->base_address;
2735 unsigned long long end = rmrr->end_address;
2737 if (device_is_rmrr_locked(dev))
2740 if (WARN_ON(end < start ||
2741 end >> agaw_to_width(si_domain->agaw)))
2744 ret = iommu_domain_identity_map(si_domain, start, end);
2753 static int identity_mapping(struct device *dev)
2755 struct device_domain_info *info;
2757 info = dev->archdata.iommu;
2758 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2759 return (info->domain == si_domain);
2764 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2766 struct dmar_domain *ndomain;
2767 struct intel_iommu *iommu;
2770 iommu = device_to_iommu(dev, &bus, &devfn);
2774 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2775 if (ndomain != domain)
2781 static bool device_has_rmrr(struct device *dev)
2783 struct dmar_rmrr_unit *rmrr;
2788 for_each_rmrr_units(rmrr) {
2790 * Return TRUE if this RMRR contains the device that is passed in.
2793 for_each_active_dev_scope(rmrr->devices,
2794 rmrr->devices_cnt, i, tmp)
2796 is_downstream_to_pci_bridge(dev, tmp)) {
2806 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2807 * is relaxable (i.e. is allowed not to be enforced under some conditions)
2808 * @dev: device handle
2810 * We assume that PCI USB devices with RMRRs have them largely
2811 * for historical reasons and that the RMRR space is not actively used post
2812 * boot. This exclusion may change if vendors begin to abuse it.
2814 * The same exception is made for graphics devices, with the requirement that
2815 * any use of the RMRR regions will be torn down before assigning the device
2818 * Return: true if the RMRR is relaxable, false otherwise
2820 static bool device_rmrr_is_relaxable(struct device *dev)
2822 struct pci_dev *pdev;
2824 if (!dev_is_pci(dev))
2827 pdev = to_pci_dev(dev);
2828 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2835 * There are a couple cases where we need to restrict the functionality of
2836 * devices associated with RMRRs. The first is when evaluating a device for
2837 * identity mapping because problems exist when devices are moved in and out
2838 * of domains and their respective RMRR information is lost. This means that
2839 * a device with associated RMRRs will never be in a "passthrough" domain.
2840 * The second is use of the device through the IOMMU API. This interface
2841 * expects to have full control of the IOVA space for the device. We cannot
2842 * satisfy both the requirement that RMRR access is maintained and have an
2843 * unencumbered IOVA space. We also have no ability to quiesce the device's
2844 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2845 * We therefore prevent devices associated with an RMRR from participating in
2846 * the IOMMU API, which eliminates them from device assignment.
2848 * In both cases, devices which have relaxable RMRRs are not concerned by this
2849 * restriction. See device_rmrr_is_relaxable comment.
2851 static bool device_is_rmrr_locked(struct device *dev)
2853 if (!device_has_rmrr(dev))
2856 if (device_rmrr_is_relaxable(dev))
2863 * Return the required default domain type for a specific device.
2865 * @dev: the device in question
2869 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2870 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2871 * - 0: both identity and dynamic domains work for this device
2873 static int device_def_domain_type(struct device *dev)
2875 if (dev_is_pci(dev)) {
2876 struct pci_dev *pdev = to_pci_dev(dev);
2878 if (device_is_rmrr_locked(dev))
2879 return IOMMU_DOMAIN_DMA;
2882 * Prevent any device marked as untrusted from getting
2883 * placed into the statically identity mapping domain.
2885 if (pdev->untrusted)
2886 return IOMMU_DOMAIN_DMA;
2888 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2889 return IOMMU_DOMAIN_IDENTITY;
2891 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2892 return IOMMU_DOMAIN_IDENTITY;
2895 * We want to start off with all devices in the 1:1 domain, and
2896 * take them out later if we find they can't access all of memory.
2898 * However, we can't do this for PCI devices behind bridges,
2899 * because all PCI devices behind the same bridge will end up
2900 * with the same source-id on their transactions.
2902 * Practically speaking, we can't change things around for these
2903 * devices at run-time, because we can't be sure there'll be no
2904 * DMA transactions in flight for any of their siblings.
2906 * So PCI devices (unless they're on the root bus) as well as
2907 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2908 * the 1:1 domain, just in _case_ one of their siblings turns out
2909 * not to be able to map all of memory.
2911 if (!pci_is_pcie(pdev)) {
2912 if (!pci_is_root_bus(pdev->bus))
2913 return IOMMU_DOMAIN_DMA;
2914 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2915 return IOMMU_DOMAIN_DMA;
2916 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2917 return IOMMU_DOMAIN_DMA;
2919 if (device_has_rmrr(dev))
2920 return IOMMU_DOMAIN_DMA;
2923 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2924 IOMMU_DOMAIN_IDENTITY : 0;
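/*
 * Added example: when IDENTMAP_ALL is set (iommu_pass_through,
 * typically "iommu=pt"), a trusted PCIe endpoint without RMRRs gets
 * IOMMU_DOMAIN_IDENTITY here, while an untrusted or RMRR-locked
 * device is still forced into IOMMU_DOMAIN_DMA by the checks above.
 */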
2927 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2930 * Start from the sane iommu hardware state.
2931 * If the queued invalidation is already initialized by us
2932 * (for example, while enabling interrupt-remapping) then
2933 * things are already rolling from a sane state.
2937 * Clear any previous faults.
2939 dmar_fault(-1, iommu);
2941 * Disable queued invalidation if supported and already enabled
2942 * before OS handover.
2944 dmar_disable_qi(iommu);
2947 if (dmar_enable_qi(iommu)) {
2949 * Queued Invalidate not enabled, use Register Based Invalidate
2951 iommu->flush.flush_context = __iommu_flush_context;
2952 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2953 pr_info("%s: Using Register based invalidation\n",
2956 iommu->flush.flush_context = qi_flush_context;
2957 iommu->flush.flush_iotlb = qi_flush_iotlb;
2958 pr_info("%s: Using Queued invalidation\n", iommu->name);
2962 static int copy_context_table(struct intel_iommu *iommu,
2963 struct root_entry *old_re,
2964 struct context_entry **tbl,
2967 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2968 struct context_entry *new_ce = NULL, ce;
2969 struct context_entry *old_ce = NULL;
2970 struct root_entry re;
2971 phys_addr_t old_ce_phys;
2973 tbl_idx = ext ? bus * 2 : bus;
2974 memcpy(&re, old_re, sizeof(re));
2976 for (devfn = 0; devfn < 256; devfn++) {
2977 /* First calculate the correct index */
2978 idx = (ext ? devfn * 2 : devfn) % 256;
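/*
 * Added note: extended context entries are twice the size of legacy
 * ones, so in ext mode each bus needs two context tables
 * (tbl_idx = bus * 2) and one 4K table covers only 128 devfns; the
 * "% 256" wraps the 16-byte-slot index into the second table for
 * devfn >= 128.
 */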
2981 /* First save what we may have and clean up */
2983 tbl[tbl_idx] = new_ce;
2984 __iommu_flush_cache(iommu, new_ce,
2994 old_ce_phys = root_entry_lctp(&re);
2996 old_ce_phys = root_entry_uctp(&re);
2999 if (ext && devfn == 0) {
3000 /* No LCTP, try UCTP */
3009 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3014 new_ce = alloc_pgtable_page(iommu->node);
3021 /* Now copy the context entry */
3022 memcpy(&ce, old_ce + idx, sizeof(ce));
3024 if (!__context_present(&ce))
3027 did = context_domain_id(&ce);
3028 if (did >= 0 && did < cap_ndoms(iommu->cap))
3029 set_bit(did, iommu->domain_ids);
3032 * We need a marker for copied context entries. This
3033 * marker needs to work for the old format as well as
3034 * for extended context entries.
3036 * Bit 67 of the context entry is used. In the old
3037 * format this bit is available to software, in the
3038 * extended format it is the PGE bit, but PGE is ignored
3039 * by HW if PASIDs are disabled (and thus still
3042 * So disable PASIDs first and then mark the entry
3043 * copied. This means that we don't copy PASID
3044 * translations from the old kernel, but this is fine as
3045 * faults there are not fatal.
3047 context_clear_pasid_enable(&ce);
3048 context_set_copied(&ce);
3053 tbl[tbl_idx + pos] = new_ce;
3055 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3064 static int copy_translation_tables(struct intel_iommu *iommu)
3066 struct context_entry **ctxt_tbls;
3067 struct root_entry *old_rt;
3068 phys_addr_t old_rt_phys;
3069 int ctxt_table_entries;
3070 unsigned long flags;
3075 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3076 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3077 new_ext = !!ecap_ecs(iommu->ecap);
3080 * The RTT bit can only be changed when translation is disabled,
3081 * but disabling translation means to open a window for data
3082 * corruption. So bail out and don't copy anything if we would
3083 * have to change the bit.
3088 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3092 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3096 /* This is too big for the stack - allocate it from slab */
3097 ctxt_table_entries = ext ? 512 : 256;
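/* Added note: 512 = two context tables per bus in extended mode. */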
3099 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3103 for (bus = 0; bus < 256; bus++) {
3104 ret = copy_context_table(iommu, &old_rt[bus],
3105 ctxt_tbls, bus, ext);
3107 pr_err("%s: Failed to copy context table for bus %d\n",
3113 spin_lock_irqsave(&iommu->lock, flags);
3115 /* Context tables are copied, now write them to the root_entry table */
3116 for (bus = 0; bus < 256; bus++) {
3117 int idx = ext ? bus * 2 : bus;
3120 if (ctxt_tbls[idx]) {
3121 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3122 iommu->root_entry[bus].lo = val;
3125 if (!ext || !ctxt_tbls[idx + 1])
3128 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3129 iommu->root_entry[bus].hi = val;
3132 spin_unlock_irqrestore(&iommu->lock, flags);
3136 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3146 static int __init init_dmars(void)
3148 struct dmar_drhd_unit *drhd;
3149 struct intel_iommu *iommu;
3155 * initialize and program root entry to not present
3158 for_each_drhd_unit(drhd) {
3160 * lock not needed as this is only incremented in the single
3161 * threaded kernel __init code path; all other accesses are read-only
3164 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3168 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3171 /* Preallocate enough resources for IOMMU hot-addition */
3172 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3173 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3175 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3178 pr_err("Allocating global iommu array failed\n");
3183 for_each_iommu(iommu, drhd) {
3184 if (drhd->ignored) {
3185 iommu_disable_translation(iommu);
3190 * Find the max pasid size of all IOMMUs in the system.
3191 * We need to ensure the system pasid table is no bigger
3192 * than the smallest supported.
3194 if (pasid_supported(iommu)) {
3195 u32 temp = 2 << ecap_pss(iommu->ecap);
3197 intel_pasid_max_id = min_t(u32, temp,
3198 intel_pasid_max_id);
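/*
 * Added note: ecap_pss() reports the supported PASID width minus one,
 * so 2 << pss == 2^(pss + 1) PASIDs; the global maximum is clamped to
 * the smallest value reported by any IOMMU.
 */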
3201 g_iommus[iommu->seq_id] = iommu;
3203 intel_iommu_init_qi(iommu);
3205 ret = iommu_init_domains(iommu);
3209 init_translation_status(iommu);
3211 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3212 iommu_disable_translation(iommu);
3213 clear_translation_pre_enabled(iommu);
3214 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3220 * we could share the same root & context tables
3221 * among all IOMMUs. Need to split it later.
3223 ret = iommu_alloc_root_entry(iommu);
3227 if (translation_pre_enabled(iommu)) {
3228 pr_info("Translation already enabled - trying to copy translation structures\n");
3230 ret = copy_translation_tables(iommu);
3233 * We found the IOMMU with translation
3234 * enabled - but failed to copy over the
3235 * old root-entry table. Try to proceed
3236 * by disabling translation now and
3237 * allocating a clean root-entry table.
3238 * This might cause DMAR faults, but
3239 * probably the dump will still succeed.
3241 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3243 iommu_disable_translation(iommu);
3244 clear_translation_pre_enabled(iommu);
3246 pr_info("Copied translation tables from previous kernel for %s\n",
3251 if (!ecap_pass_through(iommu->ecap))
3252 hw_pass_through = 0;
3253 #ifdef CONFIG_INTEL_IOMMU_SVM
3254 if (pasid_supported(iommu))
3255 intel_svm_init(iommu);
3260 * Now that qi is enabled on all iommus, set the root entry and flush
3261 * caches. This is required on some Intel X58 chipsets, otherwise the
3262 * flush_context function will loop forever and the boot hangs.
3264 for_each_active_iommu(iommu, drhd) {
3265 iommu_flush_write_buffer(iommu);
3266 iommu_set_root_entry(iommu);
3267 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3268 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3271 if (iommu_pass_through)
3272 iommu_identity_mapping |= IDENTMAP_ALL;
3274 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3279 iommu_identity_mapping |= IDENTMAP_GFX;
3281 check_tylersburg_isoch();
3283 ret = si_domain_init(hw_pass_through);
3290 * global invalidate context cache
3291 * global invalidate iotlb
3292 * enable translation
3294 for_each_iommu(iommu, drhd) {
3295 if (drhd->ignored) {
2297 * we always have to disable PMRs or DMA may fail on this device.
3301 iommu_disable_protect_mem_regions(iommu);
3305 iommu_flush_write_buffer(iommu);
3307 #ifdef CONFIG_INTEL_IOMMU_SVM
3308 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3310 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3311 * could cause possible lock race condition.
3313 up_write(&dmar_global_lock);
3314 ret = intel_svm_enable_prq(iommu);
3315 down_write(&dmar_global_lock);
3320 ret = dmar_set_interrupt(iommu);
3328 for_each_active_iommu(iommu, drhd) {
3329 disable_dmar_iommu(iommu);
3330 free_dmar_iommu(iommu);
3339 /* This takes a number of _MM_ pages, not VTD pages */
3340 static unsigned long intel_alloc_iova(struct device *dev,
3341 struct dmar_domain *domain,
3342 unsigned long nrpages, uint64_t dma_mask)
3344 unsigned long iova_pfn;
3346 /* Restrict dma_mask to the width that the iommu can handle */
3347 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3348 /* Ensure we reserve the whole size-aligned region */
3349 nrpages = __roundup_pow_of_two(nrpages);
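/*
 * Added note: rounding up to a power of two lets the IOVA allocator
 * return a naturally size-aligned region, e.g. a 3-page request ends
 * up reserving 4 pages.
 */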
3351 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3353 * First try to allocate an io virtual address in
3354 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range.
3357 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3358 IOVA_PFN(DMA_BIT_MASK(32)), false);
3362 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3363 IOVA_PFN(dma_mask), true);
3364 if (unlikely(!iova_pfn)) {
3365 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3372 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3374 struct dmar_domain *domain, *tmp;
3375 struct dmar_rmrr_unit *rmrr;
3376 struct device *i_dev;
3379 /* Device shouldn't be attached to any domain yet. */
3380 domain = find_domain(dev);
3384 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3388 /* We have a new domain - setup possible RMRRs for the device */
3390 for_each_rmrr_units(rmrr) {
3391 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3396 ret = domain_prepare_identity_map(dev, domain,
3400 dev_err(dev, "Mapping reserved region failed\n");
3405 tmp = set_domain_for_dev(dev, domain);
3406 if (!tmp || domain != tmp) {
3407 domain_exit(domain);
3413 dev_err(dev, "Allocating domain failed\n");
3415 domain->domain.type = IOMMU_DOMAIN_DMA;
3420 /* Check if the dev needs to go through the non-identity map and unmap process. */
3421 static bool iommu_need_mapping(struct device *dev)
3425 if (iommu_dummy(dev))
3428 ret = identity_mapping(dev);
3430 u64 dma_mask = *dev->dma_mask;
3432 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3433 dma_mask = dev->coherent_dma_mask;
3435 if (dma_mask >= dma_get_required_mask(dev))
3439 * 32 bit DMA device is removed from si_domain and falls back to
3440 * non-identity mapping.
3442 dmar_remove_one_dev_info(dev);
3443 ret = iommu_request_dma_domain_for_dev(dev);
3445 struct iommu_domain *domain;
3446 struct dmar_domain *dmar_domain;
3448 domain = iommu_get_domain_for_dev(dev);
3450 dmar_domain = to_dmar_domain(domain);
3451 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3453 get_private_domain_for_dev(dev);
3456 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3462 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3463 size_t size, int dir, u64 dma_mask)
3465 struct dmar_domain *domain;
3466 phys_addr_t start_paddr;
3467 unsigned long iova_pfn;
3470 struct intel_iommu *iommu;
3471 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3473 BUG_ON(dir == DMA_NONE);
3475 domain = find_domain(dev);
3477 return DMA_MAPPING_ERROR;
3479 iommu = domain_get_iommu(domain);
3480 size = aligned_nrpages(paddr, size);
3482 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3487 * Check if DMAR supports zero-length reads on write-only mappings.
3490 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3491 !cap_zlr(iommu->cap))
3492 prot |= DMA_PTE_READ;
3493 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3494 prot |= DMA_PTE_WRITE;
3496 * paddr to (paddr + size) might span a partial page, so we should map the whole
3497 * page. Note: if two parts of one page are separately mapped, we
3498 * might have two guest_addr mappings to the same host paddr, but this
3499 * is not a big problem
3501 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3502 mm_to_dma_pfn(paddr_pfn), size, prot);
3506 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3507 start_paddr += paddr & ~PAGE_MASK;
3512 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3513 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3514 size, (unsigned long long)paddr, dir);
3515 return DMA_MAPPING_ERROR;
3518 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3519 unsigned long offset, size_t size,
3520 enum dma_data_direction dir,
3521 unsigned long attrs)
3523 if (iommu_need_mapping(dev))
3524 return __intel_map_single(dev, page_to_phys(page) + offset,
3525 size, dir, *dev->dma_mask);
3526 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3529 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3530 size_t size, enum dma_data_direction dir,
3531 unsigned long attrs)
3533 if (iommu_need_mapping(dev))
3534 return __intel_map_single(dev, phys_addr, size, dir,
3536 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3539 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3541 struct dmar_domain *domain;
3542 unsigned long start_pfn, last_pfn;
3543 unsigned long nrpages;
3544 unsigned long iova_pfn;
3545 struct intel_iommu *iommu;
3546 struct page *freelist;
3547 struct pci_dev *pdev = NULL;
3549 domain = find_domain(dev);
3552 iommu = domain_get_iommu(domain);
3554 iova_pfn = IOVA_PFN(dev_addr);
3556 nrpages = aligned_nrpages(dev_addr, size);
3557 start_pfn = mm_to_dma_pfn(iova_pfn);
3558 last_pfn = start_pfn + nrpages - 1;
3560 if (dev_is_pci(dev))
3561 pdev = to_pci_dev(dev);
3563 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3565 freelist = domain_unmap(domain, start_pfn, last_pfn);
3567 if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3568 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3569 nrpages, !freelist, 0);
3571 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3572 dma_free_pagelist(freelist);
3574 queue_iova(&domain->iovad, iova_pfn, nrpages,
3575 (unsigned long)freelist);
3577 * queue up the release of the unmap to save the 1/6th of the
3578 * cpu used up by the iotlb flush operation...
3583 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3584 size_t size, enum dma_data_direction dir,
3585 unsigned long attrs)
3587 if (iommu_need_mapping(dev))
3588 intel_unmap(dev, dev_addr, size);
3590 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3593 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3594 size_t size, enum dma_data_direction dir, unsigned long attrs)
3596 if (iommu_need_mapping(dev))
3597 intel_unmap(dev, dev_addr, size);
3600 static void *intel_alloc_coherent(struct device *dev, size_t size,
3601 dma_addr_t *dma_handle, gfp_t flags,
3602 unsigned long attrs)
3604 struct page *page = NULL;
3607 if (!iommu_need_mapping(dev))
3608 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3610 size = PAGE_ALIGN(size);
3611 order = get_order(size);
3613 if (gfpflags_allow_blocking(flags)) {
3614 unsigned int count = size >> PAGE_SHIFT;
3616 page = dma_alloc_from_contiguous(dev, count, order,
3617 flags & __GFP_NOWARN);
3621 page = alloc_pages(flags, order);
3624 memset(page_address(page), 0, size);
3626 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3628 dev->coherent_dma_mask);
3629 if (*dma_handle != DMA_MAPPING_ERROR)
3630 return page_address(page);
3631 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3632 __free_pages(page, order);
3637 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3638 dma_addr_t dma_handle, unsigned long attrs)
3641 struct page *page = virt_to_page(vaddr);
3643 if (!iommu_need_mapping(dev))
3644 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3646 size = PAGE_ALIGN(size);
3647 order = get_order(size);
3649 intel_unmap(dev, dma_handle, size);
3650 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651 __free_pages(page, order);
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655 int nelems, enum dma_data_direction dir,
3656 unsigned long attrs)
3658 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659 unsigned long nrpages = 0;
3660 struct scatterlist *sg;
3663 if (!iommu_need_mapping(dev))
3664 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3666 for_each_sg(sglist, sg, nelems, i) {
3667 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3670 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3673 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3674 enum dma_data_direction dir, unsigned long attrs)
3677 struct dmar_domain *domain;
3680 unsigned long iova_pfn;
3682 struct scatterlist *sg;
3683 unsigned long start_vpfn;
3684 struct intel_iommu *iommu;
3686 BUG_ON(dir == DMA_NONE);
3687 if (!iommu_need_mapping(dev))
3688 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3690 domain = find_domain(dev);
3694 iommu = domain_get_iommu(domain);
3696 for_each_sg(sglist, sg, nelems, i)
3697 size += aligned_nrpages(sg->offset, sg->length);
3699 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3702 sglist->dma_length = 0;
3707 * Check if DMAR supports zero-length reads on write-only mappings.
3710 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3711 !cap_zlr(iommu->cap))
3712 prot |= DMA_PTE_READ;
3713 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714 prot |= DMA_PTE_WRITE;
3716 start_vpfn = mm_to_dma_pfn(iova_pfn);
3718 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719 if (unlikely(ret)) {
3720 dma_pte_free_pagetable(domain, start_vpfn,
3721 start_vpfn + size - 1,
3722 agaw_to_level(domain->agaw) + 1);
3723 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3730 static const struct dma_map_ops intel_dma_ops = {
3731 .alloc = intel_alloc_coherent,
3732 .free = intel_free_coherent,
3733 .map_sg = intel_map_sg,
3734 .unmap_sg = intel_unmap_sg,
3735 .map_page = intel_map_page,
3736 .unmap_page = intel_unmap_page,
3737 .map_resource = intel_map_resource,
3738 .unmap_resource = intel_unmap_resource,
3739 .dma_supported = dma_direct_supported,
3742 static inline int iommu_domain_cache_init(void)
3746 iommu_domain_cache = kmem_cache_create("iommu_domain",
3747 sizeof(struct dmar_domain),
3752 if (!iommu_domain_cache) {
3753 pr_err("Couldn't create iommu_domain cache\n");
3760 static inline int iommu_devinfo_cache_init(void)
3764 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3765 sizeof(struct device_domain_info),
3769 if (!iommu_devinfo_cache) {
3770 pr_err("Couldn't create devinfo cache\n");
3777 static int __init iommu_init_mempool(void)
3780 ret = iova_cache_get();
3784 ret = iommu_domain_cache_init();
3788 ret = iommu_devinfo_cache_init();
3792 kmem_cache_destroy(iommu_domain_cache);
3799 static void __init iommu_exit_mempool(void)
3801 kmem_cache_destroy(iommu_devinfo_cache);
3802 kmem_cache_destroy(iommu_domain_cache);
3806 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3808 struct dmar_drhd_unit *drhd;
3812 /* We know that this device on this chipset has its own IOMMU.
3813 * If we find it under a different IOMMU, then the BIOS is lying
3814 * to us. Hope that the IOMMU for this device is actually
3815 * disabled, and it needs no translation...
3817 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3819 /* "can't" happen */
3820 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3823 vtbar &= 0xffff0000;
3825 /* we know that this iommu should be at offset 0xa000 from vtbar */
3826 drhd = dmar_find_matched_drhd_unit(pdev);
3827 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3828 TAINT_FIRMWARE_WORKAROUND,
3829 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3830 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3832 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3834 static void __init init_no_remapping_devices(void)
3836 struct dmar_drhd_unit *drhd;
3840 for_each_drhd_unit(drhd) {
3841 if (!drhd->include_all) {
3842 for_each_active_dev_scope(drhd->devices,
3843 drhd->devices_cnt, i, dev)
3845 /* ignore DMAR unit if no devices exist */
3846 if (i == drhd->devices_cnt)
3851 for_each_active_drhd_unit(drhd) {
3852 if (drhd->include_all)
3855 for_each_active_dev_scope(drhd->devices,
3856 drhd->devices_cnt, i, dev)
3857 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3859 if (i < drhd->devices_cnt)
3862 /* This IOMMU has *only* gfx devices. Either bypass it or
3863 set the gfx_mapped flag, as appropriate */
3864 if (!dmar_map_gfx) {
3866 for_each_active_dev_scope(drhd->devices,
3867 drhd->devices_cnt, i, dev)
3868 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3873 #ifdef CONFIG_SUSPEND
3874 static int init_iommu_hw(void)
3876 struct dmar_drhd_unit *drhd;
3877 struct intel_iommu *iommu = NULL;
3879 for_each_active_iommu(iommu, drhd)
3881 dmar_reenable_qi(iommu);
3883 for_each_iommu(iommu, drhd) {
3884 if (drhd->ignored) {
3886 * we always have to disable PMRs or DMA may fail on this device.
3890 iommu_disable_protect_mem_regions(iommu);
3894 iommu_flush_write_buffer(iommu);
3896 iommu_set_root_entry(iommu);
3898 iommu->flush.flush_context(iommu, 0, 0, 0,
3899 DMA_CCMD_GLOBAL_INVL);
3900 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3901 iommu_enable_translation(iommu);
3902 iommu_disable_protect_mem_regions(iommu);
3908 static void iommu_flush_all(void)
3910 struct dmar_drhd_unit *drhd;
3911 struct intel_iommu *iommu;
3913 for_each_active_iommu(iommu, drhd) {
3914 iommu->flush.flush_context(iommu, 0, 0, 0,
3915 DMA_CCMD_GLOBAL_INVL);
3916 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3917 DMA_TLB_GLOBAL_FLUSH);
3921 static int iommu_suspend(void)
3923 struct dmar_drhd_unit *drhd;
3924 struct intel_iommu *iommu = NULL;
3927 for_each_active_iommu(iommu, drhd) {
3928 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3930 if (!iommu->iommu_state)
3936 for_each_active_iommu(iommu, drhd) {
3937 iommu_disable_translation(iommu);
3939 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3941 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3942 readl(iommu->reg + DMAR_FECTL_REG);
3943 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3944 readl(iommu->reg + DMAR_FEDATA_REG);
3945 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3946 readl(iommu->reg + DMAR_FEADDR_REG);
3947 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3948 readl(iommu->reg + DMAR_FEUADDR_REG);
3950 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3955 for_each_active_iommu(iommu, drhd)
3956 kfree(iommu->iommu_state);
3961 static void iommu_resume(void)
3963 struct dmar_drhd_unit *drhd;
3964 struct intel_iommu *iommu = NULL;
3967 if (init_iommu_hw()) {
3969 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3971 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3975 for_each_active_iommu(iommu, drhd) {
3977 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3979 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3980 iommu->reg + DMAR_FECTL_REG);
3981 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3982 iommu->reg + DMAR_FEDATA_REG);
3983 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3984 iommu->reg + DMAR_FEADDR_REG);
3985 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3986 iommu->reg + DMAR_FEUADDR_REG);
3988 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3991 for_each_active_iommu(iommu, drhd)
3992 kfree(iommu->iommu_state);
3995 static struct syscore_ops iommu_syscore_ops = {
3996 .resume = iommu_resume,
3997 .suspend = iommu_suspend,
4000 static void __init init_iommu_pm_ops(void)
4002 register_syscore_ops(&iommu_syscore_ops);
4006 static inline void init_iommu_pm_ops(void) {}
4007 #endif /* CONFIG_PM */
4009 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4011 struct acpi_dmar_reserved_memory *rmrr;
4012 struct dmar_rmrr_unit *rmrru;
4014 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4018 rmrru->hdr = header;
4019 rmrr = (struct acpi_dmar_reserved_memory *)header;
4020 rmrru->base_address = rmrr->base_address;
4021 rmrru->end_address = rmrr->end_address;
4023 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4024 ((void *)rmrr) + rmrr->header.length,
4025 &rmrru->devices_cnt);
4026 if (rmrru->devices_cnt && rmrru->devices == NULL)
4029 list_add(&rmrru->list, &dmar_rmrr_units);
4038 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4040 struct dmar_atsr_unit *atsru;
4041 struct acpi_dmar_atsr *tmp;
4043 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4044 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4045 if (atsr->segment != tmp->segment)
4047 if (atsr->header.length != tmp->header.length)
4049 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4056 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4058 struct acpi_dmar_atsr *atsr;
4059 struct dmar_atsr_unit *atsru;
4061 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4064 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4065 atsru = dmar_find_atsr(atsr);
4069 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4074 * If memory is allocated from slab by ACPI _DSM method, we need to
4075 * copy the memory content because the memory buffer will be freed on exit.
4078 atsru->hdr = (void *)(atsru + 1);
4079 memcpy(atsru->hdr, hdr, hdr->length);
4080 atsru->include_all = atsr->flags & 0x1;
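/*
 * Added note: bit 0 of the ATSR flags is ALL_PORTS - when set, the
 * ATSR applies to every PCIe root port on the segment, so no device
 * scope needs to be parsed below.
 */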
4081 if (!atsru->include_all) {
4082 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4083 (void *)atsr + atsr->header.length,
4084 &atsru->devices_cnt);
4085 if (atsru->devices_cnt && atsru->devices == NULL) {
4091 list_add_rcu(&atsru->list, &dmar_atsr_units);
4096 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4098 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4102 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4104 struct acpi_dmar_atsr *atsr;
4105 struct dmar_atsr_unit *atsru;
4107 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4108 atsru = dmar_find_atsr(atsr);
4110 list_del_rcu(&atsru->list);
4112 intel_iommu_free_atsr(atsru);
4118 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4122 struct acpi_dmar_atsr *atsr;
4123 struct dmar_atsr_unit *atsru;
4125 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4126 atsru = dmar_find_atsr(atsr);
4130 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4131 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4139 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4142 struct intel_iommu *iommu = dmaru->iommu;
4144 if (g_iommus[iommu->seq_id])
4147 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4148 pr_warn("%s: Doesn't support hardware pass through.\n",
4152 if (!ecap_sc_support(iommu->ecap) &&
4153 domain_update_iommu_snooping(iommu)) {
4154 pr_warn("%s: Doesn't support snooping.\n",
4158 sp = domain_update_iommu_superpage(iommu) - 1;
4159 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4160 pr_warn("%s: Doesn't support large page.\n",
4166 * Disable translation if already enabled prior to OS handover.
4168 if (iommu->gcmd & DMA_GCMD_TE)
4169 iommu_disable_translation(iommu);
4171 g_iommus[iommu->seq_id] = iommu;
4172 ret = iommu_init_domains(iommu);
4174 ret = iommu_alloc_root_entry(iommu);
4178 #ifdef CONFIG_INTEL_IOMMU_SVM
4179 if (pasid_supported(iommu))
4180 intel_svm_init(iommu);
4183 if (dmaru->ignored) {
4185 * we always have to disable PMRs or DMA may fail on this device
4188 iommu_disable_protect_mem_regions(iommu);
4192 intel_iommu_init_qi(iommu);
4193 iommu_flush_write_buffer(iommu);
4195 #ifdef CONFIG_INTEL_IOMMU_SVM
4196 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4197 ret = intel_svm_enable_prq(iommu);
4202 ret = dmar_set_interrupt(iommu);
4206 iommu_set_root_entry(iommu);
4207 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4208 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4209 iommu_enable_translation(iommu);
4211 iommu_disable_protect_mem_regions(iommu);
4215 disable_dmar_iommu(iommu);
4217 free_dmar_iommu(iommu);
4221 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4224 struct intel_iommu *iommu = dmaru->iommu;
4226 if (!intel_iommu_enabled)
4232 ret = intel_iommu_add(dmaru);
4234 disable_dmar_iommu(iommu);
4235 free_dmar_iommu(iommu);
4241 static void intel_iommu_free_dmars(void)
4243 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4244 struct dmar_atsr_unit *atsru, *atsr_n;
4246 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4247 list_del(&rmrru->list);
4248 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4252 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4253 list_del(&atsru->list);
4254 intel_iommu_free_atsr(atsru);
4258 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4261 struct pci_bus *bus;
4262 struct pci_dev *bridge = NULL;
4264 struct acpi_dmar_atsr *atsr;
4265 struct dmar_atsr_unit *atsru;
4267 dev = pci_physfn(dev);
4268 for (bus = dev->bus; bus; bus = bus->parent) {
4270 /* If it's an integrated device, allow ATS */
4273 /* Connected via non-PCIe: no ATS */
4274 if (!pci_is_pcie(bridge) ||
4275 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4277 /* If we found the root port, look it up in the ATSR */
4278 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4283 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4284 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4285 if (atsr->segment != pci_domain_nr(dev->bus))
4288 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4289 if (tmp == &bridge->dev)
4292 if (atsru->include_all)
4302 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4305 struct dmar_rmrr_unit *rmrru;
4306 struct dmar_atsr_unit *atsru;
4307 struct acpi_dmar_atsr *atsr;
4308 struct acpi_dmar_reserved_memory *rmrr;
4310 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4313 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4314 rmrr = container_of(rmrru->hdr,
4315 struct acpi_dmar_reserved_memory, header);
4316 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4317 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4318 ((void *)rmrr) + rmrr->header.length,
4319 rmrr->segment, rmrru->devices,
4320 rmrru->devices_cnt);
4323 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4324 dmar_remove_dev_scope(info, rmrr->segment,
4325 rmrru->devices, rmrru->devices_cnt);
4329 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4330 if (atsru->include_all)
4333 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4334 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4335 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4336 (void *)atsr + atsr->header.length,
4337 atsr->segment, atsru->devices,
4338 atsru->devices_cnt);
4343 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4344 if (dmar_remove_dev_scope(info, atsr->segment,
4345 atsru->devices, atsru->devices_cnt))
4353 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4354 unsigned long val, void *v)
4356 struct memory_notify *mhp = v;
4357 unsigned long long start, end;
4358 unsigned long start_vpfn, last_vpfn;
4361 case MEM_GOING_ONLINE:
4362 start = mhp->start_pfn << PAGE_SHIFT;
4363 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4364 if (iommu_domain_identity_map(si_domain, start, end)) {
4365 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4372 case MEM_CANCEL_ONLINE:
4373 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4374 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4375 while (start_vpfn <= last_vpfn) {
4377 struct dmar_drhd_unit *drhd;
4378 struct intel_iommu *iommu;
4379 struct page *freelist;
4381 iova = find_iova(&si_domain->iovad, start_vpfn);
4383 pr_debug("Failed get IOVA for PFN %lx\n",
4388 iova = split_and_remove_iova(&si_domain->iovad, iova,
4389 start_vpfn, last_vpfn);
4391 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4392 start_vpfn, last_vpfn);
4396 freelist = domain_unmap(si_domain, iova->pfn_lo,
4400 for_each_active_iommu(iommu, drhd)
4401 iommu_flush_iotlb_psi(iommu, si_domain,
4402 iova->pfn_lo, iova_size(iova),
4405 dma_free_pagelist(freelist);
4407 start_vpfn = iova->pfn_hi + 1;
4408 free_iova_mem(iova);
4416 static struct notifier_block intel_iommu_memory_nb = {
4417 .notifier_call = intel_iommu_memory_notifier,
4421 static void free_all_cpu_cached_iovas(unsigned int cpu)
4425 for (i = 0; i < g_num_of_iommus; i++) {
4426 struct intel_iommu *iommu = g_iommus[i];
4427 struct dmar_domain *domain;
4433 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4434 domain = get_iommu_domain(iommu, (u16)did);
4438 free_cpu_cached_iovas(cpu, &domain->iovad);
4443 static int intel_iommu_cpu_dead(unsigned int cpu)
4445 free_all_cpu_cached_iovas(cpu);
4449 static void intel_disable_iommus(void)
4451 struct intel_iommu *iommu = NULL;
4452 struct dmar_drhd_unit *drhd;
4454 for_each_iommu(iommu, drhd)
4455 iommu_disable_translation(iommu);
4458 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4460 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4462 return container_of(iommu_dev, struct intel_iommu, iommu);
4465 static ssize_t intel_iommu_show_version(struct device *dev,
4466 struct device_attribute *attr,
4469 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4470 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4471 return sprintf(buf, "%d:%d\n",
4472 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4474 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4476 static ssize_t intel_iommu_show_address(struct device *dev,
4477 struct device_attribute *attr,
4480 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4481 return sprintf(buf, "%llx\n", iommu->reg_phys);
4483 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4485 static ssize_t intel_iommu_show_cap(struct device *dev,
4486 struct device_attribute *attr,
4489 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4490 return sprintf(buf, "%llx\n", iommu->cap);
4492 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4494 static ssize_t intel_iommu_show_ecap(struct device *dev,
4495 struct device_attribute *attr,
4498 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4499 return sprintf(buf, "%llx\n", iommu->ecap);
4501 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4503 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4504 struct device_attribute *attr,
4507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4508 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4510 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4512 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4513 struct device_attribute *attr,
4516 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4517 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4518 cap_ndoms(iommu->cap)));
4520 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4522 static struct attribute *intel_iommu_attrs[] = {
4523 &dev_attr_version.attr,
4524 &dev_attr_address.attr,
4526 &dev_attr_ecap.attr,
4527 &dev_attr_domains_supported.attr,
4528 &dev_attr_domains_used.attr,
4532 static struct attribute_group intel_iommu_group = {
4533 .name = "intel-iommu",
4534 .attrs = intel_iommu_attrs,
4537 const struct attribute_group *intel_iommu_groups[] = {
4542 static int __init platform_optin_force_iommu(void)
4544 struct pci_dev *pdev = NULL;
4545 bool has_untrusted_dev = false;
4547 if (!dmar_platform_optin() || no_platform_optin)
4550 for_each_pci_dev(pdev) {
4551 if (pdev->untrusted) {
4552 has_untrusted_dev = true;
4557 if (!has_untrusted_dev)
4560 if (no_iommu || dmar_disabled)
4561 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4564 * If Intel-IOMMU is disabled by default, we will apply identity
4565 * map for all devices except those marked as being untrusted.
4568 iommu_identity_mapping |= IDENTMAP_ALL;
4571 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4579 static int __init probe_acpi_namespace_devices(void)
4581 struct dmar_drhd_unit *drhd;
4582 /* To avoid a -Wunused-but-set-variable warning. */
4583 struct intel_iommu *iommu __maybe_unused;
4587 for_each_active_iommu(iommu, drhd) {
4588 for_each_active_dev_scope(drhd->devices,
4589 drhd->devices_cnt, i, dev) {
4590 struct acpi_device_physical_node *pn;
4591 struct iommu_group *group;
4592 struct acpi_device *adev;
4594 if (dev->bus != &acpi_bus_type)
4597 adev = to_acpi_device(dev);
4598 mutex_lock(&adev->physical_node_lock);
4599 list_for_each_entry(pn,
4600 &adev->physical_node_list, node) {
4601 group = iommu_group_get(pn->dev);
4603 iommu_group_put(group);
4607 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4608 ret = iommu_probe_device(pn->dev);
4612 mutex_unlock(&adev->physical_node_lock);
4622 int __init intel_iommu_init(void)
4625 struct dmar_drhd_unit *drhd;
4626 struct intel_iommu *iommu;
4629 * Intel IOMMU is required for a TXT/tboot launch or platform
4630 * opt in, so enforce that.
4632 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4634 if (iommu_init_mempool()) {
4636 panic("tboot: Failed to initialize iommu memory\n");
4640 down_write(&dmar_global_lock);
4641 if (dmar_table_init()) {
4643 panic("tboot: Failed to initialize DMAR table\n");
4647 if (dmar_dev_scope_init() < 0) {
4649 panic("tboot: Failed to initialize DMAR device scope\n");
4653 up_write(&dmar_global_lock);
4656 * The bus notifier takes the dmar_global_lock, so lockdep will
4657 * complain later when we register it under the lock.
4659 dmar_register_bus_notifier();
4661 down_write(&dmar_global_lock);
4663 if (no_iommu || dmar_disabled) {
4665 * We exit the function here to ensure IOMMU's remapping and
4666 * mempool aren't set up, which means that the IOMMU's PMRs
4667 * won't be disabled via the call to init_dmars(). So disable
4668 * them explicitly here. The PMRs were set up by tboot prior to
4669 * calling SENTER, but the kernel is expected to reset/tear them down.
4672 if (intel_iommu_tboot_noforce) {
4673 for_each_iommu(iommu, drhd)
4674 iommu_disable_protect_mem_regions(iommu);
4678 * Make sure the IOMMUs are switched off, even when we
4679 * boot into a kexec kernel and the previous kernel left them enabled.
4682 intel_disable_iommus();
4686 if (list_empty(&dmar_rmrr_units))
4687 pr_info("No RMRR found\n");
4689 if (list_empty(&dmar_atsr_units))
4690 pr_info("No ATSR found\n");
4692 if (dmar_init_reserved_ranges()) {
4694 panic("tboot: Failed to reserve iommu ranges\n");
4695 goto out_free_reserved_range;
4699 intel_iommu_gfx_mapped = 1;
4701 init_no_remapping_devices();
4706 panic("tboot: Failed to initialize DMARs\n");
4707 pr_err("Initialization failed\n");
4708 goto out_free_reserved_range;
4710 up_write(&dmar_global_lock);
4712 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4715 dma_ops = &intel_dma_ops;
4717 init_iommu_pm_ops();
4719 for_each_active_iommu(iommu, drhd) {
4720 iommu_device_sysfs_add(&iommu->iommu, NULL,
4723 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4724 iommu_device_register(&iommu->iommu);
4727 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4728 if (si_domain && !hw_pass_through)
4729 register_memory_notifier(&intel_iommu_memory_nb);
4730 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4731 intel_iommu_cpu_dead);
4733 down_read(&dmar_global_lock);
4734 if (probe_acpi_namespace_devices())
4735 pr_warn("ACPI name space devices didn't probe correctly\n");
4736 up_read(&dmar_global_lock);
4738 /* Finally, we enable the DMA remapping hardware. */
4739 for_each_iommu(iommu, drhd) {
4740 if (!drhd->ignored && !translation_pre_enabled(iommu))
4741 iommu_enable_translation(iommu);
4743 iommu_disable_protect_mem_regions(iommu);
4745 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4747 intel_iommu_enabled = 1;
4748 intel_iommu_debugfs_init();
4752 out_free_reserved_range:
4753 put_iova_domain(&reserved_iova_list);
4755 intel_iommu_free_dmars();
4756 up_write(&dmar_global_lock);
4757 iommu_exit_mempool();
4761 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4763 struct intel_iommu *iommu = opaque;
4765 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4770 * NB - intel-iommu lacks any sort of reference counting for the users of
4771 * dependent devices. If multiple endpoints have intersecting dependent
4772 * devices, unbinding the driver from any one of them will possibly leave
4773 * the others unable to operate.
4775 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4777 if (!iommu || !dev || !dev_is_pci(dev))
4780 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4783 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4785 struct dmar_domain *domain;
4786 struct intel_iommu *iommu;
4787 unsigned long flags;
4789 assert_spin_locked(&device_domain_lock);
4794 iommu = info->iommu;
4795 domain = info->domain;
4798 if (dev_is_pci(info->dev) && sm_supported(iommu))
4799 intel_pasid_tear_down_entry(iommu, info->dev,
4802 iommu_disable_dev_iotlb(info);
4803 domain_context_clear(iommu, info->dev);
4804 intel_pasid_free_table(info->dev);
4807 unlink_domain_info(info);
4809 spin_lock_irqsave(&iommu->lock, flags);
4810 domain_detach_iommu(domain, iommu);
4811 spin_unlock_irqrestore(&iommu->lock, flags);
4813 /* free the private domain */
4814 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4815 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4816 domain_exit(info->domain);
4818 free_devinfo_mem(info);
4821 static void dmar_remove_one_dev_info(struct device *dev)
4823 struct device_domain_info *info;
4824 unsigned long flags;
4826 spin_lock_irqsave(&device_domain_lock, flags);
4827 info = dev->archdata.iommu;
4828 __dmar_remove_one_dev_info(info);
4829 spin_unlock_irqrestore(&device_domain_lock, flags);
4832 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4834 struct dmar_domain *dmar_domain;
4835 struct iommu_domain *domain;
4838 case IOMMU_DOMAIN_DMA:
4840 case IOMMU_DOMAIN_UNMANAGED:
4841 dmar_domain = alloc_domain(0);
4843 pr_err("Can't allocate dmar_domain\n");
4846 if (domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4847 pr_err("Domain initialization failed\n");
4848 domain_exit(dmar_domain);
4852 if (type == IOMMU_DOMAIN_DMA &&
4853 init_iova_flush_queue(&dmar_domain->iovad,
4854 iommu_flush_iova, iova_entry_free)) {
4855 pr_warn("iova flush queue initialization failed\n");
4856 intel_iommu_strict = 1;
4859 domain_update_iommu_cap(dmar_domain);
4861 domain = &dmar_domain->domain;
4862 domain->geometry.aperture_start = 0;
4863 domain->geometry.aperture_end =
4864 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4865 domain->geometry.force_aperture = true;
4868 case IOMMU_DOMAIN_IDENTITY:
4869 return &si_domain->domain;
4877 static void intel_iommu_domain_free(struct iommu_domain *domain)
4879 if (domain != &si_domain->domain)
4880 domain_exit(to_dmar_domain(domain));
4884 * Check whether a @domain could be attached to the @dev through the
4885 * aux-domain attach/detach APIs.
4888 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4890 struct device_domain_info *info = dev->archdata.iommu;
4892 return info && info->auxd_enabled &&
4893 domain->type == IOMMU_DOMAIN_UNMANAGED;
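/*
 * Hedged usage sketch of the aux-domain path below, as a caller such as
 * an mdev parent driver might exercise it; "dev", "domain", "pasid" and
 * the omitted error handling are illustrative, while the iommu core
 * entry points are the ones that dispatch into the aux_attach_dev /
 * aux_detach_dev / aux_get_pasid callbacks of this driver:
 *
 *	if (iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX))
 *		return -ENODEV;
 *	domain = iommu_domain_alloc(dev->bus);	   (IOMMU_DOMAIN_UNMANAGED)
 *	ret = iommu_aux_attach_device(domain, dev);
 *	pasid = iommu_aux_get_pasid(domain, dev);  (tag DMA with this PASID)
 *	...
 *	iommu_aux_detach_device(domain, dev);
 *	iommu_domain_free(domain);
 */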
4896 static void auxiliary_link_device(struct dmar_domain *domain,
4899 struct device_domain_info *info = dev->archdata.iommu;
4901 assert_spin_locked(&device_domain_lock);
4905 domain->auxd_refcnt++;
4906 list_add(&domain->auxd, &info->auxiliary_domains);
4909 static void auxiliary_unlink_device(struct dmar_domain *domain,
4912 struct device_domain_info *info = dev->archdata.iommu;
4914 assert_spin_locked(&device_domain_lock);
4918 list_del(&domain->auxd);
4919 domain->auxd_refcnt--;
4921 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4922 intel_pasid_free_id(domain->default_pasid);
4925 static int aux_domain_add_dev(struct dmar_domain *domain,
4930 unsigned long flags;
4931 struct intel_iommu *iommu;
4933 iommu = device_to_iommu(dev, &bus, &devfn);
4937 if (domain->default_pasid <= 0) {
4940 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4941 pci_max_pasids(to_pci_dev(dev)),
4944 pr_err("Can't allocate default pasid\n");
4947 domain->default_pasid = pasid;
4950 spin_lock_irqsave(&device_domain_lock, flags);
4952	 * iommu->lock must be held to attach the domain to the iommu and to set up
4953	 * the PASID entry for second-level translation.
4955 spin_lock(&iommu->lock);
4956 ret = domain_attach_iommu(domain, iommu);
4960	/* Set up the PASID entry for mediated devices: */
4961 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4962 domain->default_pasid);
4965 spin_unlock(&iommu->lock);
4967 auxiliary_link_device(domain, dev);
4969 spin_unlock_irqrestore(&device_domain_lock, flags);
4974 domain_detach_iommu(domain, iommu);
4976 spin_unlock(&iommu->lock);
4977 spin_unlock_irqrestore(&device_domain_lock, flags);
4978 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4979 intel_pasid_free_id(domain->default_pasid);
4984 static void aux_domain_remove_dev(struct dmar_domain *domain,
4987 struct device_domain_info *info;
4988 struct intel_iommu *iommu;
4989 unsigned long flags;
4991 if (!is_aux_domain(dev, &domain->domain))
4994 spin_lock_irqsave(&device_domain_lock, flags);
4995 info = dev->archdata.iommu;
4996 iommu = info->iommu;
4998 auxiliary_unlink_device(domain, dev);
5000 spin_lock(&iommu->lock);
5001 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5002 domain_detach_iommu(domain, iommu);
5003 spin_unlock(&iommu->lock);
5005 spin_unlock_irqrestore(&device_domain_lock, flags);
5008 static int prepare_domain_attach_device(struct iommu_domain *domain,
5011 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5012 struct intel_iommu *iommu;
5016 iommu = device_to_iommu(dev, &bus, &devfn);
5020 /* check if this iommu agaw is sufficient for max mapped address */
5021 addr_width = agaw_to_width(iommu->agaw);
5022 if (addr_width > cap_mgaw(iommu->cap))
5023 addr_width = cap_mgaw(iommu->cap);
5025 if (dmar_domain->max_addr > (1LL << addr_width)) {
5026 dev_err(dev, "%s: iommu width (%d) is not "
5027 "sufficient for the mapped address (%llx)\n",
5028 __func__, addr_width, dmar_domain->max_addr);
5031 dmar_domain->gaw = addr_width;
5034 * Knock out extra levels of page tables if necessary
5036 while (iommu->agaw < dmar_domain->agaw) {
5037 struct dma_pte *pte;
5039 pte = dmar_domain->pgd;
5040 if (dma_pte_present(pte)) {
5041 dmar_domain->pgd = (struct dma_pte *)
5042 phys_to_virt(dma_pte_addr(pte));
5043 free_pgtable_page(pte);
5045 dmar_domain->agaw--;
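/*
 * Illustrative numbers for the loop above: a domain built with 4-level
 * tables (agaw 2, a 48-bit width per agaw_to_width()) being attached to
 * an IOMMU that only supports 3-level tables (agaw 1, 39-bit) frees its
 * top-level table once, so the remaining hierarchy matches what the
 * hardware can actually walk.
 */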
5051 static int intel_iommu_attach_device(struct iommu_domain *domain,
5056 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5057 device_is_rmrr_locked(dev)) {
5058 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5062 if (is_aux_domain(dev, domain))
5065 /* normally dev is not mapped */
5066 if (unlikely(domain_context_mapped(dev))) {
5067 struct dmar_domain *old_domain;
5069 old_domain = find_domain(dev);
5071 dmar_remove_one_dev_info(dev);
5074 ret = prepare_domain_attach_device(domain, dev);
5078 return domain_add_dev_info(to_dmar_domain(domain), dev);
5081 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5086 if (!is_aux_domain(dev, domain))
5089 ret = prepare_domain_attach_device(domain, dev);
5093 return aux_domain_add_dev(to_dmar_domain(domain), dev);
5096 static void intel_iommu_detach_device(struct iommu_domain *domain,
5099 dmar_remove_one_dev_info(dev);
5102 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5105 aux_domain_remove_dev(to_dmar_domain(domain), dev);
5108 static int intel_iommu_map(struct iommu_domain *domain,
5109 unsigned long iova, phys_addr_t hpa,
5110 size_t size, int iommu_prot)
5112 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5117 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5120 if (iommu_prot & IOMMU_READ)
5121 prot |= DMA_PTE_READ;
5122 if (iommu_prot & IOMMU_WRITE)
5123 prot |= DMA_PTE_WRITE;
5124 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5125 prot |= DMA_PTE_SNP;
5127 max_addr = iova + size;
5128 if (dmar_domain->max_addr < max_addr) {
5131 /* check if minimum agaw is sufficient for mapped address */
5132 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5133 if (end < max_addr) {
5134 pr_err("%s: iommu width (%d) is not "
5135 "sufficient for the mapped address (%llx)\n",
5136 __func__, dmar_domain->gaw, max_addr);
5139 dmar_domain->max_addr = max_addr;
5141	/* Round size up to the next multiple of PAGE_SIZE if it, together with
5142	   the low bits of hpa, would take the mapping onto the next page */
5143 size = aligned_nrpages(hpa, size);
5144 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5145 hpa >> VTD_PAGE_SHIFT, size, prot);
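/*
 * Worked example of the rounding above, with arbitrary values: for
 * hpa = 0x12345ff0 and size = 0x20, the low bits of hpa (0xff0) plus the
 * size cross a page boundary, so aligned_nrpages() returns 2 and
 * domain_pfn_mapping() is asked to map two page frames starting at host
 * pfn 0x12345 even though size itself is well below a page.
 */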
5149 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5150 unsigned long iova, size_t size)
5152 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5153 struct page *freelist = NULL;
5154 unsigned long start_pfn, last_pfn;
5155 unsigned int npages;
5156 int iommu_id, level = 0;
5158	/* Cope with the horrid API, which requires us to unmap more than the
5159	   size argument if the IOVA happens to fall within a large-page mapping. */
5160 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5161 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5164 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5165 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5167 start_pfn = iova >> VTD_PAGE_SHIFT;
5168 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5170 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5172 npages = last_pfn - start_pfn + 1;
5174 for_each_domain_iommu(iommu_id, dmar_domain)
5175 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5176 start_pfn, npages, !freelist, 0);
5178 dma_free_pagelist(freelist);
5180 if (dmar_domain->max_addr == iova + size)
5181 dmar_domain->max_addr = iova;
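/*
 * Example of the size widening above, with hypothetical numbers: if the
 * IOVA sits inside a 2MiB superpage, pfn_to_dma_pte() reports level 2,
 * so even a 4KiB unmap request is widened to
 * VTD_PAGE_SIZE << level_to_offset_bits(2) = 2MiB; the whole superpage
 * is torn down and 2MiB is returned to the caller as the unmapped size.
 */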
5186 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5189 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5190 struct dma_pte *pte;
5194 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5197 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5199 phys = dma_pte_addr(pte);
5204 static inline bool scalable_mode_support(void)
5206 struct dmar_drhd_unit *drhd;
5207 struct intel_iommu *iommu;
5211 for_each_active_iommu(iommu, drhd) {
5212 if (!sm_supported(iommu)) {
5222 static inline bool iommu_pasid_support(void)
5224 struct dmar_drhd_unit *drhd;
5225 struct intel_iommu *iommu;
5229 for_each_active_iommu(iommu, drhd) {
5230 if (!pasid_supported(iommu)) {
5240 static bool intel_iommu_capable(enum iommu_cap cap)
5242 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5243 return domain_update_iommu_snooping(NULL) == 1;
5244 if (cap == IOMMU_CAP_INTR_REMAP)
5245 return irq_remapping_enabled == 1;
5250 static int intel_iommu_add_device(struct device *dev)
5252 struct dmar_domain *dmar_domain;
5253 struct iommu_domain *domain;
5254 struct intel_iommu *iommu;
5255 struct iommu_group *group;
5259 iommu = device_to_iommu(dev, &bus, &devfn);
5263 iommu_device_link(&iommu->iommu, dev);
5265 if (translation_pre_enabled(iommu))
5266 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5268 group = iommu_group_get_for_dev(dev);
5271 return PTR_ERR(group);
5273 iommu_group_put(group);
5275 domain = iommu_get_domain_for_dev(dev);
5276 dmar_domain = to_dmar_domain(domain);
5277 if (domain->type == IOMMU_DOMAIN_DMA) {
5278 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5279 ret = iommu_request_dm_for_dev(dev);
5281 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5282 domain_add_dev_info(si_domain, dev);
5284 "Device uses a private identity domain.\n");
5288 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5289 ret = iommu_request_dma_domain_for_dev(dev);
5291 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5292 if (!get_private_domain_for_dev(dev)) {
5294 "Failed to get a private domain.\n");
5299 "Device uses a private dma domain.\n");
5307 static void intel_iommu_remove_device(struct device *dev)
5309 struct intel_iommu *iommu;
5312 iommu = device_to_iommu(dev, &bus, &devfn);
5316 iommu_group_remove_device(dev);
5318 iommu_device_unlink(&iommu->iommu, dev);
5321 static void intel_iommu_get_resv_regions(struct device *device,
5322 struct list_head *head)
5324 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5325 struct iommu_resv_region *reg;
5326 struct dmar_rmrr_unit *rmrr;
5327 struct device *i_dev;
5330 down_read(&dmar_global_lock);
5331 for_each_rmrr_units(rmrr) {
5332 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5334 struct iommu_resv_region *resv;
5335 enum iommu_resv_type type;
5338 if (i_dev != device &&
5339 !is_downstream_to_pci_bridge(device, i_dev))
5342 length = rmrr->end_address - rmrr->base_address + 1;
5344 type = device_rmrr_is_relaxable(device) ?
5345 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5347 resv = iommu_alloc_resv_region(rmrr->base_address,
5348 length, prot, type);
5352 list_add_tail(&resv->list, head);
5355 up_read(&dmar_global_lock);
5357 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5358 if (dev_is_pci(device)) {
5359 struct pci_dev *pdev = to_pci_dev(device);
5361 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5362 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5365	list_add_tail(&reg->list, head);
5368 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5370 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5371 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5375	list_add_tail(&reg->list, head);
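/*
 * Hedged usage sketch (the list name and error handling are
 * illustrative): the regions built above are consumed through the
 * generic iommu core helpers, and the MSI window reserved here is also
 * what typically shows up in the per-group sysfs file:
 *
 *	LIST_HEAD(resv);
 *	iommu_get_resv_regions(dev, &resv);
 *	... walk the list, e.g. to carve holes out of an IOVA space ...
 *	iommu_put_resv_regions(dev, &resv);
 *
 *	$ cat /sys/kernel/iommu_groups/<n>/reserved_regions
 *	0x00000000fee00000 0x00000000feefffff msi   (approximate format)
 */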
5378 static void intel_iommu_put_resv_regions(struct device *dev,
5379 struct list_head *head)
5381 struct iommu_resv_region *entry, *next;
5383 list_for_each_entry_safe(entry, next, head, list)
5387 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5389 struct device_domain_info *info;
5390 struct context_entry *context;
5391 struct dmar_domain *domain;
5392 unsigned long flags;
5396 domain = find_domain(dev);
5400 spin_lock_irqsave(&device_domain_lock, flags);
5401 spin_lock(&iommu->lock);
5404 info = dev->archdata.iommu;
5405 if (!info || !info->pasid_supported)
5408 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5409 if (WARN_ON(!context))
5412 ctx_lo = context[0].lo;
5414 if (!(ctx_lo & CONTEXT_PASIDE)) {
5415 ctx_lo |= CONTEXT_PASIDE;
5416 context[0].lo = ctx_lo;
5418 iommu->flush.flush_context(iommu,
5419 domain->iommu_did[iommu->seq_id],
5420 PCI_DEVID(info->bus, info->devfn),
5421 DMA_CCMD_MASK_NOBIT,
5422 DMA_CCMD_DEVICE_INVL);
5425 /* Enable PASID support in the device, if it wasn't already */
5426 if (!info->pasid_enabled)
5427 iommu_enable_dev_iotlb(info);
5432 spin_unlock(&iommu->lock);
5433 spin_unlock_irqrestore(&device_domain_lock, flags);
5438 static void intel_iommu_apply_resv_region(struct device *dev,
5439 struct iommu_domain *domain,
5440 struct iommu_resv_region *region)
5442 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5443 unsigned long start, end;
5445 start = IOVA_PFN(region->start);
5446 end = IOVA_PFN(region->start + region->length - 1);
5448 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5451 #ifdef CONFIG_INTEL_IOMMU_SVM
5452 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5454 struct intel_iommu *iommu;
5457 if (iommu_dummy(dev)) {
5459 "No IOMMU translation for device; cannot enable SVM\n");
5463 iommu = device_to_iommu(dev, &bus, &devfn);
5465 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5471 #endif /* CONFIG_INTEL_IOMMU_SVM */
5473 static int intel_iommu_enable_auxd(struct device *dev)
5475 struct device_domain_info *info;
5476 struct intel_iommu *iommu;
5477 unsigned long flags;
5481 iommu = device_to_iommu(dev, &bus, &devfn);
5482 if (!iommu || dmar_disabled)
5485 if (!sm_supported(iommu) || !pasid_supported(iommu))
5488 ret = intel_iommu_enable_pasid(iommu, dev);
5492 spin_lock_irqsave(&device_domain_lock, flags);
5493 info = dev->archdata.iommu;
5494 info->auxd_enabled = 1;
5495 spin_unlock_irqrestore(&device_domain_lock, flags);
5500 static int intel_iommu_disable_auxd(struct device *dev)
5502 struct device_domain_info *info;
5503 unsigned long flags;
5505 spin_lock_irqsave(&device_domain_lock, flags);
5506 info = dev->archdata.iommu;
5507 if (!WARN_ON(!info))
5508 info->auxd_enabled = 0;
5509 spin_unlock_irqrestore(&device_domain_lock, flags);
5515 * A PCI Express designated vendor specific extended capability (DVSEC) is
5516 * defined in section 3.7 of the Intel scalable I/O virtualization technical
5517 * spec so that system software and tools can detect endpoint devices that
5518 * support Intel scalable I/O virtualization without a host driver dependency.
5520 * Returns the address of the matching extended capability structure within
5521 * the device's PCI configuration space, or 0 if the device does not support it.
5524 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5529 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5531 pci_read_config_word(pdev, pos + 4, &vendor);
5532 pci_read_config_word(pdev, pos + 8, &id);
5533 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5536 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
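/*
 * Layout the loop above relies on (offsets are relative to the position
 * returned by pci_find_next_ext_capability(); 0x23 is the DVSEC extended
 * capability ID):
 *
 *	pos + 0x0: extended capability header
 *	pos + 0x4: DVSEC header 1, low 16 bits = vendor ID (0x8086 here)
 *	pos + 0x8: DVSEC header 2, low 16 bits = DVSEC ID (5 for SIOV)
 *
 * A device may expose several DVSEC structures, hence the walk rather
 * than a single lookup.
 */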
5543 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5545 if (feat == IOMMU_DEV_FEAT_AUX) {
5548 if (!dev_is_pci(dev) || dmar_disabled ||
5549 !scalable_mode_support() || !iommu_pasid_support())
5552 ret = pci_pasid_features(to_pci_dev(dev));
5556 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5563 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5565 if (feat == IOMMU_DEV_FEAT_AUX)
5566 return intel_iommu_enable_auxd(dev);
5572 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5574 if (feat == IOMMU_DEV_FEAT_AUX)
5575 return intel_iommu_disable_auxd(dev);
5581 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5583 struct device_domain_info *info = dev->archdata.iommu;
5585 if (feat == IOMMU_DEV_FEAT_AUX)
5586 return scalable_mode_support() && info && info->auxd_enabled;
5592 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5594 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5596 return dmar_domain->default_pasid > 0 ?
5597 dmar_domain->default_pasid : -EINVAL;
5600 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5603 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5606 const struct iommu_ops intel_iommu_ops = {
5607 .capable = intel_iommu_capable,
5608 .domain_alloc = intel_iommu_domain_alloc,
5609 .domain_free = intel_iommu_domain_free,
5610 .attach_dev = intel_iommu_attach_device,
5611 .detach_dev = intel_iommu_detach_device,
5612 .aux_attach_dev = intel_iommu_aux_attach_device,
5613 .aux_detach_dev = intel_iommu_aux_detach_device,
5614 .aux_get_pasid = intel_iommu_aux_get_pasid,
5615 .map = intel_iommu_map,
5616 .unmap = intel_iommu_unmap,
5617 .iova_to_phys = intel_iommu_iova_to_phys,
5618 .add_device = intel_iommu_add_device,
5619 .remove_device = intel_iommu_remove_device,
5620 .get_resv_regions = intel_iommu_get_resv_regions,
5621 .put_resv_regions = intel_iommu_put_resv_regions,
5622 .apply_resv_region = intel_iommu_apply_resv_region,
5623 .device_group = pci_device_group,
5624 .dev_has_feat = intel_iommu_dev_has_feat,
5625 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5626 .dev_enable_feat = intel_iommu_dev_enable_feat,
5627 .dev_disable_feat = intel_iommu_dev_disable_feat,
5628 .is_attach_deferred = intel_iommu_is_attach_deferred,
5629 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5632 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5634 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5635 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5641 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5642 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5647 static void quirk_iommu_rwbf(struct pci_dev *dev)
5650 * Mobile 4 Series Chipset neglects to set RWBF capability,
5651 * but needs it. Same seems to hold for the desktop versions.
5653 pci_info(dev, "Forcing write-buffer flush capability\n");
5657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5659 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5660 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5666 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5667 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5668 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5669 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
5670 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5671 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5672 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5673 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
5675 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5679 if (pci_read_config_word(dev, GGC, &ggc))
5682 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5683 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5685 } else if (dmar_map_gfx) {
5686 /* we have to ensure the gfx device is idle before we flush */
5687 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5688 intel_iommu_strict = 1;
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5696 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5697 ISOCH DMAR unit for the Azalia sound device, but not give it any
5698 TLB entries, which causes it to deadlock. Check for that. We do
5699 this in a function called from init_dmars(), instead of in a PCI
5700 quirk, because we don't want to print the obnoxious "BIOS broken"
5701 message if VT-d is actually disabled.
5703 static void __init check_tylersburg_isoch(void)
5705 struct pci_dev *pdev;
5706 uint32_t vtisochctrl;
5708 /* If there's no Azalia in the system anyway, forget it. */
5709 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5714 /* System Management Registers. Might be hidden, in which case
5715 we can't do the sanity check. But that's OK, because the
5716 known-broken BIOSes _don't_ actually hide it, so far. */
5717 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5721 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5728 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5729 if (vtisochctrl & 1)
5732 /* Drop all bits other than the number of TLB entries */
5733 vtisochctrl &= 0x1c;
5735 /* If we have the recommended number of TLB entries (16), fine. */
5736 if (vtisochctrl == 0x10)
5739 /* Zero TLB entries? You get to ride the short bus to school. */
5741 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5742 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5743 dmi_get_system_info(DMI_BIOS_VENDOR),
5744 dmi_get_system_info(DMI_BIOS_VERSION),
5745 dmi_get_system_info(DMI_PRODUCT_VERSION));
5746 iommu_identity_mapping |= IDENTMAP_AZALIA;
5750 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",