1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87 * This bitmap is used to advertise the page sizes our hardware support
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is an order of a 4KiB page and that the
94 * mapping has natural alignment.
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are an order of 4KiB.
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 static inline unsigned long level_mask(int level)
131 return -1UL << level_to_offset_bits(level);
134 static inline unsigned long level_size(int level)
136 return 1UL << level_to_offset_bits(level);
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
141 return (pfn + level_size(level) - 1) & level_mask(level);
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
176 * set to 1 to panic kernel if can't successfully enable VT-d
177 * (used when kernel is launched w/ TXT)
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
282 static inline void context_clear_entry(struct context_entry *context)
289 * This domain is a statically identity mapping domain.
290 * 1. This domain creats a static 1:1 mapping to all usable memory.
291 * 2. It maps to each iommu if successful.
292 * 3. Each iommu mapps to this domain if successful.
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
297 /* si_domain contains mulitple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301 * This is a DMA domain allocated through the iommu domain allocation
302 * interface. But one or more devices belonging to this domain have
303 * been chosen to use a private domain. We should avoid to use the
304 * map/unmap/iova_to_phys APIs on it.
306 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* bitmap for indexing intel_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343 struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
364 #define IDENTMAP_ALL 1
365 #define IDENTMAP_GFX 2
366 #define IDENTMAP_AZALIA 4
368 int intel_iommu_gfx_mapped;
369 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
372 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
377 * Iterate over elements in device_domain_list and call the specified
378 * callback @fn against each element.
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381 void *data), void *data)
385 struct device_domain_info *info;
387 spin_lock_irqsave(&device_domain_lock, flags);
388 list_for_each_entry(info, &device_domain_list, global) {
389 ret = fn(info, data);
391 spin_unlock_irqrestore(&device_domain_lock, flags);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
400 const struct iommu_ops intel_iommu_ops;
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
404 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
409 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 static void init_translation_status(struct intel_iommu *iommu)
416 gsts = readl(iommu->reg + DMAR_GSTS_REG);
417 if (gsts & DMA_GSTS_TES)
418 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
424 return container_of(dom, struct dmar_domain, domain);
427 static int __init intel_iommu_setup(char *str)
432 if (!strncmp(str, "on", 2)) {
434 pr_info("IOMMU enabled\n");
435 } else if (!strncmp(str, "off", 3)) {
437 no_platform_optin = 1;
438 pr_info("IOMMU disabled\n");
439 } else if (!strncmp(str, "igfx_off", 8)) {
441 pr_info("Disable GFX device mapping\n");
442 } else if (!strncmp(str, "forcedac", 8)) {
443 pr_info("Forcing DAC for PCI devices\n");
445 } else if (!strncmp(str, "strict", 6)) {
446 pr_info("Disable batched IOTLB flush\n");
447 intel_iommu_strict = 1;
448 } else if (!strncmp(str, "sp_off", 6)) {
449 pr_info("Disable supported super page\n");
450 intel_iommu_superpage = 0;
451 } else if (!strncmp(str, "sm_on", 5)) {
452 pr_info("Intel-IOMMU: scalable mode supported\n");
454 } else if (!strncmp(str, "tboot_noforce", 13)) {
456 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 intel_iommu_tboot_noforce = 1;
460 str += strcspn(str, ",");
466 __setup("intel_iommu=", intel_iommu_setup);
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 struct dmar_domain **domains;
476 domains = iommu->domains[idx];
480 return domains[did & 0xff];
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 struct dmar_domain *domain)
486 struct dmar_domain **domains;
489 if (!iommu->domains[idx]) {
490 size_t size = 256 * sizeof(struct dmar_domain *);
491 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494 domains = iommu->domains[idx];
495 if (WARN_ON(!domains))
498 domains[did & 0xff] = domain;
501 void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_si(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
545 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
547 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
555 sagaw = cap_sagaw(iommu->cap);
556 for (agaw = width_to_agaw(max_gaw);
558 if (test_bit(agaw, &sagaw))
566 * Calculate max SAGAW for each iommu.
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
570 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
574 * calculate agaw for each iommu.
575 * "SAGAW" may be different across iommus, use a default agaw, and
576 * get a supported less agaw for iommus that don't support the default agaw.
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 /* This functionin only returns single iommu in a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
588 /* si_domain and vm domain should not get here. */
589 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592 for_each_domain_iommu(iommu_id, domain)
595 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598 return g_iommus[iommu_id];
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 struct dmar_drhd_unit *drhd;
604 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_domain_iommu(i, domain) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686 struct root_entry *root = &iommu->root_entry[bus];
687 struct context_entry *context;
691 if (sm_supported(iommu)) {
699 context = phys_to_virt(*entry & VTD_PAGE_MASK);
701 unsigned long phy_addr;
705 context = alloc_pgtable_page(iommu->node);
709 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710 phy_addr = virt_to_phys((void *)context);
711 *entry = phy_addr | 1;
712 __iommu_flush_cache(iommu, entry, sizeof(*entry));
714 return &context[devfn];
717 static int iommu_dummy(struct device *dev)
719 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
723 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
724 * sub-hierarchy of a candidate PCI-PCI bridge
725 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
726 * @bridge: the candidate PCI-PCI bridge
728 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
733 struct pci_dev *pdev, *pbridge;
735 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738 pdev = to_pci_dev(dev);
739 pbridge = to_pci_dev(bridge);
741 if (pbridge->subordinate &&
742 pbridge->subordinate->number <= pdev->bus->number &&
743 pbridge->subordinate->busn_res.end >= pdev->bus->number)
749 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 struct dmar_drhd_unit *drhd = NULL;
752 struct intel_iommu *iommu;
754 struct pci_dev *pdev = NULL;
758 if (iommu_dummy(dev))
761 if (dev_is_pci(dev)) {
762 struct pci_dev *pf_pdev;
764 pdev = to_pci_dev(dev);
767 /* VMD child devices currently cannot be handled individually */
768 if (is_vmd(pdev->bus))
772 /* VFs aren't listed in scope tables; we need to look up
773 * the PF instead to find the IOMMU. */
774 pf_pdev = pci_physfn(pdev);
776 segment = pci_domain_nr(pdev->bus);
777 } else if (has_acpi_companion(dev))
778 dev = &ACPI_COMPANION(dev)->dev;
781 for_each_active_iommu(iommu, drhd) {
782 if (pdev && segment != drhd->segment)
785 for_each_active_dev_scope(drhd->devices,
786 drhd->devices_cnt, i, tmp) {
788 /* For a VF use its original BDF# not that of the PF
789 * which we used for the IOMMU lookup. Strictly speaking
790 * we could do this for all PCI devices; we only need to
791 * get the BDF# from the scope table for ACPI matches. */
792 if (pdev && pdev->is_virtfn)
795 *bus = drhd->devices[i].bus;
796 *devfn = drhd->devices[i].devfn;
800 if (is_downstream_to_pci_bridge(dev, tmp))
804 if (pdev && drhd->include_all) {
806 *bus = pdev->bus->number;
807 *devfn = pdev->devfn;
818 static void domain_flush_cache(struct dmar_domain *domain,
819 void *addr, int size)
821 if (!domain->iommu_coherency)
822 clflush_cache_range(addr, size);
825 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
827 struct context_entry *context;
831 spin_lock_irqsave(&iommu->lock, flags);
832 context = iommu_context_addr(iommu, bus, devfn, 0);
834 ret = context_present(context);
835 spin_unlock_irqrestore(&iommu->lock, flags);
839 static void free_context_table(struct intel_iommu *iommu)
843 struct context_entry *context;
845 spin_lock_irqsave(&iommu->lock, flags);
846 if (!iommu->root_entry) {
849 for (i = 0; i < ROOT_ENTRY_NR; i++) {
850 context = iommu_context_addr(iommu, i, 0, 0);
852 free_pgtable_page(context);
854 if (!sm_supported(iommu))
857 context = iommu_context_addr(iommu, i, 0x80, 0);
859 free_pgtable_page(context);
862 free_pgtable_page(iommu->root_entry);
863 iommu->root_entry = NULL;
865 spin_unlock_irqrestore(&iommu->lock, flags);
868 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
869 unsigned long pfn, int *target_level)
871 struct dma_pte *parent, *pte;
872 int level = agaw_to_level(domain->agaw);
875 BUG_ON(!domain->pgd);
877 if (!domain_pfn_supported(domain, pfn))
878 /* Address beyond IOMMU's addressing capabilities. */
881 parent = domain->pgd;
886 offset = pfn_level_offset(pfn, level);
887 pte = &parent[offset];
888 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
890 if (level == *target_level)
893 if (!dma_pte_present(pte)) {
896 tmp_page = alloc_pgtable_page(domain->nid);
901 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
902 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
903 if (cmpxchg64(&pte->val, 0ULL, pteval))
904 /* Someone else set it while we were thinking; use theirs. */
905 free_pgtable_page(tmp_page);
907 domain_flush_cache(domain, pte, sizeof(*pte));
912 parent = phys_to_virt(dma_pte_addr(pte));
917 *target_level = level;
922 /* return address's pte at specific level */
923 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
925 int level, int *large_page)
927 struct dma_pte *parent, *pte;
928 int total = agaw_to_level(domain->agaw);
931 parent = domain->pgd;
932 while (level <= total) {
933 offset = pfn_level_offset(pfn, total);
934 pte = &parent[offset];
938 if (!dma_pte_present(pte)) {
943 if (dma_pte_superpage(pte)) {
948 parent = phys_to_virt(dma_pte_addr(pte));
954 /* clear last level pte, a tlb flush should be followed */
955 static void dma_pte_clear_range(struct dmar_domain *domain,
956 unsigned long start_pfn,
957 unsigned long last_pfn)
959 unsigned int large_page;
960 struct dma_pte *first_pte, *pte;
962 BUG_ON(!domain_pfn_supported(domain, start_pfn));
963 BUG_ON(!domain_pfn_supported(domain, last_pfn));
964 BUG_ON(start_pfn > last_pfn);
966 /* we don't need lock here; nobody else touches the iova range */
969 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
971 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
976 start_pfn += lvl_to_nr_pages(large_page);
978 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
980 domain_flush_cache(domain, first_pte,
981 (void *)pte - (void *)first_pte);
983 } while (start_pfn && start_pfn <= last_pfn);
986 static void dma_pte_free_level(struct dmar_domain *domain, int level,
987 int retain_level, struct dma_pte *pte,
988 unsigned long pfn, unsigned long start_pfn,
989 unsigned long last_pfn)
991 pfn = max(start_pfn, pfn);
992 pte = &pte[pfn_level_offset(pfn, level)];
995 unsigned long level_pfn;
996 struct dma_pte *level_pte;
998 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001 level_pfn = pfn & level_mask(level);
1002 level_pte = phys_to_virt(dma_pte_addr(pte));
1005 dma_pte_free_level(domain, level - 1, retain_level,
1006 level_pte, level_pfn, start_pfn,
1011 * Free the page table if we're below the level we want to
1012 * retain and the range covers the entire table.
1014 if (level < retain_level && !(start_pfn > level_pfn ||
1015 last_pfn < level_pfn + level_size(level) - 1)) {
1017 domain_flush_cache(domain, pte, sizeof(*pte));
1018 free_pgtable_page(level_pte);
1021 pfn += level_size(level);
1022 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1026 * clear last level (leaf) ptes and free page table pages below the
1027 * level we wish to keep intact.
1029 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1030 unsigned long start_pfn,
1031 unsigned long last_pfn,
1034 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1035 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1036 BUG_ON(start_pfn > last_pfn);
1038 dma_pte_clear_range(domain, start_pfn, last_pfn);
1040 /* We don't need lock here; nobody else touches the iova range */
1041 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1042 domain->pgd, 0, start_pfn, last_pfn);
1045 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1046 free_pgtable_page(domain->pgd);
1051 /* When a page at a given level is being unlinked from its parent, we don't
1052 need to *modify* it at all. All we need to do is make a list of all the
1053 pages which can be freed just as soon as we've flushed the IOTLB and we
1054 know the hardware page-walk will no longer touch them.
1055 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1057 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1058 int level, struct dma_pte *pte,
1059 struct page *freelist)
1063 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1064 pg->freelist = freelist;
1070 pte = page_address(pg);
1072 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1073 freelist = dma_pte_list_pagetables(domain, level - 1,
1076 } while (!first_pte_in_page(pte));
1081 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1082 struct dma_pte *pte, unsigned long pfn,
1083 unsigned long start_pfn,
1084 unsigned long last_pfn,
1085 struct page *freelist)
1087 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1089 pfn = max(start_pfn, pfn);
1090 pte = &pte[pfn_level_offset(pfn, level)];
1093 unsigned long level_pfn;
1095 if (!dma_pte_present(pte))
1098 level_pfn = pfn & level_mask(level);
1100 /* If range covers entire pagetable, free it */
1101 if (start_pfn <= level_pfn &&
1102 last_pfn >= level_pfn + level_size(level) - 1) {
1103 /* These suborbinate page tables are going away entirely. Don't
1104 bother to clear them; we're just going to *free* them. */
1105 if (level > 1 && !dma_pte_superpage(pte))
1106 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1112 } else if (level > 1) {
1113 /* Recurse down into a level that isn't *entirely* obsolete */
1114 freelist = dma_pte_clear_level(domain, level - 1,
1115 phys_to_virt(dma_pte_addr(pte)),
1116 level_pfn, start_pfn, last_pfn,
1120 pfn += level_size(level);
1121 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124 domain_flush_cache(domain, first_pte,
1125 (void *)++last_pte - (void *)first_pte);
1130 /* We can't just free the pages because the IOMMU may still be walking
1131 the page tables, and may have cached the intermediate levels. The
1132 pages can only be freed after the IOTLB flush has been done. */
1133 static struct page *domain_unmap(struct dmar_domain *domain,
1134 unsigned long start_pfn,
1135 unsigned long last_pfn)
1137 struct page *freelist;
1139 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1140 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1141 BUG_ON(start_pfn > last_pfn);
1143 /* we don't need lock here; nobody else touches the iova range */
1144 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1145 domain->pgd, 0, start_pfn, last_pfn, NULL);
1148 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1149 struct page *pgd_page = virt_to_page(domain->pgd);
1150 pgd_page->freelist = freelist;
1151 freelist = pgd_page;
1159 static void dma_free_pagelist(struct page *freelist)
1163 while ((pg = freelist)) {
1164 freelist = pg->freelist;
1165 free_pgtable_page(page_address(pg));
/* IOVA rcache callback: @data is a freelist head cast to unsigned long. */
static void iova_entry_free(unsigned long data)
{
	struct page *freelist = (struct page *)data;

	dma_free_pagelist(freelist);
}
1176 /* iommu handling */
1177 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1179 struct root_entry *root;
1180 unsigned long flags;
1182 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1184 pr_err("Allocating root entry for %s failed\n",
1189 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1191 spin_lock_irqsave(&iommu->lock, flags);
1192 iommu->root_entry = root;
1193 spin_unlock_irqrestore(&iommu->lock, flags);
1198 static void iommu_set_root_entry(struct intel_iommu *iommu)
1204 addr = virt_to_phys(iommu->root_entry);
1205 if (sm_supported(iommu))
1206 addr |= DMA_RTADDR_SMT;
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1211 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1213 /* Make sure hardware complete it */
1214 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1215 readl, (sts & DMA_GSTS_RTPS), sts);
1217 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1225 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1231 /* Make sure hardware complete it */
1232 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233 readl, (!(val & DMA_GSTS_WBFS)), val);
1235 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 /* return value determine if we need a write buffer flush */
1239 static void __iommu_flush_context(struct intel_iommu *iommu,
1240 u16 did, u16 source_id, u8 function_mask,
1247 case DMA_CCMD_GLOBAL_INVL:
1248 val = DMA_CCMD_GLOBAL_INVL;
1250 case DMA_CCMD_DOMAIN_INVL:
1251 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1253 case DMA_CCMD_DEVICE_INVL:
1254 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1255 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1260 val |= DMA_CCMD_ICC;
1262 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1263 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1265 /* Make sure hardware complete it */
1266 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1267 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 /* return value determine if we need a write buffer flush */
1273 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1274 u64 addr, unsigned int size_order, u64 type)
1276 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1277 u64 val = 0, val_iva = 0;
1281 case DMA_TLB_GLOBAL_FLUSH:
1282 /* global flush doesn't need set IVA_REG */
1283 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1285 case DMA_TLB_DSI_FLUSH:
1286 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 case DMA_TLB_PSI_FLUSH:
1289 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 /* IH bit is passed in as part of address */
1291 val_iva = size_order | addr;
1296 /* Note: set drain read/write */
1299 * This is probably to be super secure.. Looks like we can
1300 * ignore it without any impact.
1302 if (cap_read_drain(iommu->cap))
1303 val |= DMA_TLB_READ_DRAIN;
1305 if (cap_write_drain(iommu->cap))
1306 val |= DMA_TLB_WRITE_DRAIN;
1308 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1309 /* Note: Only uses first TLB reg currently */
1311 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1312 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1314 /* Make sure hardware complete it */
1315 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1316 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1320 /* check IOTLB invalidation granularity */
1321 if (DMA_TLB_IAIG(val) == 0)
1322 pr_err("Flush IOTLB failed\n");
1323 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1324 pr_debug("TLB flush request %Lx, actual %Lx\n",
1325 (unsigned long long)DMA_TLB_IIRG(type),
1326 (unsigned long long)DMA_TLB_IAIG(val));
1329 static struct device_domain_info *
1330 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1333 struct device_domain_info *info;
1335 assert_spin_locked(&device_domain_lock);
1340 list_for_each_entry(info, &domain->devices, link)
1341 if (info->iommu == iommu && info->bus == bus &&
1342 info->devfn == devfn) {
1343 if (info->ats_supported && info->dev)
/*
 * Recompute domain->has_iotlb_device: set it true iff at least one PCI
 * device attached to @domain currently has ATS enabled. This flag lets
 * iommu_flush_dev_iotlb() skip the device list walk entirely when no
 * device IOTLBs exist. Caller must hold device_domain_lock.
 */
1351 static void domain_update_iotlb(struct dmar_domain *domain)
1353 struct device_domain_info *info;
1354 bool has_iotlb_device = false;
1356 assert_spin_locked(&device_domain_lock);
1358 list_for_each_entry(info, &domain->devices, link) {
1359 struct pci_dev *pdev;
/* Non-PCI devices (or entries without a struct device) cannot have ATS. */
1361 if (!info->dev || !dev_is_pci(info->dev))
1364 pdev = to_pci_dev(info->dev);
1365 if (pdev->ats_enabled) {
1366 has_iotlb_device = true;
1371 domain->has_iotlb_device = has_iotlb_device;
/*
 * Enable per-device DMA features for @info's PCI device: PASID and PRI
 * (when CONFIG_INTEL_IOMMU_SVM), then ATS. Records the PFSID when the
 * IOMMU supports Device-TLB Invalidation Throttling (DIT).
 * Caller must hold device_domain_lock.
 */
1374 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1376 struct pci_dev *pdev;
1378 assert_spin_locked(&device_domain_lock);
1380 if (!info || !dev_is_pci(info->dev))
1383 pdev = to_pci_dev(info->dev);
1384 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1385 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1386 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1387 * reserved, which should be set to 0.
1389 if (!ecap_dit(info->iommu->ecap))
1392 struct pci_dev *pf_pdev;
1394 /* pdev will be returned if device is not a vf */
1395 pf_pdev = pci_physfn(pdev);
1396 info->pfsid = pci_dev_id(pf_pdev);
1399 #ifdef CONFIG_INTEL_IOMMU_SVM
1400 /* The PCIe spec, in its wisdom, declares that the behaviour of
1401 the device if you enable PASID support after ATS support is
1402 undefined. So always enable PASID support on devices which
1403 have it, even if we can't yet know if we're ever going to
1405 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1406 info->pasid_enabled = 1;
/* PRI requires PRG-response-PASID support when PASID is enabled. */
1408 if (info->pri_supported &&
1409 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1410 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1411 info->pri_enabled = 1;
/* ATS is only enabled for trusted devices with page-aligned requests. */
1413 if (!pdev->untrusted && info->ats_supported &&
1414 pci_ats_page_aligned(pdev) &&
1415 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1416 info->ats_enabled = 1;
1417 domain_update_iotlb(info->domain);
1418 info->ats_qdep = pci_ats_queue_depth(pdev);
/*
 * Reverse of iommu_enable_dev_iotlb(): disable ATS, then PRI and PASID
 * (when CONFIG_INTEL_IOMMU_SVM) on @info's PCI device and clear the
 * corresponding flags. Caller must hold device_domain_lock.
 */
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1426 struct pci_dev *pdev;
1428 assert_spin_locked(&device_domain_lock);
1431 if (!dev_is_pci(info->dev))
1433 pdev = to_pci_dev(info->dev);
1434 if (info->ats_enabled) {
1435 pci_disable_ats(pdev);
1436 info->ats_enabled = 0;
/* Re-evaluate domain->has_iotlb_device now that ATS is off. */
1438 domain_update_iotlb(info->domain);
1439 #ifdef CONFIG_INTEL_IOMMU_SVM
1440 if (info->pri_enabled) {
1441 pci_disable_pri(pdev);
1443 info->pri_enabled = 0;
1444 if (info->pasid_enabled) {
1445 pci_disable_pasid(pdev);
1450 info->pasid_enabled = 0;
/*
 * Issue queued-invalidation Device-TLB flushes for every ATS-enabled
 * device attached to @domain, covering @addr with the given @mask
 * (log2 of the page count). Fast-exits when the domain is known to
 * have no device IOTLBs.
 */
1450 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1451 u64 addr, unsigned mask)
1454 unsigned long flags;
1455 struct device_domain_info *info;
1457 if (!domain->has_iotlb_device)
1460 spin_lock_irqsave(&device_domain_lock, flags);
1461 list_for_each_entry(info, &domain->devices, link) {
1462 if (!info->ats_enabled)
/* Source-ID is bus number in the high byte, devfn in the low byte. */
1465 sid = info->bus << 8 | info->devfn;
1466 qdep = info->ats_qdep;
1467 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Flush the IOTLB for @pages pages starting at @pfn in @domain on
 * @iommu. Uses a page-selective invalidation (PSI) when the hardware
 * supports it and the rounded-up mask fits, otherwise falls back to a
 * domain-selective flush. Also flushes device IOTLBs except for
 * caching-mode map (non-present -> present) updates, which devices
 * never cached.
 */
1473 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1474 struct dmar_domain *domain,
1475 unsigned long pfn, unsigned int pages,
/* PSI needs a power-of-two, naturally aligned region; round up. */
1478 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1479 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1480 u16 did = domain->iommu_did[iommu->seq_id];
1487 * Fallback to domain selective flush if no PSI support or the size is
1489 * PSI requires page size to be 2 ^ x, and the base address is naturally
1490 * aligned to the size
1492 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1493 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1500 * In caching mode, changes of pages from non-present to present require
1501 * flush. However, device IOTLB doesn't need to be flushed in this case.
1503 if (!cap_caching_mode(iommu->cap) || !map)
1504 iommu_flush_dev_iotlb(domain, addr, mask);
1507 /* Notification for newly created mappings */
1508 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1509 struct dmar_domain *domain,
1510 unsigned long pfn, unsigned int pages)
1512 /* It's a non-present to present mapping. Only flush if caching mode */
1513 if (cap_caching_mode(iommu->cap))
1514 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
/* Without caching mode, flushing the write buffer is sufficient. */
1516 iommu_flush_write_buffer(iommu);
/*
 * IOVA flush-queue callback: when deferred IOVA frees are drained,
 * perform a domain-selective IOTLB flush on every IOMMU the domain is
 * attached to, plus a full-width device-IOTLB flush when the IOMMU is
 * not in caching mode.
 */
1519 static void iommu_flush_iova(struct iova_domain *iovad)
1521 struct dmar_domain *domain;
/* The iova_domain is embedded in dmar_domain; recover the container. */
1524 domain = container_of(iovad, struct dmar_domain, iovad);
1526 for_each_domain_iommu(idx, domain) {
1527 struct intel_iommu *iommu = g_iommus[idx];
1528 u16 did = domain->iommu_did[iommu->seq_id];
1530 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1532 if (!cap_caching_mode(iommu->cap))
1533 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1534 0, MAX_AGAW_PFN_WIDTH);
/*
 * Clear the Enable Protected Memory bit (EPM) in the PMEN register and
 * wait for the Protected Region Status (PRS) bit to clear, disabling
 * the firmware-configured protected low/high memory regions. No-op if
 * the hardware advertises neither PLMR nor PHMR capability.
 */
1538 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 unsigned long flags;
1543 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1547 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1548 pmen &= ~DMA_PMEN_EPM;
1549 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1551 /* wait for the protected region status bit to clear */
1552 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1553 readl, !(pmen & DMA_PMEN_PRS), pmen);
1555 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Turn on DMA remapping: set the Translation Enable bit in the global
 * command register and spin (under register_lock) until the status
 * register confirms translation is active.
 */
1558 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 unsigned long flags;
1563 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1564 iommu->gcmd |= DMA_GCMD_TE;
1565 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1567 /* Make sure hardware complete it */
1568 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1569 readl, (sts & DMA_GSTS_TES), sts);
1571 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Turn off DMA remapping: clear the Translation Enable bit and wait
 * for the hardware to acknowledge via the global status register.
 * Mirror image of iommu_enable_translation().
 */
1574 static void iommu_disable_translation(struct intel_iommu *iommu)
1579 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1580 iommu->gcmd &= ~DMA_GCMD_TE;
1581 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1583 /* Make sure hardware complete it */
1584 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1585 readl, (!(sts & DMA_GSTS_TES)), sts);
1587 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level
 * iommu->domains pointer array (groups of 256 dmar_domain pointers,
 * with only group 0 pre-allocated). Reserves domain-id 0 and, in
 * scalable mode, FLPT_DEFAULT_DID. Returns 0 on success, or an error
 * on allocation failure (cleanup path partially elided in this view).
 */
1590 static int iommu_init_domains(struct intel_iommu *iommu)
1592 u32 ndomains, nlongs;
1595 ndomains = cap_ndoms(iommu->cap);
1596 pr_debug("%s: Number of Domains supported <%d>\n",
1597 iommu->name, ndomains);
1598 nlongs = BITS_TO_LONGS(ndomains);
1600 spin_lock_init(&iommu->lock);
1602 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1603 if (!iommu->domain_ids) {
1604 pr_err("%s: Allocating domain id array failed\n",
/* First level: one pointer per group of 256 domain ids. */
1609 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1610 iommu->domains = kzalloc(size, GFP_KERNEL);
1612 if (iommu->domains) {
1613 size = 256 * sizeof(struct dmar_domain *);
1614 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617 if (!iommu->domains || !iommu->domains[0]) {
1618 pr_err("%s: Allocating domain array failed\n",
1620 kfree(iommu->domain_ids);
1621 kfree(iommu->domains);
1622 iommu->domain_ids = NULL;
1623 iommu->domains = NULL;
1628 * If Caching mode is set, then invalid translations are tagged
1629 * with domain-id 0, hence we need to pre-allocate it. We also
1630 * use domain-id 0 as a marker for non-allocated domain-id, so
1631 * make sure it is not used for a real domain.
1633 set_bit(0, iommu->domain_ids);
1636 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1637 * entry for first-level or pass-through translation modes should
1638 * be programmed with a domain id different from those used for
1639 * second-level or nested translation. We reserve a domain id for
1642 if (sm_supported(iommu))
1643 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
/*
 * Detach all devices whose device_domain_info references @iommu (by
 * walking the global device list under device_domain_lock), then
 * disable translation if it was enabled. Part of IOMMU teardown.
 */
1648 static void disable_dmar_iommu(struct intel_iommu *iommu)
1650 struct device_domain_info *info, *tmp;
1651 unsigned long flags;
/* Nothing to do if iommu_init_domains() never ran (or failed). */
1653 if (!iommu->domains || !iommu->domain_ids)
1656 spin_lock_irqsave(&device_domain_lock, flags);
1657 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1658 if (info->iommu != iommu)
1661 if (!info->dev || !info->domain)
1664 __dmar_remove_one_dev_info(info);
1666 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 if (iommu->gcmd & DMA_GCMD_TE)
1669 iommu_disable_translation(iommu);
/*
 * Final per-IOMMU cleanup: free the two-level domains array and the
 * domain-id bitmap, drop the g_iommus[] slot, free the context table,
 * and finish the SVM page-request queue when it was set up.
 */
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1674 if ((iommu->domains) && (iommu->domain_ids)) {
/* Number of 256-entry groups allocated by iommu_init_domains(). */
1675 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678 for (i = 0; i < elems; i++)
1679 kfree(iommu->domains[i]);
1680 kfree(iommu->domains);
1681 kfree(iommu->domain_ids);
1682 iommu->domains = NULL;
1683 iommu->domain_ids = NULL;
1686 g_iommus[iommu->seq_id] = NULL;
1688 /* free context mapping */
1689 free_context_table(iommu);
1691 #ifdef CONFIG_INTEL_IOMMU_SVM
1692 if (pasid_supported(iommu)) {
1693 if (ecap_prs(iommu->ecap))
1694 intel_svm_finish_prq(iommu);
/*
 * Allocate and zero a dmar_domain, initialising its NUMA node to
 * NUMA_NO_NODE, its flags, and an empty device list. Returns the new
 * domain (NULL-check path elided in this view).
 */
1699 static struct dmar_domain *alloc_domain(int flags)
1701 struct dmar_domain *domain;
1703 domain = alloc_domain_mem();
1707 memset(domain, 0, sizeof(*domain));
1708 domain->nid = NUMA_NO_NODE;
1709 domain->flags = flags;
1710 domain->has_iotlb_device = false;
1711 INIT_LIST_HEAD(&domain->devices);
1716 /* Must be called with iommu->lock */
/*
 * Take a reference of @domain on @iommu. On the first attach to this
 * IOMMU, allocate a free domain id from the bitmap, record it in
 * iommu_did[], inherit the IOMMU's NUMA node, and refresh the domain's
 * aggregate capabilities. Undoes the refcount bump on id exhaustion.
 * Caller must hold both device_domain_lock and iommu->lock.
 */
1717 static int domain_attach_iommu(struct dmar_domain *domain,
1718 struct intel_iommu *iommu)
1720 unsigned long ndomains;
1723 assert_spin_locked(&device_domain_lock);
1724 assert_spin_locked(&iommu->lock);
1726 domain->iommu_refcnt[iommu->seq_id] += 1;
1727 domain->iommu_count += 1;
1728 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1729 ndomains = cap_ndoms(iommu->cap);
1730 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1732 if (num >= ndomains) {
1733 pr_err("%s: No free domain ids\n", iommu->name);
/* Roll back the refcounts taken above before failing. */
1734 domain->iommu_refcnt[iommu->seq_id] -= 1;
1735 domain->iommu_count -= 1;
1739 set_bit(num, iommu->domain_ids);
1740 set_iommu_domain(iommu, num, domain);
1742 domain->iommu_did[iommu->seq_id] = num;
1743 domain->nid = iommu->node;
1745 domain_update_iommu_cap(domain);
/*
 * Drop a reference of @domain on @iommu; on the last detach release
 * the domain id back to the bitmap, clear the reverse mapping, and
 * refresh the domain's aggregate capabilities. Returns the remaining
 * total iommu_count. Caller must hold device_domain_lock and
 * iommu->lock.
 */
1751 static int domain_detach_iommu(struct dmar_domain *domain,
1752 struct intel_iommu *iommu)
1756 assert_spin_locked(&device_domain_lock);
1757 assert_spin_locked(&iommu->lock);
1759 domain->iommu_refcnt[iommu->seq_id] -= 1;
1760 count = --domain->iommu_count;
1761 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1762 num = domain->iommu_did[iommu->seq_id];
1763 clear_bit(num, iommu->domain_ids);
1764 set_iommu_domain(iommu, num, NULL);
1766 domain_update_iommu_cap(domain);
1767 domain->iommu_did[iommu->seq_id] = 0;
/* Template IOVA domain holding ranges every DMA domain must avoid. */
1773 static struct iova_domain reserved_iova_list;
1774 static struct lock_class_key reserved_rbtree_key;
/*
 * Populate reserved_iova_list with the IOAPIC MMIO window and every
 * PCI device's MMIO resources, so later domains (via
 * domain_reserve_special_ranges()) never hand those addresses out for
 * DMA. The dedicated lockdep class keeps this rbtree lock distinct
 * from per-domain iova locks.
 */
1776 static int dmar_init_reserved_ranges(void)
1778 struct pci_dev *pdev = NULL;
1782 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1784 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1785 &reserved_rbtree_key);
1787 /* IOAPIC ranges shouldn't be accessed by DMA */
1788 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1789 IOVA_PFN(IOAPIC_RANGE_END));
1791 pr_err("Reserve IOAPIC range failed\n");
1795 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1796 for_each_pci_dev(pdev) {
1799 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1800 r = &pdev->resource[i];
1801 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1803 iova = reserve_iova(&reserved_iova_list,
1807 pci_err(pdev, "Reserve iova for %pR failed\n", r);
/* Copy the global reserved ranges (IOAPIC, PCI MMIO) into @domain. */
1815 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1817 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Round a guest address width up so that (width - 12) is a multiple of
 * 9, i.e. it maps onto whole 9-bit page-table levels above the 4KiB
 * page offset. (Return statements elided in this view.)
 */
1820 static inline int guestwidth_to_adjustwidth(int gaw)
1823 int r = (gaw - 12) % 9;
/*
 * Initialise a freshly allocated DMA domain against @iommu: set up its
 * IOVA allocator and flush queue, copy reserved ranges, compute the
 * adjusted guest address width / AGAW from hardware capabilities,
 * derive coherency/snooping/superpage support, and allocate the root
 * page directory. Returns 0 on success (error paths partly elided).
 */
1834 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837 int adjust_width, agaw;
1838 unsigned long sagaw;
1841 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1843 err = init_iova_flush_queue(&domain->iovad,
1844 iommu_flush_iova, iova_entry_free);
1848 domain_reserve_special_ranges(domain);
1850 /* calculate AGAW */
1851 if (guest_width > cap_mgaw(iommu->cap))
1852 guest_width = cap_mgaw(iommu->cap);
1853 domain->gaw = guest_width;
1854 adjust_width = guestwidth_to_adjustwidth(guest_width);
1855 agaw = width_to_agaw(adjust_width);
1856 sagaw = cap_sagaw(iommu->cap);
1857 if (!test_bit(agaw, &sagaw)) {
1858 /* hardware doesn't support it, choose a bigger one */
1859 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1860 agaw = find_next_bit(&sagaw, 5, agaw);
1864 domain->agaw = agaw;
1866 if (ecap_coherent(iommu->ecap))
1867 domain->iommu_coherency = 1;
1869 domain->iommu_coherency = 0;
1871 if (ecap_sc_support(iommu->ecap))
1872 domain->iommu_snooping = 1;
1874 domain->iommu_snooping = 0;
/* Superpage level: highest set bit of the hardware's SP support mask. */
1876 if (intel_iommu_superpage)
1877 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1879 domain->iommu_superpage = 0;
1881 domain->nid = iommu->node;
1883 /* always allocate the top pgd */
1884 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * Tear down a DMA domain: detach all devices, release its IOVA
 * allocator, unmap and free the whole page-table hierarchy, then free
 * the domain itself.
 */
1891 static void domain_exit(struct dmar_domain *domain)
1894 /* Remove associated devices and clear attached or cached domains */
1895 domain_remove_dev_info(domain);
1898 put_iova_domain(&domain->iovad);
1901 struct page *freelist;
/* Unmap the full address range; freed table pages come back as a list. */
1903 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1904 dma_free_pagelist(freelist);
1907 free_domain_mem(domain);
1911 * Get the PASID directory size for scalable mode context entry.
1912 * Value of X in the PDTS field of a scalable mode context entry
1913 * indicates PASID directory with 2^(X + 7) entries.
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
/* Highest PASID-directory entry index needed for table->max_pasid. */
1919 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1928 * Set the RID_PASID field of a scalable mode context entry. The
1929 * IOMMU hardware will use the PASID value set in this field for
1930 * DMA translations of DMA requests without PASID.
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
/* RID_PASID occupies the low 20 bits of context->hi; bit 20 is set too. */
1935 context->hi |= pasid & ((1 << 20) - 1);
1936 context->hi |= (1 << 20);
1940 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1943 static inline void context_set_sm_dte(struct context_entry *context)
/* DTE is bit 2 of the low quadword. */
1945 context->lo |= (1 << 2);
1949 * Set the PRE(Page Request Enable) field of a scalable mode context
1952 static inline void context_set_sm_pre(struct context_entry *context)
/* PRE is bit 4 of the low quadword. */
1954 context->lo |= (1 << 4);
1957 /* Convert value to context PASID directory size field coding. */
/* The 3-bit PDTS field sits at bits 11:9 of the context entry. */
1958 #define context_pdts(pds) (((pds) & 0x7) << 9)
/*
 * Program the context entry for (bus, devfn) on @iommu so the device
 * translates through @domain. Handles both scalable-mode entries
 * (PASID directory pointer, RID2PASID, DTE/PRE bits) and legacy
 * entries (second-level pgd, AGAW, translation type), flushes caches
 * as required by caching mode, and enables per-device ATS/PRI/PASID.
 * Returns 0 on success (some early-exit/error lines elided here).
 */
1960 static int domain_context_mapping_one(struct dmar_domain *domain,
1961 struct intel_iommu *iommu,
1962 struct pasid_table *table,
1965 u16 did = domain->iommu_did[iommu->seq_id];
1966 int translation = CONTEXT_TT_MULTI_LEVEL;
1967 struct device_domain_info *info = NULL;
1968 struct context_entry *context;
1969 unsigned long flags;
1974 if (hw_pass_through && domain_type_is_si(domain))
1975 translation = CONTEXT_TT_PASS_THROUGH;
1977 pr_debug("Set context mapping for %02x:%02x.%d\n",
1978 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1980 BUG_ON(!domain->pgd);
1982 spin_lock_irqsave(&device_domain_lock, flags);
1983 spin_lock(&iommu->lock);
1986 context = iommu_context_addr(iommu, bus, devfn, 1);
1991 if (context_present(context))
1995 * For kdump cases, old valid entries may be cached due to the
1996 * in-flight DMA and copied pgtable, but there is no unmapping
1997 * behaviour for them, thus we need an explicit cache flush for
1998 * the newly-mapped device. For kdump, at this point, the device
1999 * is supposed to finish reset at its driver probe stage, so no
2000 * in-flight DMA will exist, and we don't need to worry anymore
2003 if (context_copied(context)) {
2004 u16 did_old = context_domain_id(context);
2006 if (did_old < cap_ndoms(iommu->cap)) {
2007 iommu->flush.flush_context(iommu, did_old,
2008 (((u16)bus) << 8) | devfn,
2009 DMA_CCMD_MASK_NOBIT,
2010 DMA_CCMD_DEVICE_INVL);
2011 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2016 context_clear_entry(context);
2018 if (sm_supported(iommu)) {
2023 /* Setup the PASID DIR pointer: */
2024 pds = context_get_sm_pds(table);
2025 context->lo = (u64)virt_to_phys(table->table) |
2028 /* Setup the RID_PASID field: */
2029 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032 * Setup the Device-TLB enable bit and Page request
2035 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2036 if (info && info->ats_supported)
2037 context_set_sm_dte(context);
2038 if (info && info->pri_supported)
2039 context_set_sm_pre(context);
2041 struct dma_pte *pgd = domain->pgd;
2044 context_set_domain_id(context, did);
2046 if (translation != CONTEXT_TT_PASS_THROUGH) {
2048 * Skip top levels of page tables for iommu which has
2049 * less agaw than default. Unnecessary for PT mode.
2051 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2053 pgd = phys_to_virt(dma_pte_addr(pgd));
2054 if (!dma_pte_present(pgd))
2058 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059 if (info && info->ats_supported)
2060 translation = CONTEXT_TT_DEV_IOTLB;
2062 translation = CONTEXT_TT_MULTI_LEVEL;
2064 context_set_address_root(context, virt_to_phys(pgd));
2065 context_set_address_width(context, agaw);
2068 * In pass through mode, AW must be programmed to
2069 * indicate the largest AGAW value supported by
2070 * hardware. And ASR is ignored by hardware.
2072 context_set_address_width(context, iommu->msagaw);
2075 context_set_translation_type(context, translation);
2078 context_set_fault_enable(context);
2079 context_set_present(context);
2080 domain_flush_cache(domain, context, sizeof(*context));
2083 * It's a non-present to present mapping. If hardware doesn't cache
2084 * non-present entry we only need to flush the write-buffer. If the
2085 * hardware _does_ cache non-present entries, then it does so in the
2086 * special domain #0, which we have to flush:
2088 if (cap_caching_mode(iommu->cap)) {
2089 iommu->flush.flush_context(iommu, 0,
2090 (((u16)bus) << 8) | devfn,
2091 DMA_CCMD_MASK_NOBIT,
2092 DMA_CCMD_DEVICE_INVL);
2093 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2095 iommu_flush_write_buffer(iommu);
2097 iommu_enable_dev_iotlb(info);
2102 spin_unlock(&iommu->lock);
2103 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Resolve @dev's IOMMU and (bus, devfn), fetch its PASID table, and
 * program the context entry via domain_context_mapping_one().
 */
2109 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2111 struct pasid_table *table;
2112 struct intel_iommu *iommu;
2115 iommu = device_to_iommu(dev, &bus, &devfn);
2119 table = intel_pasid_get_table(dev);
2120 return domain_context_mapping_one(domain, iommu, table, bus, devfn);
/*
 * pci_for_each_dma_alias() callback: return non-zero (stop the walk)
 * when an alias is NOT context-mapped, so the caller's negation yields
 * "all aliases mapped".
 */
2123 static int domain_context_mapped_cb(struct pci_dev *pdev,
2124 u16 alias, void *opaque)
2126 struct intel_iommu *iommu = opaque;
2128 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * Return whether @dev (and, for PCI, every DMA alias of it) already
 * has a present context entry on its IOMMU.
 */
2131 static int domain_context_mapped(struct device *dev)
2133 struct intel_iommu *iommu;
2136 iommu = device_to_iommu(dev, &bus, &devfn);
2140 if (!dev_is_pci(dev))
2141 return device_context_mapped(iommu, bus, devfn);
2143 return !pci_for_each_dma_alias(to_pci_dev(dev),
2144 domain_context_mapped_cb, iommu);
2147 /* Returns a number of VTD pages, but aligned to MM page size */
2148 static inline unsigned long aligned_nrpages(unsigned long host_addr,
/* Keep only the sub-page offset, then round the total up to MM pages. */
2151 host_addr &= ~PAGE_MASK;
2152 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2155 /* Return largest possible superpage level for a given mapping */
2156 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2157 unsigned long iov_pfn,
2158 unsigned long phy_pfn,
2159 unsigned long pages)
2161 int support, level = 1;
2162 unsigned long pfnmerge;
2164 support = domain->iommu_superpage;
2166 /* To use a large page, the virtual *and* physical addresses
2167 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2168 of them will mean we have to use smaller pages. So just
2169 merge them and check both at once. */
2170 pfnmerge = iov_pfn | phy_pfn;
/* Climb one superpage level per 9-bit stride while alignment and
 * remaining page count allow it and hardware still supports it. */
2172 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2173 pages >>= VTD_STRIDE_SHIFT;
2176 pfnmerge >>= VTD_STRIDE_SHIFT;
/*
 * Core page-table writer: map @nr_pages starting at @iov_pfn either
 * from a scatterlist (@sg != NULL) or from a contiguous physical run
 * starting at @phys_pfn. Chooses the largest superpage level the
 * alignment permits, frees any stale small-page tables that a new
 * superpage would shadow, installs PTEs with cmpxchg (no lock needed;
 * the IOVA range is owned by the caller), and flushes CPU cache for
 * each filled PTE page. @prot must contain at least read or write.
 */
2183 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184 struct scatterlist *sg, unsigned long phys_pfn,
2185 unsigned long nr_pages, int prot)
2187 struct dma_pte *first_pte = NULL, *pte = NULL;
2188 phys_addr_t uninitialized_var(pteval);
2189 unsigned long sg_res = 0;
2190 unsigned int largepage_lvl = 0;
2191 unsigned long lvl_pages = 0;
2193 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2195 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2198 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2202 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2205 while (nr_pages > 0) {
/* Advance to the next scatterlist entry and record its DMA address. */
2209 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2211 sg_res = aligned_nrpages(sg->offset, sg->length);
2212 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2213 sg->dma_length = sg->length;
2214 pteval = (sg_phys(sg) - pgoff) | prot;
2215 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2219 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2221 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2224 /* It is large page*/
2225 if (largepage_lvl > 1) {
2226 unsigned long nr_superpages, end_pfn;
2228 pteval |= DMA_PTE_LARGE_PAGE;
2229 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2231 nr_superpages = sg_res / lvl_pages;
2232 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2235 * Ensure that old small page tables are
2236 * removed to make room for superpage(s).
2237 * We're adding new large pages, so make sure
2238 * we don't remove their parent tables.
2240 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2243 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2247 /* We don't need lock here, nobody else
2248 * touches the iova range
2250 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
/* A non-zero old value means the PTE was already set — report it
 * (rate-limited to 5 dumps) rather than silently overwrite. */
2252 static int dumps = 5;
2253 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2254 iov_pfn, tmp, (unsigned long long)pteval);
2257 debug_dma_dump_mappings(NULL);
2262 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2264 BUG_ON(nr_pages < lvl_pages);
2265 BUG_ON(sg_res < lvl_pages);
2267 nr_pages -= lvl_pages;
2268 iov_pfn += lvl_pages;
2269 phys_pfn += lvl_pages;
2270 pteval += lvl_pages * VTD_PAGE_SIZE;
2271 sg_res -= lvl_pages;
2273 /* If the next PTE would be the first in a new page, then we
2274 need to flush the cache on the entries we've just written.
2275 And then we'll need to recalculate 'pte', so clear it and
2276 let it get set again in the if (!pte) block above.
2278 If we're done (!nr_pages) we need to flush the cache too.
2280 Also if we've been setting superpages, we may need to
2281 recalculate 'pte' and switch back to smaller pages for the
2282 end of the mapping, if the trailing size is not enough to
2283 use another superpage (i.e. sg_res < lvl_pages). */
2285 if (!nr_pages || first_pte_in_page(pte) ||
2286 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2287 domain_flush_cache(domain, first_pte,
2288 (void *)pte - (void *)first_pte);
/* Current sg entry exhausted but pages remain: move to the next one. */
2292 if (!sg_res && nr_pages)
/*
 * Map the range via __domain_mapping(), then notify every IOMMU the
 * domain is attached to about the new (non-present -> present)
 * mapping so caching-mode hardware gets flushed.
 */
2298 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299 struct scatterlist *sg, unsigned long phys_pfn,
2300 unsigned long nr_pages, int prot)
2303 struct intel_iommu *iommu;
2305 /* Do the real mapping first */
2306 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2310 for_each_domain_iommu(iommu_id, domain) {
2311 iommu = g_iommus[iommu_id];
2312 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
/* Scatterlist convenience wrapper around domain_mapping(). */
2318 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2319 struct scatterlist *sg, unsigned long nr_pages,
2322 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/* Contiguous-PFN convenience wrapper around domain_mapping(). */
2325 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326 unsigned long phys_pfn, unsigned long nr_pages,
2329 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * Tear down the context entry for (bus, devfn) on @iommu: clear the
 * entry under iommu->lock, flush the CPU cache line, then issue
 * device-selective context-cache and IOTLB invalidations for the old
 * domain id.
 */
2332 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2334 unsigned long flags;
2335 struct context_entry *context;
2341 spin_lock_irqsave(&iommu->lock, flags);
2342 context = iommu_context_addr(iommu, bus, devfn, 0);
/* No context table for this device: nothing to clear. */
2344 spin_unlock_irqrestore(&iommu->lock, flags);
2347 did_old = context_domain_id(context);
2348 context_clear_entry(context);
2349 __iommu_flush_cache(iommu, context, sizeof(*context));
2350 spin_unlock_irqrestore(&iommu->lock, flags);
2351 iommu->flush.flush_context(iommu,
2353 (((u16)bus) << 8) | devfn,
2354 DMA_CCMD_MASK_NOBIT,
2355 DMA_CCMD_DEVICE_INVL);
2356 iommu->flush.flush_iotlb(iommu,
/*
 * Remove @info from both the domain's device list and the global
 * device list, and clear the device's archdata back-pointer.
 * Caller must hold device_domain_lock.
 */
2363 static inline void unlink_domain_info(struct device_domain_info *info)
2365 assert_spin_locked(&device_domain_lock);
2366 list_del(&info->link);
2367 list_del(&info->global);
2369 info->dev->archdata.iommu = NULL;
/* Detach every device currently attached to @domain. */
2372 static void domain_remove_dev_info(struct dmar_domain *domain)
2374 struct device_domain_info *info, *tmp;
2375 unsigned long flags;
2377 spin_lock_irqsave(&device_domain_lock, flags);
2378 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2379 __dmar_remove_one_dev_info(info);
2380 spin_unlock_irqrestore(&device_domain_lock, flags);
2385 * Note: we use struct device->archdata.iommu stores the info
2387 static struct dmar_domain *find_domain(struct device *dev)
2389 struct device_domain_info *info;
/* Deferred attachment: resolve the default domain now and attach. */
2391 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2392 struct iommu_domain *domain;
2394 dev->archdata.iommu = NULL;
2395 domain = iommu_get_domain_for_dev(dev);
2397 intel_iommu_attach_device(domain, dev);
2400 /* No lock here, assumes no domain exit in normal case */
2401 info = dev->archdata.iommu;
2404 return info->domain;
/*
 * Linear search of the global device list for the entry matching
 * (segment, bus, devfn). Caller is expected to hold
 * device_domain_lock. (Return lines elided in this view.)
 */
2408 static inline struct device_domain_info *
2409 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2411 struct device_domain_info *info;
2413 list_for_each_entry(info, &device_domain_list, global)
2414 if (info->iommu->segment == segment && info->bus == bus &&
2415 info->devfn == devfn)
/*
 * Allocate and register a device_domain_info binding @dev (or a DMA
 * alias when @dev is NULL) at (bus, devfn) to @domain on @iommu.
 * Probes ATS/PASID/PRI support for PCI devices, attaches the domain
 * to the IOMMU, allocates the PASID table and sets up the RID2PASID
 * entry in scalable mode, and finally programs the context mapping.
 * If another thread raced and a domain already exists for the device,
 * the existing domain is returned and the new info is freed — the
 * caller must then free its candidate domain.
 */
2421 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2424 struct dmar_domain *domain)
2426 struct dmar_domain *found = NULL;
2427 struct device_domain_info *info;
2428 unsigned long flags;
2431 info = alloc_devinfo_mem();
2436 info->devfn = devfn;
2437 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2438 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2441 info->domain = domain;
2442 info->iommu = iommu;
2443 info->pasid_table = NULL;
2444 info->auxd_enabled = 0;
2445 INIT_LIST_HEAD(&info->auxiliary_domains);
2447 if (dev && dev_is_pci(dev)) {
2448 struct pci_dev *pdev = to_pci_dev(info->dev);
/* ATS needs: trusted device, ATS not globally disabled, IOMMU
 * Device-TLB support, the PCI ATS capability, and a matching ATSR. */
2450 if (!pdev->untrusted &&
2451 !pci_ats_disabled() &&
2452 ecap_dev_iotlb_support(iommu->ecap) &&
2453 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2454 dmar_find_matched_atsr_unit(pdev))
2455 info->ats_supported = 1;
2457 if (sm_supported(iommu)) {
2458 if (pasid_supported(iommu)) {
2459 int features = pci_pasid_features(pdev);
2461 info->pasid_supported = features | 1;
2464 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2465 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2466 info->pri_supported = 1;
2470 spin_lock_irqsave(&device_domain_lock, flags);
2472 found = find_domain(dev);
/* No domain via the device itself: check by (segment, bus, devfn). */
2475 struct device_domain_info *info2;
2476 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2478 found = info2->domain;
2484 spin_unlock_irqrestore(&device_domain_lock, flags);
2485 free_devinfo_mem(info);
2486 /* Caller must free the original domain */
2490 spin_lock(&iommu->lock);
2491 ret = domain_attach_iommu(domain, iommu);
2492 spin_unlock(&iommu->lock);
2495 spin_unlock_irqrestore(&device_domain_lock, flags);
2496 free_devinfo_mem(info);
2500 list_add(&info->link, &domain->devices);
2501 list_add(&info->global, &device_domain_list);
2503 dev->archdata.iommu = info;
2504 spin_unlock_irqrestore(&device_domain_lock, flags);
2506 /* PASID table is mandatory for a PCI device in scalable mode. */
2507 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2508 ret = intel_pasid_alloc_table(dev);
2510 dev_err(dev, "PASID table allocation failed\n");
2511 dmar_remove_one_dev_info(dev);
2515 /* Setup the PASID entry for requests without PASID: */
2516 spin_lock(&iommu->lock);
2517 if (hw_pass_through && domain_type_is_si(domain))
2518 ret = intel_pasid_setup_pass_through(iommu, domain,
2519 dev, PASID_RID2PASID);
2521 ret = intel_pasid_setup_second_level(iommu, domain,
2522 dev, PASID_RID2PASID);
2523 spin_unlock(&iommu->lock);
2525 dev_err(dev, "Setup RID2PASID failed\n");
2526 dmar_remove_one_dev_info(dev);
2531 if (dev && domain_context_mapping(domain, dev)) {
2532 dev_err(dev, "Domain context map failed\n");
2533 dmar_remove_one_dev_info(dev);
/*
 * pci_for_each_dma_alias() callback that simply records each alias;
 * after the walk, *opaque holds the last alias visited.
 */
2540 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2542 *(u16 *)opaque = alias;
/*
 * Return the domain to use for @dev: if a PCI DMA alias of the device
 * already has a domain, reuse it; otherwise allocate and initialise a
 * fresh domain with the requested guest address width @gaw.
 */
2546 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2548 struct device_domain_info *info;
2549 struct dmar_domain *domain = NULL;
2550 struct intel_iommu *iommu;
2552 unsigned long flags;
2555 iommu = device_to_iommu(dev, &bus, &devfn);
2559 if (dev_is_pci(dev)) {
2560 struct pci_dev *pdev = to_pci_dev(dev);
2562 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2564 spin_lock_irqsave(&device_domain_lock, flags);
2565 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2566 PCI_BUS_NUM(dma_alias),
2569 iommu = info->iommu;
2570 domain = info->domain;
2572 spin_unlock_irqrestore(&device_domain_lock, flags);
2574 /* DMA alias already has a domain, use it */
2579 /* Allocate and initialize new domain for the device */
2580 domain = alloc_domain(0);
2583 if (domain_init(domain, iommu, gaw)) {
2584 domain_exit(domain);
/*
 * Bind @domain to @dev via dmar_insert_one_dev_info(); when the PCI
 * DMA alias differs from the device's own request ID, register the
 * alias first so both resolve to the same domain. Returns the domain
 * actually in use (which may differ from @domain if a race installed
 * another one).
 */
2592 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2593 struct dmar_domain *domain)
2595 struct intel_iommu *iommu;
2596 struct dmar_domain *tmp;
2597 u16 req_id, dma_alias;
2600 iommu = device_to_iommu(dev, &bus, &devfn);
2604 req_id = ((u16)bus << 8) | devfn;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 /* register PCI DMA alias device */
2612 if (req_id != dma_alias) {
2613 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff, NULL, domain);
2616 if (!tmp || tmp != domain)
2621 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2622 if (!tmp || tmp != domain)
/*
 * Create a 1:1 (identity) mapping of [start, end] in @domain: reserve
 * the matching IOVA range so the allocator never reuses it, clear any
 * stale PTEs (the range may overlap already-mapped RAM), then map each
 * virtual pfn to the identical physical pfn with read/write access.
 */
2628 static int iommu_domain_identity_map(struct dmar_domain *domain,
2629 unsigned long long start,
2630 unsigned long long end)
2632 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2633 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2635 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2636 dma_to_mm_pfn(last_vpfn))) {
2637 pr_err("Reserving iova failed\n");
2641 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2643 * RMRR range might have overlap with physical memory range,
2646 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2648 return __domain_mapping(domain, first_vpfn, NULL,
2649 first_vpfn, last_vpfn - first_vpfn + 1,
2650 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * Validate an RMRR-style identity-map request for @dev and apply it
 * via iommu_domain_identity_map(). Skipped for hardware passthrough
 * on the static-identity domain; warns (blaming the BIOS) on inverted
 * or out-of-address-width ranges.
 */
2653 static int domain_prepare_identity_map(struct device *dev,
2654 struct dmar_domain *domain,
2655 unsigned long long start,
2656 unsigned long long end)
2658 /* For _hardware_ passthrough, don't bother. But for software
2659 passthrough, we do it anyway -- it may indicate a memory
2660 range which is reserved in E820, so which didn't get set
2661 up to start with in si_domain */
2662 if (domain == si_domain && hw_pass_through) {
2663 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2668 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2671 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2672 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2673 dmi_get_system_info(DMI_BIOS_VENDOR),
2674 dmi_get_system_info(DMI_BIOS_VERSION),
2675 dmi_get_system_info(DMI_PRODUCT_VERSION));
2679 if (end >> agaw_to_width(domain->agaw)) {
2680 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2681 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2682 agaw_to_width(domain->agaw),
2683 dmi_get_system_info(DMI_BIOS_VENDOR),
2684 dmi_get_system_info(DMI_BIOS_VERSION),
2685 dmi_get_system_info(DMI_PRODUCT_VERSION));
2689 return iommu_domain_identity_map(domain, start, end);
/* Forward declaration: defined later in the file. */
2692 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * Create the global static-identity (si) domain: allocate it, identity-
 * map every online node's physical memory ranges into it, then also
 * identity-map RMRRs for devices whose RMRRs are relaxable (GFX/USB)
 * so those devices can use the si_domain too.
 */
2694 static int __init si_domain_init(int hw)
2696 struct dmar_rmrr_unit *rmrr;
2700 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2704 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2705 domain_exit(si_domain);
/* Identity-map all of physical memory, node by node. */
2712 for_each_online_node(nid) {
2713 unsigned long start_pfn, end_pfn;
2716 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2717 ret = iommu_domain_identity_map(si_domain,
2718 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2725 * Normally we use DMA domains for devices which have RMRRs. But we
2726 * loose this requirement for graphic and usb devices. Identity map
2727 * the RMRRs for graphic and USB devices so that they could use the
2730 for_each_rmrr_units(rmrr) {
2731 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2733 unsigned long long start = rmrr->base_address;
2734 unsigned long long end = rmrr->end_address;
/* Locked (non-relaxable) RMRR devices stay out of si_domain. */
2736 if (device_is_rmrr_locked(dev))
2739 if (WARN_ON(end < start ||
2740 end >> agaw_to_width(si_domain->agaw)))
2743 ret = iommu_domain_identity_map(si_domain, start, end);
/*
 * Return non-zero if @dev is currently attached to the static identity
 * (si) domain; 0 otherwise.  (The fall-through return is elided in this
 * listing.)
 */
2752 static int identity_mapping(struct device *dev)
2754 struct device_domain_info *info;
2756 info = dev->archdata.iommu;
2757 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2758 return (info->domain == si_domain);
/*
 * Attach @dev to @domain.  Looks up the IOMMU covering the device and
 * inserts a device_domain_info; fails if the insert resolves to a
 * different domain than requested.
 */
2763 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2765 struct dmar_domain *ndomain;
2766 struct intel_iommu *iommu;
2769 iommu = device_to_iommu(dev, &bus, &devfn);
2773 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2774 if (ndomain != domain)
/*
 * Return true if any RMRR unit's device scope contains @dev itself or
 * a PCI bridge upstream of it (is_downstream_to_pci_bridge()).
 */
2780 static bool device_has_rmrr(struct device *dev)
2782 struct dmar_rmrr_unit *rmrr;
2787 for_each_rmrr_units(rmrr) {
2789 * Return TRUE if this RMRR contains the device that
2792 for_each_active_dev_scope(rmrr->devices,
2793 rmrr->devices_cnt, i, tmp)
2795 is_downstream_to_pci_bridge(dev, tmp)) {
2805 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2806 * is relaxable (ie. is allowed to be not enforced under some conditions)
2807 * @dev: device handle
2809 * We assume that PCI USB devices with RMRRs have them largely
2810 * for historical reasons and that the RMRR space is not actively used post
2811 * boot. This exclusion may change if vendors begin to abuse it.
2813 * The same exception is made for graphics devices, with the requirement that
2814 * any use of the RMRR regions will be torn down before assigning the device
2817 * Return: true if the RMRR is relaxable, false otherwise
2819 static bool device_rmrr_is_relaxable(struct device *dev)
2821 struct pci_dev *pdev;
/* Non-PCI devices are never relaxable. */
2823 if (!dev_is_pci(dev))
2826 pdev = to_pci_dev(dev);
/* Only USB and graphics PCI devices get the relaxation. */
2827 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2834 * There are a couple cases where we need to restrict the functionality of
2835 * devices associated with RMRRs. The first is when evaluating a device for
2836 * identity mapping because problems exist when devices are moved in and out
2837 * of domains and their respective RMRR information is lost. This means that
2838 * a device with associated RMRRs will never be in a "passthrough" domain.
2839 * The second is use of the device through the IOMMU API. This interface
2840 * expects to have full control of the IOVA space for the device. We cannot
2841 * satisfy both the requirement that RMRR access is maintained and have an
2842 * unencumbered IOVA space. We also have no ability to quiesce the device's
2843 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2844 * We therefore prevent devices associated with an RMRR from participating in
2845 * the IOMMU API, which eliminates them from device assignment.
2847 * In both cases, devices which have relaxable RMRRs are not concerned by this
2848 * restriction. See device_rmrr_is_relaxable comment.
/* True when @dev has an RMRR that is NOT relaxable. */
2850 static bool device_is_rmrr_locked(struct device *dev)
2852 if (!device_has_rmrr(dev))
2855 if (device_rmrr_is_relaxable(dev))
2862 * Return the required default domain type for a specific device.
2864 * @dev: the device in query
2865 * @startup: true if this is during early boot
2868 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2869 * - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2870 * - 0: both identity and dynamic domains work for this device
2872 static int device_def_domain_type(struct device *dev)
2874 if (dev_is_pci(dev)) {
2875 struct pci_dev *pdev = to_pci_dev(dev);
/* Non-relaxable RMRR devices must never be identity-mapped. */
2877 if (device_is_rmrr_locked(dev))
2878 return IOMMU_DOMAIN_DMA;
2881 * Prevent any device marked as untrusted from getting
2882 * placed into the statically identity mapping domain.
2884 if (pdev->untrusted)
2885 return IOMMU_DOMAIN_DMA;
/* Command-line requested identity map for Azalia audio / graphics. */
2887 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2888 return IOMMU_DOMAIN_IDENTITY;
2890 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2891 return IOMMU_DOMAIN_IDENTITY;
2894 * We want to start off with all devices in the 1:1 domain, and
2895 * take them out later if we find they can't access all of memory.
2897 * However, we can't do this for PCI devices behind bridges,
2898 * because all PCI devices behind the same bridge will end up
2899 * with the same source-id on their transactions.
2901 * Practically speaking, we can't change things around for these
2902 * devices at run-time, because we can't be sure there'll be no
2903 * DMA transactions in flight for any of their siblings.
2905 * So PCI devices (unless they're on the root bus) as well as
2906 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2907 * the 1:1 domain, just in _case_ one of their siblings turns out
2908 * not to be able to map all of memory.
2910 if (!pci_is_pcie(pdev)) {
2911 if (!pci_is_root_bus(pdev->bus))
2912 return IOMMU_DOMAIN_DMA;
2913 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2914 return IOMMU_DOMAIN_DMA;
2915 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2916 return IOMMU_DOMAIN_DMA;
/* Non-PCI path: devices with (relaxable) RMRRs still need DMA domains. */
2918 if (device_has_rmrr(dev))
2919 return IOMMU_DOMAIN_DMA;
/* Global "iommu=pt"-style policy decides the remaining devices. */
2922 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2923 IOMMU_DOMAIN_IDENTITY : 0;
/*
 * Bring the IOMMU's invalidation machinery to a known state and pick
 * the invalidation method: Queued Invalidation (QI) if it can be
 * enabled, otherwise register-based invalidation.  Sets the
 * iommu->flush callbacks accordingly.
 */
2926 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2929 * Start from the sane iommu hardware state.
2930 * If the queued invalidation is already initialized by us
2931 * (for example, while enabling interrupt-remapping) then
2932 * we got the things already rolling from a sane state.
2936 * Clear any previous faults.
2938 dmar_fault(-1, iommu);
2940 * Disable queued invalidation if supported and already enabled
2941 * before OS handover.
2943 dmar_disable_qi(iommu);
2946 if (dmar_enable_qi(iommu)) {
2948 * Queued Invalidate not enabled, use Register Based Invalidate
2950 iommu->flush.flush_context = __iommu_flush_context;
2951 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2952 pr_info("%s: Using Register based invalidation\n",
2955 iommu->flush.flush_context = qi_flush_context;
2956 iommu->flush.flush_iotlb = qi_flush_iotlb;
2957 pr_info("%s: Using Queued invalidation\n", iommu->name);
/*
 * Kdump support: copy one bus's context table from the previous
 * kernel's root entry (@old_re) into freshly allocated page(s),
 * storing them in @tbl.  @ext selects the extended root/context
 * format (two context tables per bus).  Domain IDs found in copied
 * entries are marked used in iommu->domain_ids, and each entry is
 * tagged as "copied" with PASIDs disabled (see comment below).
 * NOTE(review): several error-handling/cleanup lines are elided in
 * this listing.
 */
2961 static int copy_context_table(struct intel_iommu *iommu,
2962 struct root_entry *old_re,
2963 struct context_entry **tbl,
2966 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2967 struct context_entry *new_ce = NULL, ce;
2968 struct context_entry *old_ce = NULL;
2969 struct root_entry re;
2970 phys_addr_t old_ce_phys;
/* Extended format has two context tables per bus. */
2972 tbl_idx = ext ? bus * 2 : bus;
2973 memcpy(&re, old_re, sizeof(re));
2975 for (devfn = 0; devfn < 256; devfn++) {
2976 /* First calculate the correct index */
2977 idx = (ext ? devfn * 2 : devfn) % 256;
2980 /* First save what we may have and clean up */
2982 tbl[tbl_idx] = new_ce;
2983 __iommu_flush_cache(iommu, new_ce,
/* Lower vs. upper context-table pointer, depending on half. */
2993 old_ce_phys = root_entry_lctp(&re);
2995 old_ce_phys = root_entry_uctp(&re);
2998 if (ext && devfn == 0) {
2999 /* No LCTP, try UCTP */
/* Map the old kernel's context table so we can read it. */
3008 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3013 new_ce = alloc_pgtable_page(iommu->node);
3020 /* Now copy the context entry */
3021 memcpy(&ce, old_ce + idx, sizeof(ce));
3023 if (!__context_present(&ce))
/* Reserve the old kernel's domain ID so we don't reuse it. */
3026 did = context_domain_id(&ce);
3027 if (did >= 0 && did < cap_ndoms(iommu->cap))
3028 set_bit(did, iommu->domain_ids);
3031 * We need a marker for copied context entries. This
3032 * marker needs to work for the old format as well as
3033 * for extended context entries.
3035 * Bit 67 of the context entry is used. In the old
3036 * format this bit is available to software, in the
3037 * extended format it is the PGE bit, but PGE is ignored
3038 * by HW if PASIDs are disabled (and thus still
3041 * So disable PASIDs first and then mark the entry
3042 * copied. This means that we don't copy PASID
3043 * translations from the old kernel, but this is fine as
3044 * faults there are not fatal.
3046 context_clear_pasid_enable(&ce);
3047 context_set_copied(&ce);
3052 tbl[tbl_idx + pos] = new_ce;
3054 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
/*
 * Kdump support: copy the previous kernel's entire root/context table
 * set into this kernel's tables so in-flight DMA keeps working while
 * we take over.  Bails out if the root-table format (RTT/extended)
 * would have to change, since that requires disabling translation.
 * NOTE(review): some declarations and cleanup lines are elided in
 * this listing.
 */
3063 static int copy_translation_tables(struct intel_iommu *iommu)
3065 struct context_entry **ctxt_tbls;
3066 struct root_entry *old_rt;
3067 phys_addr_t old_rt_phys;
3068 int ctxt_table_entries;
3069 unsigned long flags;
/* Read the old root-table address/format from hardware. */
3074 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3075 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3076 new_ext = !!ecap_ecs(iommu->ecap);
3079 * The RTT bit can only be changed when translation is disabled,
3080 * but disabling translation means to open a window for data
3081 * corruption. So bail out and don't copy anything if we would
3082 * have to change the bit.
3087 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3091 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3095 /* This is too big for the stack - allocate it from slab */
3096 ctxt_table_entries = ext ? 512 : 256;
3098 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3102 for (bus = 0; bus < 256; bus++) {
3103 ret = copy_context_table(iommu, &old_rt[bus],
3104 ctxt_tbls, bus, ext);
3106 pr_err("%s: Failed to copy context table for bus %d\n",
/* Publish the copied tables under the iommu lock. */
3112 spin_lock_irqsave(&iommu->lock, flags);
3114 /* Context tables are copied, now write them to the root_entry table */
3115 for (bus = 0; bus < 256; bus++) {
3116 int idx = ext ? bus * 2 : bus;
3119 if (ctxt_tbls[idx]) {
/* Low bit set = root entry present. */
3120 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3121 iommu->root_entry[bus].lo = val;
3124 if (!ext || !ctxt_tbls[idx + 1])
3127 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3128 iommu->root_entry[bus].hi = val;
3131 spin_unlock_irqrestore(&iommu->lock, flags);
3135 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
/*
 * Main boot-time initialization of all DMAR (IOMMU) units:
 *  - count units and allocate the global g_iommus array;
 *  - per IOMMU: init QI, domains, root entry, and (in kdump) copy the
 *    previous kernel's translation tables;
 *  - set root entries and do global context/IOTLB flushes;
 *  - build the si (identity) domain and enable translation.
 * NOTE(review): many error-path/cleanup lines are elided in this
 * listing.
 */
3145 static int __init init_dmars(void)
3147 struct dmar_drhd_unit *drhd;
3148 struct intel_iommu *iommu;
3154 * initialize and program root entry to not present
/* Pass 1: count DRHD units (capped at DMAR_UNITS_SUPPORTED). */
3157 for_each_drhd_unit(drhd) {
3159 * lock not needed as this is only incremented in the single
3160 * threaded kernel __init code path all other access are read
3163 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3167 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3170 /* Preallocate enough resources for IOMMU hot-addition */
3171 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3172 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3174 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3177 pr_err("Allocating global iommu array failed\n");
/* Pass 2: per-IOMMU setup. */
3182 for_each_iommu(iommu, drhd) {
3183 if (drhd->ignored) {
3184 iommu_disable_translation(iommu);
3189 * Find the max pasid size of all IOMMU's in the system.
3190 * We need to ensure the system pasid table is no bigger
3191 * than the smallest supported.
3193 if (pasid_supported(iommu)) {
3194 u32 temp = 2 << ecap_pss(iommu->ecap);
3196 intel_pasid_max_id = min_t(u32, temp,
3197 intel_pasid_max_id);
3200 g_iommus[iommu->seq_id] = iommu;
3202 intel_iommu_init_qi(iommu);
3204 ret = iommu_init_domains(iommu);
3208 init_translation_status(iommu);
/* Pre-enabled translation outside kdump is unexpected: reset it. */
3210 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3211 iommu_disable_translation(iommu);
3212 clear_translation_pre_enabled(iommu);
3213 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3219 * we could share the same root & context tables
3220 * among all IOMMU's. Need to Split it later.
3222 ret = iommu_alloc_root_entry(iommu);
/* Kdump: try to inherit the previous kernel's tables. */
3226 if (translation_pre_enabled(iommu)) {
3227 pr_info("Translation already enabled - trying to copy translation structures\n");
3229 ret = copy_translation_tables(iommu);
3232 * We found the IOMMU with translation
3233 * enabled - but failed to copy over the
3234 * old root-entry table. Try to proceed
3235 * by disabling translation now and
3236 * allocating a clean root-entry table.
3237 * This might cause DMAR faults, but
3238 * probably the dump will still succeed.
3240 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3242 iommu_disable_translation(iommu);
3243 clear_translation_pre_enabled(iommu);
3245 pr_info("Copied translation tables from previous kernel for %s\n",
/* Hardware passthrough needs support on every IOMMU. */
3250 if (!ecap_pass_through(iommu->ecap))
3251 hw_pass_through = 0;
3252 #ifdef CONFIG_INTEL_IOMMU_SVM
3253 if (pasid_supported(iommu))
3254 intel_svm_init(iommu);
3259 * Now that qi is enabled on all iommus, set the root entry and flush
3260 * caches. This is required on some Intel X58 chipsets, otherwise the
3261 * flush_context function will loop forever and the boot hangs.
3263 for_each_active_iommu(iommu, drhd) {
3264 iommu_flush_write_buffer(iommu);
3265 iommu_set_root_entry(iommu);
3266 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3267 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3270 if (iommu_pass_through)
3271 iommu_identity_mapping |= IDENTMAP_ALL;
3273 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3278 iommu_identity_mapping |= IDENTMAP_GFX;
3280 check_tylersburg_isoch();
3282 ret = si_domain_init(hw_pass_through);
3289 * global invalidate context cache
3290 * global invalidate iotlb
3291 * enable translation
3293 for_each_iommu(iommu, drhd) {
3294 if (drhd->ignored) {
3296 * we always have to disable PMRs or DMA may fail on
3300 iommu_disable_protect_mem_regions(iommu);
3304 iommu_flush_write_buffer(iommu);
3306 #ifdef CONFIG_INTEL_IOMMU_SVM
3307 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3309 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3310 * could cause possible lock race condition.
3312 up_write(&dmar_global_lock);
3313 ret = intel_svm_enable_prq(iommu);
3314 down_write(&dmar_global_lock);
3319 ret = dmar_set_interrupt(iommu);
/* Error unwinding: tear down every active IOMMU. */
3327 for_each_active_iommu(iommu, drhd) {
3328 disable_dmar_iommu(iommu);
3329 free_dmar_iommu(iommu);
3338 /* This takes a number of _MM_ pages, not VTD pages */
/*
 * Allocate an IOVA range of @nrpages MM pages for @dev in @domain,
 * honoring @dma_mask.  Tries to stay below 4GiB first (helps devices
 * and avoids flushing 64-bit-capable allocations out of the cache),
 * falling back to the full mask.  Returns the first IOVA pfn, or 0 on
 * failure.
 */
3339 static unsigned long intel_alloc_iova(struct device *dev,
3340 struct dmar_domain *domain,
3341 unsigned long nrpages, uint64_t dma_mask)
3343 unsigned long iova_pfn;
3345 /* Restrict dma_mask to the width that the iommu can handle */
3346 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3347 /* Ensure we reserve the whole size-aligned region */
3348 nrpages = __roundup_pow_of_two(nrpages);
3350 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3352 * First try to allocate an io virtual address in
3353 * DMA_BIT_MASK(32) and if that fails then try allocating
3356 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3357 IOVA_PFN(DMA_BIT_MASK(32)), false);
3361 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3362 IOVA_PFN(dma_mask), true);
3363 if (unlikely(!iova_pfn)) {
3364 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
/*
 * Find or allocate a private DMA-API domain for @dev, program its RMRR
 * identity mappings, and bind the device to it.  Returns the domain or
 * (on the elided error path) NULL.  The domain type is forced to
 * IOMMU_DOMAIN_DMA.
 */
3371 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3373 struct dmar_domain *domain, *tmp;
3374 struct dmar_rmrr_unit *rmrr;
3375 struct device *i_dev;
3378 /* Device shouldn't be attached by any domains. */
3379 domain = find_domain(dev);
3383 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3387 /* We have a new domain - setup possible RMRRs for the device */
3389 for_each_rmrr_units(rmrr) {
3390 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3395 ret = domain_prepare_identity_map(dev, domain,
3399 dev_err(dev, "Mapping reserved region failed\n");
3404 tmp = set_domain_for_dev(dev, domain);
/* Another thread may have bound a different domain; drop ours. */
3405 if (!tmp || domain != tmp) {
3406 domain_exit(domain);
3412 dev_err(dev, "Allocating domain failed\n");
3414 domain->domain.type = IOMMU_DOMAIN_DMA;
3419 /* Check if the dev needs to go through non-identity map and unmap process.*/
/*
 * Decide whether DMA for @dev must go through IOMMU translation.
 * Identity-mapped devices normally bypass it, but a device whose DMA
 * mask cannot cover all of memory is demoted from si_domain to a
 * private DMA domain here.  (Return statements are elided in this
 * listing.)
 */
3420 static bool iommu_need_mapping(struct device *dev)
3424 if (iommu_dummy(dev))
3427 ret = identity_mapping(dev);
3429 u64 dma_mask = *dev->dma_mask;
3431 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3432 dma_mask = dev->coherent_dma_mask;
/* Mask covers all RAM the device may see: identity mapping is fine. */
3434 if (dma_mask >= dma_get_required_mask(dev))
3438 * 32 bit DMA is removed from si_domain and fall back to
3439 * non-identity mapping.
3441 dmar_remove_one_dev_info(dev);
3442 ret = iommu_request_dma_domain_for_dev(dev);
3444 struct iommu_domain *domain;
3445 struct dmar_domain *dmar_domain;
3447 domain = iommu_get_domain_for_dev(dev);
3449 dmar_domain = to_dmar_domain(domain);
3450 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3452 get_private_domain_for_dev(dev);
3455 dev_info(dev, "32bit DMA uses non-identity mapping\n");
/*
 * Core single-buffer map: allocate an IOVA range sized for
 * [paddr, paddr+size), install page-table entries with read/write
 * permission derived from @dir, and return the resulting DMA address
 * (IOVA plus the sub-page offset of @paddr).  Returns
 * DMA_MAPPING_ERROR on failure.
 */
3461 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3462 size_t size, int dir, u64 dma_mask)
3464 struct dmar_domain *domain;
3465 phys_addr_t start_paddr;
3466 unsigned long iova_pfn;
3469 struct intel_iommu *iommu;
3470 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3472 BUG_ON(dir == DMA_NONE);
3474 domain = find_domain(dev);
3476 return DMA_MAPPING_ERROR;
3478 iommu = domain_get_iommu(domain);
3479 size = aligned_nrpages(paddr, size);
3481 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3486 * Check if DMAR supports zero-length reads on write only
3489 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3490 !cap_zlr(iommu->cap))
3491 prot |= DMA_PTE_READ;
3492 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3493 prot |= DMA_PTE_WRITE;
3495 * paddr - (paddr + size) might be partial page, we should map the whole
3496 * page. Note: if two part of one page are separately mapped, we
3497 * might have two guest_addr mapping to the same host paddr, but this
3498 * is not a big problem
3500 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3501 mm_to_dma_pfn(paddr_pfn), size, prot);
/* Success: DMA address = IOVA base + offset within first page. */
3505 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3506 start_paddr += paddr & ~PAGE_MASK;
/* Error path: give the IOVA range back. */
3511 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3512 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3513 size, (unsigned long long)paddr, dir);
3514 return DMA_MAPPING_ERROR;
/*
 * dma_map_ops .map_page: translate through the IOMMU when the device
 * needs it, otherwise fall straight through to the direct-mapping path.
 */
3517 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3518 unsigned long offset, size_t size,
3519 enum dma_data_direction dir,
3520 unsigned long attrs)
3522 if (iommu_need_mapping(dev))
3523 return __intel_map_single(dev, page_to_phys(page) + offset,
3524 size, dir, *dev->dma_mask);
3525 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
/*
 * dma_map_ops .map_resource: same split as intel_map_page, but for a
 * raw physical (MMIO) address rather than a struct page.
 */
3528 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3529 size_t size, enum dma_data_direction dir,
3530 unsigned long attrs)
3532 if (iommu_need_mapping(dev))
3533 return __intel_map_single(dev, phys_addr, size, dir,
3535 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
/*
 * Core unmap: tear down the PTEs covering [dev_addr, dev_addr+size),
 * then either flush the IOTLB synchronously and free the IOVA/pages
 * now (strict mode, untrusted devices, or no flush queue) or defer the
 * release via the IOVA flush queue.
 */
3538 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3540 struct dmar_domain *domain;
3541 unsigned long start_pfn, last_pfn;
3542 unsigned long nrpages;
3543 unsigned long iova_pfn;
3544 struct intel_iommu *iommu;
3545 struct page *freelist;
3546 struct pci_dev *pdev = NULL;
3548 domain = find_domain(dev);
3551 iommu = domain_get_iommu(domain);
3553 iova_pfn = IOVA_PFN(dev_addr);
3555 nrpages = aligned_nrpages(dev_addr, size);
3556 start_pfn = mm_to_dma_pfn(iova_pfn);
3557 last_pfn = start_pfn + nrpages - 1;
3559 if (dev_is_pci(dev))
3560 pdev = to_pci_dev(dev);
3562 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
/* Unlink the page tables; freed table pages come back as a list. */
3564 freelist = domain_unmap(domain, start_pfn, last_pfn);
3566 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3567 !has_iova_flush_queue(&domain->iovad)) {
3568 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3569 nrpages, !freelist, 0);
3571 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3572 dma_free_pagelist(freelist);
3574 queue_iova(&domain->iovad, iova_pfn, nrpages,
3575 (unsigned long)freelist);
3577 * queue up the release of the unmap to save the 1/6th of the
3578 * cpu used up by the iotlb flush operation...
/* dma_map_ops .unmap_page: IOMMU or direct path, mirroring the map side. */
3583 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3584 size_t size, enum dma_data_direction dir,
3585 unsigned long attrs)
3587 if (iommu_need_mapping(dev))
3588 intel_unmap(dev, dev_addr, size);
3590 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
/* dma_map_ops .unmap_resource: only the IOMMU path needs teardown. */
3593 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3594 size_t size, enum dma_data_direction dir, unsigned long attrs)
3596 if (iommu_need_mapping(dev))
3597 intel_unmap(dev, dev_addr, size);
/*
 * dma_map_ops .alloc: allocate zeroed, page-aligned memory (CMA first
 * when blocking is allowed, plain pages otherwise) and map it through
 * the IOMMU against the device's coherent DMA mask.  Falls back to the
 * direct allocator when no IOMMU mapping is needed.
 */
3600 static void *intel_alloc_coherent(struct device *dev, size_t size,
3601 dma_addr_t *dma_handle, gfp_t flags,
3602 unsigned long attrs)
3604 struct page *page = NULL;
3607 if (!iommu_need_mapping(dev))
3608 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3610 size = PAGE_ALIGN(size);
3611 order = get_order(size);
3613 if (gfpflags_allow_blocking(flags)) {
3614 unsigned int count = size >> PAGE_SHIFT;
3616 page = dma_alloc_from_contiguous(dev, count, order,
3617 flags & __GFP_NOWARN);
3621 page = alloc_pages(flags, order);
3624 memset(page_address(page), 0, size);
3626 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3628 dev->coherent_dma_mask);
3629 if (*dma_handle != DMA_MAPPING_ERROR)
3630 return page_address(page);
/* Mapping failed: release CMA pages first, else the buddy pages. */
3631 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3632 __free_pages(page, order);
/*
 * dma_map_ops .free: inverse of intel_alloc_coherent — unmap the IOMMU
 * range, then return the pages to CMA or the buddy allocator.
 */
3637 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3638 dma_addr_t dma_handle, unsigned long attrs)
3641 struct page *page = virt_to_page(vaddr);
3643 if (!iommu_need_mapping(dev))
3644 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3646 size = PAGE_ALIGN(size);
3647 order = get_order(size);
3649 intel_unmap(dev, dma_handle, size);
3650 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3651 __free_pages(page, order);
/*
 * dma_map_ops .unmap_sg: the map side placed the whole list in one
 * contiguous IOVA range, so sum the per-segment page counts and unmap
 * that single range starting at the first segment's DMA address.
 */
3654 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3655 int nelems, enum dma_data_direction dir,
3656 unsigned long attrs)
3658 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3659 unsigned long nrpages = 0;
3660 struct scatterlist *sg;
3663 if (!iommu_need_mapping(dev))
3664 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3666 for_each_sg(sglist, sg, nelems, i) {
3667 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3670 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
/*
 * dma_map_ops .map_sg: allocate one IOVA range big enough for every
 * segment, derive read/write protection from @dir, and map the whole
 * scatterlist into that range via domain_sg_mapping().  On failure the
 * partially built page tables and the IOVA range are released.
 */
3673 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3674 enum dma_data_direction dir, unsigned long attrs)
3677 struct dmar_domain *domain;
3680 unsigned long iova_pfn;
3682 struct scatterlist *sg;
3683 unsigned long start_vpfn;
3684 struct intel_iommu *iommu;
3686 BUG_ON(dir == DMA_NONE);
3687 if (!iommu_need_mapping(dev))
3688 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3690 domain = find_domain(dev);
3694 iommu = domain_get_iommu(domain);
/* Total pages needed across all segments. */
3696 for_each_sg(sglist, sg, nelems, i)
3697 size += aligned_nrpages(sg->offset, sg->length);
3699 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3702 sglist->dma_length = 0;
3707 * Check if DMAR supports zero-length reads on write only
3710 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3711 !cap_zlr(iommu->cap))
3712 prot |= DMA_PTE_READ;
3713 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3714 prot |= DMA_PTE_WRITE;
3716 start_vpfn = mm_to_dma_pfn(iova_pfn);
3718 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3719 if (unlikely(ret)) {
/* Roll back: free the page tables we built, then the IOVA range. */
3720 dma_pte_free_pagetable(domain, start_vpfn,
3721 start_vpfn + size - 1,
3722 agaw_to_level(domain->agaw) + 1);
3723 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
/*
 * The DMA-API operations table installed for devices behind a VT-d
 * IOMMU.  dma_supported is delegated to the direct-mapping check.
 */
3730 static const struct dma_map_ops intel_dma_ops = {
3731 .alloc = intel_alloc_coherent,
3732 .free = intel_free_coherent,
3733 .map_sg = intel_map_sg,
3734 .unmap_sg = intel_unmap_sg,
3735 .map_page = intel_map_page,
3736 .unmap_page = intel_unmap_page,
3737 .map_resource = intel_map_resource,
3738 .unmap_resource = intel_unmap_resource,
3739 .dma_supported = dma_direct_supported,
/* Create the slab cache used for struct dmar_domain allocations. */
3742 static inline int iommu_domain_cache_init(void)
3746 iommu_domain_cache = kmem_cache_create("iommu_domain",
3747 sizeof(struct dmar_domain),
3752 if (!iommu_domain_cache) {
3753 pr_err("Couldn't create iommu_domain cache\n")
/* Create the slab cache used for struct device_domain_info allocations. */
3760 static inline int iommu_devinfo_cache_init(void)
3764 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3765 sizeof(struct device_domain_info),
3769 if (!iommu_devinfo_cache) {
3770 pr_err("Couldn't create devinfo cache\n")
/*
 * Set up all allocation pools: the IOVA cache plus the two slab caches
 * above.  On devinfo-cache failure the domain cache is destroyed again
 * (the IOVA-cache unwind line is elided in this listing).
 */
3777 static int __init iommu_init_mempool(void)
3780 ret = iova_cache_get();
3784 ret = iommu_domain_cache_init();
3788 ret = iommu_devinfo_cache_init();
3792 kmem_cache_destroy(iommu_domain_cache);
/* Tear down the slab caches created by iommu_init_mempool(). */
3799 static void __init iommu_exit_mempool(void)
3801 kmem_cache_destroy(iommu_devinfo_cache);
3802 kmem_cache_destroy(iommu_domain_cache);
/*
 * PCI fixup for the Intel IOAT (SNB QuickData) device: some BIOSes
 * report the wrong DMAR unit for it.  Read the real VT-d base from
 * config space and, if the matched DRHD doesn't sit at the expected
 * offset from it, taint and mark the device as not translated.
 */
3806 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3808 struct dmar_drhd_unit *drhd;
3812 /* We know that this device on this chipset has its own IOMMU.
3813 * If we find it under a different IOMMU, then the BIOS is lying
3814 * to us. Hope that the IOMMU for this device is actually
3815 * disabled, and it needs no translation...
3817 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3819 /* "can't" happen */
3820 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3823 vtbar &= 0xffff0000;
3825 /* we know that the this iommu should be at offset 0xa000 from vtbar */
3826 drhd = dmar_find_matched_drhd_unit(pdev);
3827 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3828 TAINT_FIRMWARE_WORKAROUND,
3829 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3830 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3832 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * Mark DMAR units that can be skipped: units whose device scope is
 * empty, and (when dmar_map_gfx is off) units that cover only graphics
 * devices — those devices are flagged with DUMMY_DEVICE_DOMAIN_INFO so
 * they bypass translation.
 */
3834 static void __init init_no_remapping_devices(void)
3836 struct dmar_drhd_unit *drhd;
/* Pass 1: ignore units with no devices in scope. */
3840 for_each_drhd_unit(drhd) {
3841 if (!drhd->include_all) {
3842 for_each_active_dev_scope(drhd->devices,
3843 drhd->devices_cnt, i, dev)
3845 /* ignore DMAR unit if no devices exist */
3846 if (i == drhd->devices_cnt)
/* Pass 2: detect graphics-only units. */
3851 for_each_active_drhd_unit(drhd) {
3852 if (drhd->include_all)
3855 for_each_active_dev_scope(drhd->devices,
3856 drhd->devices_cnt, i, dev)
3857 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3859 if (i < drhd->devices_cnt)
3862 /* This IOMMU has *only* gfx devices. Either bypass it or
3863 set the gfx_mapped flag, as appropriate */
3864 if (!dmar_map_gfx) {
3866 for_each_active_dev_scope(drhd->devices,
3867 drhd->devices_cnt, i, dev)
3868 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3873 #ifdef CONFIG_SUSPEND
/*
 * Resume-path hardware re-init: re-enable QI on every active IOMMU,
 * then re-program root entries, do global context/IOTLB invalidation,
 * and re-enable translation.  Ignored units only get their protected
 * memory regions disabled.
 */
3874 static int init_iommu_hw(void)
3876 struct dmar_drhd_unit *drhd;
3877 struct intel_iommu *iommu = NULL;
3879 for_each_active_iommu(iommu, drhd)
3881 dmar_reenable_qi(iommu);
3883 for_each_iommu(iommu, drhd) {
3884 if (drhd->ignored) {
3886 * we always have to disable PMRs or DMA may fail on
3890 iommu_disable_protect_mem_regions(iommu);
3894 iommu_flush_write_buffer(iommu);
3896 iommu_set_root_entry(iommu);
3898 iommu->flush.flush_context(iommu, 0, 0, 0,
3899 DMA_CCMD_GLOBAL_INVL);
3900 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3901 iommu_enable_translation(iommu);
3902 iommu_disable_protect_mem_regions(iommu);
/* Globally invalidate the context cache and IOTLB on every active IOMMU. */
3908 static void iommu_flush_all(void)
3910 struct dmar_drhd_unit *drhd;
3911 struct intel_iommu *iommu;
3913 for_each_active_iommu(iommu, drhd) {
3914 iommu->flush.flush_context(iommu, 0, 0, 0,
3915 DMA_CCMD_GLOBAL_INVL);
3916 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3917 DMA_TLB_GLOBAL_FLUSH);
/*
 * Syscore suspend hook: save each IOMMU's fault-event registers into a
 * freshly allocated iommu_state buffer, then disable translation.
 * The error path frees any state buffers already allocated.
 */
3921 static int iommu_suspend(void)
3923 struct dmar_drhd_unit *drhd;
3924 struct intel_iommu *iommu = NULL;
3927 for_each_active_iommu(iommu, drhd) {
3928 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3930 if (!iommu->iommu_state)
3936 for_each_active_iommu(iommu, drhd) {
3937 iommu_disable_translation(iommu);
/* Register snapshots must be taken under the register lock. */
3939 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3941 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3942 readl(iommu->reg + DMAR_FECTL_REG);
3943 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3944 readl(iommu->reg + DMAR_FEDATA_REG);
3945 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3946 readl(iommu->reg + DMAR_FEADDR_REG);
3947 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3948 readl(iommu->reg + DMAR_FEUADDR_REG);
3950 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/* Error unwind: free whatever state buffers were allocated. */
3955 for_each_active_iommu(iommu, drhd)
3956 kfree(iommu->iommu_state);
/*
 * Syscore resume hook: re-initialize the hardware via init_iommu_hw(),
 * restore the saved fault-event registers, and free the save buffers.
 * Under tboot a failure is fatal (panic), otherwise it only WARNs.
 */
3961 static void iommu_resume(void)
3963 struct dmar_drhd_unit *drhd;
3964 struct intel_iommu *iommu = NULL;
3967 if (init_iommu_hw()) {
3969 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3971 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3975 for_each_active_iommu(iommu, drhd) {
3977 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3979 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3980 iommu->reg + DMAR_FECTL_REG);
3981 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3982 iommu->reg + DMAR_FEDATA_REG);
3983 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3984 iommu->reg + DMAR_FEADDR_REG);
3985 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3986 iommu->reg + DMAR_FEUADDR_REG);
3988 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3991 for_each_active_iommu(iommu, drhd)
3992 kfree(iommu->iommu_state);
/* Hook the suspend/resume handlers above into the syscore framework. */
3995 static struct syscore_ops iommu_syscore_ops = {
3996 .resume = iommu_resume,
3997 .suspend = iommu_suspend,
/* Register the PM hooks; compiled to a no-op stub without CONFIG_PM. */
4000 static void __init init_iommu_pm_ops(void)
4002 register_syscore_ops(&iommu_syscore_ops);
4006 static inline void init_iommu_pm_ops(void) {}
4007 #endif /* CONFIG_PM */
/*
 * ACPI DMAR parser callback for an RMRR (Reserved Memory Region
 * Reporting) structure: allocate a dmar_rmrr_unit, record the address
 * range and device scope, and add it to the global dmar_rmrr_units
 * list.  (Allocation-failure returns are elided in this listing.)
 */
4009 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4011 struct acpi_dmar_reserved_memory *rmrr;
4012 struct dmar_rmrr_unit *rmrru;
4014 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4018 rmrru->hdr = header;
4019 rmrr = (struct acpi_dmar_reserved_memory *)header;
4020 rmrru->base_address = rmrr->base_address;
4021 rmrru->end_address = rmrr->end_address;
/* Device scope entries follow the fixed-size RMRR header. */
4023 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4024 ((void *)rmrr) + rmrr->header.length,
4025 &rmrru->devices_cnt);
4026 if (rmrru->devices_cnt && rmrru->devices == NULL)
4029 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * Look up an already-registered ATSR unit matching @atsr (same
 * segment, length, and byte-identical contents).  Returns the match
 * or (on the elided fall-through) NULL.
 */
4038 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4040 struct dmar_atsr_unit *atsru;
4041 struct acpi_dmar_atsr *tmp;
4043 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4044 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4045 if (atsr->segment != tmp->segment)
4047 if (atsr->header.length != tmp->header.length)
4049 if (memcmp(atsr, tmp, atsr->header.length) == 0)
/*
 * ACPI DMAR parser callback for an ATSR (ATS Reporting) structure:
 * deduplicate against existing units, then allocate a dmar_atsr_unit
 * with a private copy of the ACPI data (the source buffer may be
 * freed, e.g. when it came from an ACPI _DSM) and register it.
 */
4056 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4058 struct acpi_dmar_atsr *atsr;
4059 struct dmar_atsr_unit *atsru;
/* Hot-add of ATSR is only honored while the IOMMU driver is active. */
4061 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4064 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4065 atsru = dmar_find_atsr(atsr);
4069 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4074 * If memory is allocated from slab by ACPI _DSM method, we need to
4075 * copy the memory content because the memory buffer will be freed
4078 atsru->hdr = (void *)(atsru + 1);
4079 memcpy(atsru->hdr, hdr, hdr->length);
4080 atsru->include_all = atsr->flags & 0x1;
4081 if (!atsru->include_all) {
4082 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4083 (void *)atsr + atsr->header.length,
4084 &atsru->devices_cnt);
4085 if (atsru->devices_cnt && atsru->devices == NULL) {
4091 list_add_rcu(&atsru->list, &dmar_atsr_units);
/* Release an ATSR unit's device scope (and, presumably, the unit itself). */
4096 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4098 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/*
 * Hot-remove one ATSR: find the registered unit matching @hdr, unlink it
 * from the RCU list and free it.
 */
4102 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4104 struct acpi_dmar_atsr *atsr;
4105 struct dmar_atsr_unit *atsru;
4107 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4108 atsru = dmar_find_atsr(atsr);
4110 list_del_rcu(&atsru->list);
/* NOTE(review): an RCU grace period before the free is expected here —
 * the synchronization call is not visible in this extract. */
4112 intel_iommu_free_atsr(atsru);
/*
 * Check whether an ATSR can be safely removed: presumably fails if any
 * device in its scope is still present/active (the loop body and return
 * values are elided in this extract).
 */
4118 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4122 struct acpi_dmar_atsr *atsr;
4123 struct dmar_atsr_unit *atsru;
4125 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4126 atsru = dmar_find_atsr(atsr);
4130 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4131 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
/*
 * Bring up a hot-added DMAR unit: validate that the new IOMMU's
 * capabilities are compatible with the running configuration
 * (pass-through, snooping, superpage), then initialize domains, root
 * entry, QI, interrupts, and finally enable translation.  Error paths
 * unwind via disable_dmar_iommu()/free_dmar_iommu().
 */
4139 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4142 struct intel_iommu *iommu = dmaru->iommu;
4144 if (g_iommus[iommu->seq_id])
/* A new unit must not weaken guarantees already relied upon globally. */
4147 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4148 pr_warn("%s: Doesn't support hardware pass through.\n",
4152 if (!ecap_sc_support(iommu->ecap) &&
4153 domain_update_iommu_snooping(iommu)) {
4154 pr_warn("%s: Doesn't support snooping.\n",
4158 sp = domain_update_iommu_superpage(iommu) - 1;
4159 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4160 pr_warn("%s: Doesn't support large page.\n",
4166 * Disable translation if already enabled prior to OS handover.
4168 if (iommu->gcmd & DMA_GCMD_TE)
4169 iommu_disable_translation(iommu);
4171 g_iommus[iommu->seq_id] = iommu;
4172 ret = iommu_init_domains(iommu);
4174 ret = iommu_alloc_root_entry(iommu);
4178 #ifdef CONFIG_INTEL_IOMMU_SVM
4179 if (pasid_supported(iommu))
4180 intel_svm_init(iommu);
4183 if (dmaru->ignored) {
4185 * we always have to disable PMRs or DMA may fail on this device
4188 iommu_disable_protect_mem_regions(iommu);
4192 intel_iommu_init_qi(iommu);
4193 iommu_flush_write_buffer(iommu);
4195 #ifdef CONFIG_INTEL_IOMMU_SVM
4196 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4197 ret = intel_svm_enable_prq(iommu);
4202 ret = dmar_set_interrupt(iommu);
/* Flush global context and IOTLB caches before turning translation on. */
4206 iommu_set_root_entry(iommu);
4207 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4208 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4209 iommu_enable_translation(iommu);
4211 iommu_disable_protect_mem_regions(iommu);
4215 disable_dmar_iommu(iommu);
4217 free_dmar_iommu(iommu);
/*
 * DMAR unit hotplug entry point: on insert, call intel_iommu_add(); on
 * removal (the other branch, partly elided here), tear the unit down.
 * Does nothing when the IOMMU subsystem is not enabled.
 */
4221 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4224 struct intel_iommu *iommu = dmaru->iommu;
4226 if (!intel_iommu_enabled)
4232 ret = intel_iommu_add(dmaru);
4234 disable_dmar_iommu(iommu);
4235 free_dmar_iommu(iommu);
/* Tear down all registered RMRR and ATSR units (cleanup/error path). */
4241 static void intel_iommu_free_dmars(void)
4243 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4244 struct dmar_atsr_unit *atsru, *atsr_n;
4246 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4247 list_del(&rmrru->list);
4248 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4252 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4253 list_del(&atsru->list);
4254 intel_iommu_free_atsr(atsru);
/*
 * Decide whether @dev may use ATS: walk up the PCI hierarchy to the root
 * port, then check the ATSR units for a scope entry matching that root
 * port (or an include_all ATSR on the same segment).  Presumably returns
 * nonzero when ATS is allowed (returns elided in this extract).
 */
4258 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4261 struct pci_bus *bus;
4262 struct pci_dev *bridge = NULL;
4264 struct acpi_dmar_atsr *atsr;
4265 struct dmar_atsr_unit *atsru;
/* SR-IOV VFs share the PF's topology position. */
4267 dev = pci_physfn(dev);
4268 for (bus = dev->bus; bus; bus = bus->parent) {
4270 /* If it's an integrated device, allow ATS */
4273 /* Connected via non-PCIe: no ATS */
4274 if (!pci_is_pcie(bridge) ||
4275 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4277 /* If we found the root port, look it up in the ATSR */
4278 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4283 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4284 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4285 if (atsr->segment != pci_domain_nr(dev->bus))
4288 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4289 if (tmp == &bridge->dev)
4292 if (atsru->include_all)
/*
 * PCI bus notifier for DMAR scope bookkeeping: on device add, insert the
 * device into every matching RMRR/ATSR device scope; on remove, drop it.
 * Runs for hot-added/removed PCI devices after boot.
 */
4302 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4305 struct dmar_rmrr_unit *rmrru;
4306 struct dmar_atsr_unit *atsru;
4307 struct acpi_dmar_atsr *atsr;
4308 struct acpi_dmar_reserved_memory *rmrr;
4310 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4313 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4314 rmrr = container_of(rmrru->hdr,
4315 struct acpi_dmar_reserved_memory, header);
4316 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4317 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4318 ((void *)rmrr) + rmrr->header.length,
4319 rmrr->segment, rmrru->devices,
4320 rmrru->devices_cnt);
4323 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4324 dmar_remove_dev_scope(info, rmrr->segment,
4325 rmrru->devices, rmrru->devices_cnt);
4329 list_for_each_entry(atsru, &dmar_atsr_units, list) {
/* include_all ATSRs carry no explicit scope to maintain. */
4330 if (atsru->include_all)
4333 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4334 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4335 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4336 (void *)atsr + atsr->header.length,
4337 atsr->segment, atsru->devices,
4338 atsru->devices_cnt);
4343 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4344 if (dmar_remove_dev_scope(info, atsr->segment,
4345 atsru->devices, atsru->devices_cnt))
/*
 * Memory hotplug notifier for the static identity (si) domain: map newly
 * onlined memory 1:1, and on offline/cancel, unmap the range, flush the
 * IOTLB on every active IOMMU and release the IOVAs.
 */
4353 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4354 unsigned long val, void *v)
4356 struct memory_notify *mhp = v;
4357 unsigned long long start, end;
4358 unsigned long start_vpfn, last_vpfn;
4361 case MEM_GOING_ONLINE:
4362 start = mhp->start_pfn << PAGE_SHIFT;
4363 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4364 if (iommu_domain_identity_map(si_domain, start, end)) {
4365 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4372 case MEM_CANCEL_ONLINE:
4373 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4374 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4375 while (start_vpfn <= last_vpfn) {
4377 struct dmar_drhd_unit *drhd;
4378 struct intel_iommu *iommu;
4379 struct page *freelist;
4381 iova = find_iova(&si_domain->iovad, start_vpfn);
4383 pr_debug("Failed get IOVA for PFN %lx\n",
/* Trim the found IOVA down to exactly the offlined PFN range. */
4388 iova = split_and_remove_iova(&si_domain->iovad, iova,
4389 start_vpfn, last_vpfn);
4391 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4392 start_vpfn, last_vpfn);
4396 freelist = domain_unmap(si_domain, iova->pfn_lo,
4400 for_each_active_iommu(iommu, drhd)
4401 iommu_flush_iotlb_psi(iommu, si_domain,
4402 iova->pfn_lo, iova_size(iova),
4405 dma_free_pagelist(freelist);
4407 start_vpfn = iova->pfn_hi + 1;
4408 free_iova_mem(iova);
/* Notifier block registered in intel_iommu_init() for memory hotplug. */
4416 static struct notifier_block intel_iommu_memory_nb = {
4417 .notifier_call = intel_iommu_memory_notifier,
/*
 * Drop @cpu's per-CPU IOVA caches for every domain on every IOMMU —
 * called when the CPU goes away so its cached IOVAs are not stranded.
 */
4421 static void free_all_cpu_cached_iovas(unsigned int cpu)
4425 for (i = 0; i < g_num_of_iommus; i++) {
4426 struct intel_iommu *iommu = g_iommus[i];
4427 struct dmar_domain *domain;
4433 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4434 domain = get_iommu_domain(iommu, (u16)did);
4438 free_cpu_cached_iovas(cpu, &domain->iovad);
/* CPU hotplug (CPUHP_IOMMU_INTEL_DEAD) callback: reclaim the CPU's IOVA caches. */
4443 static int intel_iommu_cpu_dead(unsigned int cpu)
4445 free_all_cpu_cached_iovas(cpu);
/* Turn off DMA translation on every DMAR unit (e.g. when booting disabled). */
4449 static void intel_disable_iommus(void)
4451 struct intel_iommu *iommu = NULL;
4452 struct dmar_drhd_unit *drhd;
4454 for_each_iommu(iommu, drhd)
4455 iommu_disable_translation(iommu);
/* Map a sysfs struct device back to its containing intel_iommu. */
4458 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4460 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4462 return container_of(iommu_dev, struct intel_iommu, iommu);
/* sysfs "version": major:minor from the hardware DMAR_VER_REG. */
4465 static ssize_t intel_iommu_show_version(struct device *dev,
4466 struct device_attribute *attr,
4469 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4470 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4471 return sprintf(buf, "%d:%d\n",
4472 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4474 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
/* sysfs "address": physical base address of the unit's register block. */
4476 static ssize_t intel_iommu_show_address(struct device *dev,
4477 struct device_attribute *attr,
4480 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4481 return sprintf(buf, "%llx\n", iommu->reg_phys);
4483 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
/* sysfs "cap": raw capability register value. */
4485 static ssize_t intel_iommu_show_cap(struct device *dev,
4486 struct device_attribute *attr,
4489 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4490 return sprintf(buf, "%llx\n", iommu->cap);
4492 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
/* sysfs "ecap": raw extended-capability register value. */
4494 static ssize_t intel_iommu_show_ecap(struct device *dev,
4495 struct device_attribute *attr,
4498 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4499 return sprintf(buf, "%llx\n", iommu->ecap);
4501 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
/* sysfs "domains_supported": number of domain IDs the hardware supports. */
4503 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4504 struct device_attribute *attr,
4507 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4508 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4510 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
/* sysfs "domains_used": count of domain IDs currently allocated (bitmap weight). */
4512 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4513 struct device_attribute *attr,
4516 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4517 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4518 cap_ndoms(iommu->cap)));
4520 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
/* Attribute list for the per-IOMMU "intel-iommu" sysfs group.
 * NOTE(review): dev_attr_cap appears to be missing from this extract. */
4522 static struct attribute *intel_iommu_attrs[] = {
4523 &dev_attr_version.attr,
4524 &dev_attr_address.attr,
4526 &dev_attr_ecap.attr,
4527 &dev_attr_domains_supported.attr,
4528 &dev_attr_domains_used.attr,
/* sysfs attribute group exposed under each IOMMU device as "intel-iommu". */
4532 static struct attribute_group intel_iommu_group = {
4533 .name = "intel-iommu",
4534 .attrs = intel_iommu_attrs,
/* NULL-terminated group array passed to iommu_device_sysfs_add(). */
4537 const struct attribute_group *intel_iommu_groups[] = {
/*
 * Honor the DMAR platform opt-in (e.g. for Thunderbolt DMA protection):
 * if the firmware opted in and an untrusted PCI device is present, force
 * the IOMMU on even when it was disabled on the command line, applying
 * identity mapping for everything except untrusted devices.
 */
4542 static int __init platform_optin_force_iommu(void)
4544 struct pci_dev *pdev = NULL;
4545 bool has_untrusted_dev = false;
4547 if (!dmar_platform_optin() || no_platform_optin)
4550 for_each_pci_dev(pdev) {
4551 if (pdev->untrusted) {
4552 has_untrusted_dev = true;
4557 if (!has_untrusted_dev)
4560 if (no_iommu || dmar_disabled)
4561 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4564 * If Intel-IOMMU is disabled by default, we will apply identity
4565 * map for all devices except those marked as being untrusted.
4568 iommu_identity_mapping |= IDENTMAP_ALL;
4571 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
/*
 * Probe ACPI-namespace devices listed in the DRHD device scopes: for each
 * ACPI device's physical nodes not yet in an IOMMU group, install
 * intel_iommu_ops on their bus and probe them.
 */
4579 static int __init probe_acpi_namespace_devices(void)
4581 struct dmar_drhd_unit *drhd;
4582 /* To avoid a -Wunused-but-set-variable warning. */
4583 struct intel_iommu *iommu __maybe_unused;
4587 for_each_active_iommu(iommu, drhd) {
4588 for_each_active_dev_scope(drhd->devices,
4589 drhd->devices_cnt, i, dev) {
4590 struct acpi_device_physical_node *pn;
4591 struct iommu_group *group;
4592 struct acpi_device *adev;
4594 if (dev->bus != &acpi_bus_type)
4597 adev = to_acpi_device(dev);
4598 mutex_lock(&adev->physical_node_lock);
4599 list_for_each_entry(pn,
4600 &adev->physical_node_list, node) {
/* Already grouped: drop the reference and move on. */
4601 group = iommu_group_get(pn->dev);
4603 iommu_group_put(group);
4607 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4608 ret = iommu_probe_device(pn->dev);
4612 mutex_unlock(&adev->physical_node_lock);
/*
 * Main boot-time entry point for the Intel IOMMU driver: parse the DMAR
 * table and device scopes, initialize DMA remapping (init_dmars), hook up
 * sysfs/PM/hotplug infrastructure, register intel_iommu_ops on the PCI
 * bus, and finally enable translation on every unit.  Under tboot with
 * force_on, any failure is fatal (panic).
 */
4622 int __init intel_iommu_init(void)
4625 struct dmar_drhd_unit *drhd;
4626 struct intel_iommu *iommu;
4629 * Intel IOMMU is required for a TXT/tboot launch or platform
4630 * opt in, so enforce that.
4632 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4634 if (iommu_init_mempool()) {
4636 panic("tboot: Failed to initialize iommu memory\n");
4640 down_write(&dmar_global_lock);
4641 if (dmar_table_init()) {
4643 panic("tboot: Failed to initialize DMAR table\n");
4647 if (dmar_dev_scope_init() < 0) {
4649 panic("tboot: Failed to initialize DMAR device scope\n");
4653 up_write(&dmar_global_lock);
4656 * The bus notifier takes the dmar_global_lock, so lockdep will
4657 * complain later when we register it under the lock.
4659 dmar_register_bus_notifier();
4661 down_write(&dmar_global_lock);
4663 if (no_iommu || dmar_disabled) {
4665 * We exit the function here to ensure IOMMU's remapping and
4666 * mempool aren't setup, which means that the IOMMU's PMRs
4667 * won't be disabled via the call to init_dmars(). So disable
4668 * it explicitly here. The PMRs were setup by tboot prior to
4669 * calling SENTER, but the kernel is expected to reset/tear
4672 if (intel_iommu_tboot_noforce) {
4673 for_each_iommu(iommu, drhd)
4674 iommu_disable_protect_mem_regions(iommu);
4678 * Make sure the IOMMUs are switched off, even when we
4679 * boot into a kexec kernel and the previous kernel left
4682 intel_disable_iommus();
4686 if (list_empty(&dmar_rmrr_units))
4687 pr_info("No RMRR found\n");
4689 if (list_empty(&dmar_atsr_units))
4690 pr_info("No ATSR found\n");
4692 if (dmar_init_reserved_ranges()) {
4694 panic("tboot: Failed to reserve iommu ranges\n");
4695 goto out_free_reserved_range;
4699 intel_iommu_gfx_mapped = 1;
4701 init_no_remapping_devices();
/* init_dmars() (call elided in this extract) does the heavy lifting. */
4706 panic("tboot: Failed to initialize DMARs\n");
4707 pr_err("Initialization failed\n");
4708 goto out_free_reserved_range;
4710 up_write(&dmar_global_lock);
4712 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4715 dma_ops = &intel_dma_ops;
4717 init_iommu_pm_ops();
/* Expose each unit in sysfs and to the generic IOMMU core. */
4719 for_each_active_iommu(iommu, drhd) {
4720 iommu_device_sysfs_add(&iommu->iommu, NULL,
4723 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4724 iommu_device_register(&iommu->iommu);
4727 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4728 if (si_domain && !hw_pass_through)
4729 register_memory_notifier(&intel_iommu_memory_nb);
4730 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4731 intel_iommu_cpu_dead);
4733 down_read(&dmar_global_lock);
4734 if (probe_acpi_namespace_devices())
4735 pr_warn("ACPI name space devices didn't probe correctly\n");
4736 up_read(&dmar_global_lock);
4738 /* Finally, we enable the DMA remapping hardware. */
4739 for_each_iommu(iommu, drhd) {
4740 if (!drhd->ignored && !translation_pre_enabled(iommu))
4741 iommu_enable_translation(iommu);
4743 iommu_disable_protect_mem_regions(iommu);
4745 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4747 intel_iommu_enabled = 1;
4748 intel_iommu_debugfs_init();
4752 out_free_reserved_range:
4753 put_iova_domain(&reserved_iova_list);
4755 intel_iommu_free_dmars();
4756 up_write(&dmar_global_lock);
4757 iommu_exit_mempool();
/*
 * Detach a device from its domain and release its bookkeeping.  Caller
 * must hold device_domain_lock (asserted below).  Tears down the PASID
 * entry (scalable mode), context entry, device-IOTLB, unlinks the info,
 * detaches the domain from the IOMMU, and frees a private domain if this
 * was its last user.
 */
4761 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4763 struct dmar_domain *domain;
4764 struct intel_iommu *iommu;
4765 unsigned long flags;
4767 assert_spin_locked(&device_domain_lock);
4772 iommu = info->iommu;
4773 domain = info->domain;
4776 if (dev_is_pci(info->dev) && sm_supported(iommu))
4777 intel_pasid_tear_down_entry(iommu, info->dev,
4780 iommu_disable_dev_iotlb(info);
4781 domain_context_clear_one(iommu, info->bus, info->devfn);
4782 intel_pasid_free_table(info->dev);
4785 unlink_domain_info(info);
4787 spin_lock_irqsave(&iommu->lock, flags);
4788 domain_detach_iommu(domain, iommu);
4789 spin_unlock_irqrestore(&iommu->lock, flags);
4791 /* free the private domain */
4792 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4793 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4794 domain_exit(info->domain);
4796 free_devinfo_mem(info);
/* Locked wrapper: look up @dev's domain info and detach it. */
4799 static void dmar_remove_one_dev_info(struct device *dev)
4801 struct device_domain_info *info;
4802 unsigned long flags;
4804 spin_lock_irqsave(&device_domain_lock, flags);
4805 info = dev->archdata.iommu;
/* NOTE(review): a NULL/DEFER check before dereferencing info is expected
 * here but not visible in this extract. */
4806 __dmar_remove_one_dev_info(info);
4807 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Initialize a freshly allocated dmar_domain for the IOMMU-API path:
 * set up its IOVA allocator and reserved ranges, derive the AGAW from
 * @guest_width, and allocate the top-level page directory.
 */
4810 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4814 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4815 domain_reserve_special_ranges(domain);
4817 /* calculate AGAW */
4818 domain->gaw = guest_width;
4819 adjust_width = guestwidth_to_adjustwidth(guest_width);
4820 domain->agaw = width_to_agaw(adjust_width);
/* Capability bits are recomputed when IOMMUs attach. */
4822 domain->iommu_coherency = 0;
4823 domain->iommu_snooping = 0;
4824 domain->iommu_superpage = 0;
4825 domain->max_addr = 0;
4827 /* always allocate the top pgd */
4828 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4831 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * IOMMU-API domain allocation.  DMA and UNMANAGED domains get a fresh
 * dmar_domain (DMA domains additionally get an IOVA flush queue, falling
 * back to strict invalidation if that fails); IDENTITY returns the shared
 * static-identity domain.  Other types presumably return NULL (default
 * case elided in this extract).
 */
4835 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4837 struct dmar_domain *dmar_domain;
4838 struct iommu_domain *domain;
4841 case IOMMU_DOMAIN_DMA:
4843 case IOMMU_DOMAIN_UNMANAGED:
4844 dmar_domain = alloc_domain(0);
4846 pr_err("Can't allocate dmar_domain\n");
4849 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4850 pr_err("Domain initialization failed\n");
4851 domain_exit(dmar_domain);
4855 if (type == IOMMU_DOMAIN_DMA &&
4856 init_iova_flush_queue(&dmar_domain->iovad,
4857 iommu_flush_iova, iova_entry_free)) {
4858 pr_warn("iova flush queue initialization failed\n");
4859 intel_iommu_strict = 1;
4862 domain_update_iommu_cap(dmar_domain);
4864 domain = &dmar_domain->domain;
4865 domain->geometry.aperture_start = 0;
4866 domain->geometry.aperture_end =
4867 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4868 domain->geometry.force_aperture = true;
4871 case IOMMU_DOMAIN_IDENTITY:
4872 return &si_domain->domain;
/* Free an IOMMU-API domain; the shared si_domain is never freed. */
4880 static void intel_iommu_domain_free(struct iommu_domain *domain)
4882 if (domain != &si_domain->domain)
4883 domain_exit(to_dmar_domain(domain));
4887 * Check whether a @domain could be attached to the @dev through the
4888 * aux-domain attach/detach APIs.
4891 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4893 struct device_domain_info *info = dev->archdata.iommu;
/* Requires the device to have aux-domain (scalable mode) enabled first. */
4895 return info && info->auxd_enabled &&
4896 domain->type == IOMMU_DOMAIN_UNMANAGED;
/* Link @domain onto @dev's auxiliary-domain list; caller holds device_domain_lock. */
4899 static void auxiliary_link_device(struct dmar_domain *domain,
4902 struct device_domain_info *info = dev->archdata.iommu;
4904 assert_spin_locked(&device_domain_lock);
4908 domain->auxd_refcnt++;
4909 list_add(&domain->auxd, &info->auxiliary_domains);
/*
 * Unlink @domain from @dev's auxiliary-domain list; when the last user
 * goes away, release the domain's default PASID.  Caller holds
 * device_domain_lock.
 */
4912 static void auxiliary_unlink_device(struct dmar_domain *domain,
4915 struct device_domain_info *info = dev->archdata.iommu;
4917 assert_spin_locked(&device_domain_lock);
4921 list_del(&domain->auxd);
4922 domain->auxd_refcnt--;
4924 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4925 intel_pasid_free_id(domain->default_pasid);
/*
 * Attach @domain to @dev as an auxiliary domain: allocate the domain's
 * default PASID on first use, attach the domain to the device's IOMMU,
 * install a second-level PASID entry, and link the domain to the device.
 * Unwinds the IOMMU attach (and the PASID if unused) on failure.
 */
4928 static int aux_domain_add_dev(struct dmar_domain *domain,
4933 unsigned long flags;
4934 struct intel_iommu *iommu;
4936 iommu = device_to_iommu(dev, &bus, &devfn);
4940 if (domain->default_pasid <= 0) {
4943 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4944 pci_max_pasids(to_pci_dev(dev)),
4947 pr_err("Can't allocate default pasid\n");
4950 domain->default_pasid = pasid;
4953 spin_lock_irqsave(&device_domain_lock, flags);
4955 * iommu->lock must be held to attach domain to iommu and setup the
4956 * pasid entry for second level translation.
4958 spin_lock(&iommu->lock);
4959 ret = domain_attach_iommu(domain, iommu);
4963 /* Setup the PASID entry for mediated devices: */
4964 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4965 domain->default_pasid);
4968 spin_unlock(&iommu->lock);
4970 auxiliary_link_device(domain, dev);
4972 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Error unwind: detach from the IOMMU and drop an unused default PASID. */
4977 domain_detach_iommu(domain, iommu);
4979 spin_unlock(&iommu->lock);
4980 spin_unlock_irqrestore(&device_domain_lock, flags);
4981 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4982 intel_pasid_free_id(domain->default_pasid);
/*
 * Detach an auxiliary @domain from @dev: unlink it, tear down the PASID
 * entry and detach the domain from the IOMMU.  No-op if @domain is not an
 * aux domain for @dev.
 */
4987 static void aux_domain_remove_dev(struct dmar_domain *domain,
4990 struct device_domain_info *info;
4991 struct intel_iommu *iommu;
4992 unsigned long flags;
4994 if (!is_aux_domain(dev, &domain->domain))
4997 spin_lock_irqsave(&device_domain_lock, flags);
4998 info = dev->archdata.iommu;
4999 iommu = info->iommu;
5001 auxiliary_unlink_device(domain, dev);
5003 spin_lock(&iommu->lock);
5004 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5005 domain_detach_iommu(domain, iommu);
5006 spin_unlock(&iommu->lock);
5008 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Validate and adjust @domain before attaching @dev: fail if the IOMMU's
 * address width cannot cover the domain's current max mapped address,
 * clamp the domain's GAW to the IOMMU's, and strip now-unneeded upper
 * page-table levels so the AGAW fits the IOMMU.
 */
5011 static int prepare_domain_attach_device(struct iommu_domain *domain,
5014 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5015 struct intel_iommu *iommu;
5019 iommu = device_to_iommu(dev, &bus, &devfn);
5023 /* check if this iommu agaw is sufficient for max mapped address */
5024 addr_width = agaw_to_width(iommu->agaw);
5025 if (addr_width > cap_mgaw(iommu->cap))
5026 addr_width = cap_mgaw(iommu->cap);
5028 if (dmar_domain->max_addr > (1LL << addr_width)) {
5029 dev_err(dev, "%s: iommu width (%d) is not "
5030 "sufficient for the mapped address (%llx)\n",
5031 __func__, addr_width, dmar_domain->max_addr);
5034 dmar_domain->gaw = addr_width;
5037 * Knock out extra levels of page tables if necessary
5039 while (iommu->agaw < dmar_domain->agaw) {
5040 struct dma_pte *pte;
5042 pte = dmar_domain->pgd;
5043 if (dma_pte_present(pte)) {
5044 dmar_domain->pgd = (struct dma_pte *)
5045 phys_to_virt(dma_pte_addr(pte));
5046 free_pgtable_page(pte);
5048 dmar_domain->agaw--;
/*
 * IOMMU-API attach_dev: refuse unmanaged domains for RMRR-locked devices,
 * reject aux domains (they use the aux path), detach any existing mapping
 * first, then validate the domain and add the device to it.
 */
5054 static int intel_iommu_attach_device(struct iommu_domain *domain,
5059 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5060 device_is_rmrr_locked(dev)) {
5061 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5065 if (is_aux_domain(dev, domain))
5068 /* normally dev is not mapped */
5069 if (unlikely(domain_context_mapped(dev))) {
5070 struct dmar_domain *old_domain;
5072 old_domain = find_domain(dev);
5074 dmar_remove_one_dev_info(dev);
5077 ret = prepare_domain_attach_device(domain, dev);
5081 return domain_add_dev_info(to_dmar_domain(domain), dev);
/* IOMMU-API aux_attach_dev: validate the domain, then attach via PASID. */
5084 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5089 if (!is_aux_domain(dev, domain))
5092 ret = prepare_domain_attach_device(domain, dev);
5096 return aux_domain_add_dev(to_dmar_domain(domain), dev);
/* IOMMU-API detach_dev: drop the device's domain attachment. */
5099 static void intel_iommu_detach_device(struct iommu_domain *domain,
5102 dmar_remove_one_dev_info(dev);
/* IOMMU-API aux_detach_dev: drop the auxiliary (PASID-based) attachment. */
5105 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5108 aux_domain_remove_dev(to_dmar_domain(domain), dev);
/*
 * IOMMU-API map: translate IOMMU_* prot flags to DMA_PTE_* bits, verify
 * the mapping fits the domain's address width (tracking max_addr), and
 * install the PTEs page by page.
 */
5111 static int intel_iommu_map(struct iommu_domain *domain,
5112 unsigned long iova, phys_addr_t hpa,
5113 size_t size, int iommu_prot)
5115 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
/* Domains that lost their children (private-domain fallback) refuse maps. */
5120 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5123 if (iommu_prot & IOMMU_READ)
5124 prot |= DMA_PTE_READ;
5125 if (iommu_prot & IOMMU_WRITE)
5126 prot |= DMA_PTE_WRITE;
5127 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5128 prot |= DMA_PTE_SNP;
5130 max_addr = iova + size;
5131 if (dmar_domain->max_addr < max_addr) {
5134 /* check if minimum agaw is sufficient for mapped address */
5135 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5136 if (end < max_addr) {
5137 pr_err("%s: iommu width (%d) is not "
5138 "sufficient for the mapped address (%llx)\n",
5139 __func__, dmar_domain->gaw, max_addr);
5142 dmar_domain->max_addr = max_addr;
5144 /* Round up size to next multiple of PAGE_SIZE, if it and
5145 the low bits of hpa would take us onto the next page */
5146 size = aligned_nrpages(hpa, size);
5147 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5148 hpa >> VTD_PAGE_SHIFT, size, prot);
/*
 * IOMMU-API unmap: round @size up to the containing large-page mapping if
 * needed (API requirement), clear the PTEs, flush the IOTLB on every
 * IOMMU serving the domain, free the page-table pages, and shrink
 * max_addr when the unmapped range was at the top.
 */
5152 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5153 unsigned long iova, size_t size,
5154 struct iommu_iotlb_gather *gather)
5156 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5157 struct page *freelist = NULL;
5158 unsigned long start_pfn, last_pfn;
5159 unsigned int npages;
5160 int iommu_id, level = 0;
5162 /* Cope with horrid API which requires us to unmap more than the
5163 size argument if it happens to be a large-page mapping. */
5164 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5165 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5168 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5169 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5171 start_pfn = iova >> VTD_PAGE_SHIFT;
5172 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5174 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5176 npages = last_pfn - start_pfn + 1;
5178 for_each_domain_iommu(iommu_id, dmar_domain)
5179 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5180 start_pfn, npages, !freelist, 0);
5182 dma_free_pagelist(freelist);
5184 if (dmar_domain->max_addr == iova + size)
5185 dmar_domain->max_addr = iova;
/*
 * IOMMU-API iova_to_phys: walk the page table for @iova and return the
 * backing physical address (presumably 0 when unmapped — the final
 * return is elided in this extract).
 */
5190 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5193 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5194 struct dma_pte *pte;
5198 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5201 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5203 phys = dma_pte_addr(pte);
/* True only if every active IOMMU supports scalable mode. */
5208 static inline bool scalable_mode_support(void)
5210 struct dmar_drhd_unit *drhd;
5211 struct intel_iommu *iommu;
5215 for_each_active_iommu(iommu, drhd) {
5216 if (!sm_supported(iommu)) {
/* True only if every active IOMMU supports PASID. */
5226 static inline bool iommu_pasid_support(void)
5228 struct dmar_drhd_unit *drhd;
5229 struct intel_iommu *iommu;
5233 for_each_active_iommu(iommu, drhd) {
5234 if (!pasid_supported(iommu)) {
/* IOMMU-API capable: report cache coherency and interrupt remapping support. */
5244 static bool intel_iommu_capable(enum iommu_cap cap)
5246 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5247 return domain_update_iommu_snooping(NULL) == 1;
5248 if (cap == IOMMU_CAP_INTR_REMAP)
5249 return irq_remapping_enabled == 1;
/*
 * IOMMU-API add_device: link the device to its IOMMU in sysfs, defer
 * domain assignment if translation was pre-enabled by firmware, put the
 * device in a group, then reconcile the group's default domain type with
 * the per-device preference — falling back to a "private" identity or
 * DMA domain (marking the group domain LOSE_CHILDREN) when the group
 * default cannot be changed.
 */
5254 static int intel_iommu_add_device(struct device *dev)
5256 struct dmar_domain *dmar_domain;
5257 struct iommu_domain *domain;
5258 struct intel_iommu *iommu;
5259 struct iommu_group *group;
5263 iommu = device_to_iommu(dev, &bus, &devfn);
5267 iommu_device_link(&iommu->iommu, dev);
5269 if (translation_pre_enabled(iommu))
5270 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5272 group = iommu_group_get_for_dev(dev);
5275 return PTR_ERR(group);
5277 iommu_group_put(group);
5279 domain = iommu_get_domain_for_dev(dev);
5280 dmar_domain = to_dmar_domain(domain);
5281 if (domain->type == IOMMU_DOMAIN_DMA) {
5282 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5283 ret = iommu_request_dm_for_dev(dev);
/* Request failed: give this device a private identity mapping instead. */
5285 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5286 domain_add_dev_info(si_domain, dev);
5288 "Device uses a private identity domain.\n");
5292 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5293 ret = iommu_request_dma_domain_for_dev(dev);
/* Request failed: fall back to a private DMA domain for this device. */
5295 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5296 if (!get_private_domain_for_dev(dev)) {
5298 "Failed to get a private domain.\n");
5303 "Device uses a private dma domain.\n");
/* IOMMU-API remove_device: drop group membership and the sysfs link. */
5311 static void intel_iommu_remove_device(struct device *dev)
5313 struct intel_iommu *iommu;
5316 iommu = device_to_iommu(dev, &bus, &devfn);
5320 iommu_group_remove_device(dev);
5322 iommu_device_unlink(&iommu->iommu, dev);
/*
 * IOMMU-API get_resv_regions: report RMRR ranges that apply to @device
 * (direct or relaxable-direct), the legacy ISA range as a floppy
 * workaround when configured, and the IOAPIC MSI window.
 */
5325 static void intel_iommu_get_resv_regions(struct device *device,
5326 struct list_head *head)
5328 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5329 struct iommu_resv_region *reg;
5330 struct dmar_rmrr_unit *rmrr;
5331 struct device *i_dev;
5334 down_read(&dmar_global_lock);
5335 for_each_rmrr_units(rmrr) {
5336 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5338 struct iommu_resv_region *resv;
5339 enum iommu_resv_type type;
/* RMRRs also apply to devices downstream of a scoped bridge. */
5342 if (i_dev != device &&
5343 !is_downstream_to_pci_bridge(device, i_dev))
5346 length = rmrr->end_address - rmrr->base_address + 1;
5348 type = device_rmrr_is_relaxable(device) ?
5349 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5351 resv = iommu_alloc_resv_region(rmrr->base_address,
5352 length, prot, type);
5356 list_add_tail(&resv->list, head);
5359 up_read(&dmar_global_lock);
5361 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5362 if (dev_is_pci(device)) {
5363 struct pci_dev *pdev = to_pci_dev(device);
5365 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5366 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5369 list_add_tail(&reg->list, head);
5372 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5374 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5375 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5379 list_add_tail(&reg->list, head);
/* IOMMU-API put_resv_regions: free every region returned by get_resv_regions. */
5382 static void intel_iommu_put_resv_regions(struct device *dev,
5383 struct list_head *head)
5385 struct iommu_resv_region *entry, *next;
5387 list_for_each_entry_safe(entry, next, head, list)
/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing that context entry if changed) and turn on PASID in
 * the device itself via the device-IOTLB path.  Both device_domain_lock
 * and iommu->lock are held across the context manipulation.
 */
5391 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5393 struct device_domain_info *info;
5394 struct context_entry *context;
5395 struct dmar_domain *domain;
5396 unsigned long flags;
5400 domain = find_domain(dev);
5404 spin_lock_irqsave(&device_domain_lock, flags);
5405 spin_lock(&iommu->lock);
5408 info = dev->archdata.iommu;
5409 if (!info || !info->pasid_supported)
5412 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5413 if (WARN_ON(!context))
5416 ctx_lo = context[0].lo;
5418 if (!(ctx_lo & CONTEXT_PASIDE)) {
5419 ctx_lo |= CONTEXT_PASIDE;
5420 context[0].lo = ctx_lo;
/* Invalidate the cached context entry for this device only. */
5422 iommu->flush.flush_context(iommu,
5423 domain->iommu_did[iommu->seq_id],
5424 PCI_DEVID(info->bus, info->devfn),
5425 DMA_CCMD_MASK_NOBIT,
5426 DMA_CCMD_DEVICE_INVL);
5429 /* Enable PASID support in the device, if it wasn't already */
5430 if (!info->pasid_enabled)
5431 iommu_enable_dev_iotlb(info);
5436 spin_unlock(&iommu->lock);
5437 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Reserve a region's IOVA range in the domain's allocator so DMA never lands there. */
5442 static void intel_iommu_apply_resv_region(struct device *dev,
5443 struct iommu_domain *domain,
5444 struct iommu_resv_region *region)
5446 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5447 unsigned long start, end;
5449 start = IOVA_PFN(region->start);
5450 end = IOVA_PFN(region->start + region->length - 1);
5452 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5455 #ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * SVM helper: resolve the IOMMU handling @dev, logging (and presumably
 * returning NULL — elided) when the device has no usable translation.
 */
5456 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5458 struct intel_iommu *iommu;
5461 if (iommu_dummy(dev)) {
5463 "No IOMMU translation for device; cannot enable SVM\n");
5467 iommu = device_to_iommu(dev, &bus, &devfn);
5469 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5475 #endif /* CONFIG_INTEL_IOMMU_SVM */
/*
 * Enable auxiliary-domain (mdev/PASID) usage for @dev: requires an
 * enabled IOMMU with scalable mode and PASID support; enables PASID on
 * the device, then marks auxd_enabled under device_domain_lock.
 */
5477 static int intel_iommu_enable_auxd(struct device *dev)
5479 struct device_domain_info *info;
5480 struct intel_iommu *iommu;
5481 unsigned long flags;
5485 iommu = device_to_iommu(dev, &bus, &devfn);
5486 if (!iommu || dmar_disabled)
5489 if (!sm_supported(iommu) || !pasid_supported(iommu))
5492 ret = intel_iommu_enable_pasid(iommu, dev);
5496 spin_lock_irqsave(&device_domain_lock, flags);
5497 info = dev->archdata.iommu;
5498 info->auxd_enabled = 1;
5499 spin_unlock_irqrestore(&device_domain_lock, flags);
/* Clear the device's auxd_enabled flag under device_domain_lock. */
5504 static int intel_iommu_disable_auxd(struct device *dev)
5506 struct device_domain_info *info;
5507 unsigned long flags;
5509 spin_lock_irqsave(&device_domain_lock, flags);
5510 info = dev->archdata.iommu;
5511 if (!WARN_ON(!info))
5512 info->auxd_enabled = 0;
5513 spin_unlock_irqrestore(&device_domain_lock, flags);
5519 * A PCI express designated vendor specific extended capability is defined
5520 * in the section 3.7 of Intel scalable I/O virtualization technical spec
5521 * for system software and tools to detect endpoint devices supporting the
5522 * Intel scalable IO virtualization without host driver dependency.
5524 * Returns the address of the matching extended capability structure within
5525 * the device's PCI configuration space or 0 if the device does not support
5528 static int siov_find_pci_dvsec(struct pci_dev *pdev)
/* Walk DVSEC (ext cap 0x23) entries looking for Intel's SIOV id (5). */
5533 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5535 pci_read_config_word(pdev, pos + 4, &vendor);
5536 pci_read_config_word(pdev, pos + 8, &id);
5537 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5540 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5547 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5549 if (feat == IOMMU_DEV_FEAT_AUX) {
5552 if (!dev_is_pci(dev) || dmar_disabled ||
5553 !scalable_mode_support() || !iommu_pasid_support())
5556 ret = pci_pasid_features(to_pci_dev(dev));
5560 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5567 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5569 if (feat == IOMMU_DEV_FEAT_AUX)
5570 return intel_iommu_enable_auxd(dev);
5576 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5578 if (feat == IOMMU_DEV_FEAT_AUX)
5579 return intel_iommu_disable_auxd(dev);
5585 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5587 struct device_domain_info *info = dev->archdata.iommu;
5589 if (feat == IOMMU_DEV_FEAT_AUX)
5590 return scalable_mode_support() && info && info->auxd_enabled;
5596 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5598 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5600 return dmar_domain->default_pasid > 0 ?
5601 dmar_domain->default_pasid : -EINVAL;
5604 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5607 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5610 const struct iommu_ops intel_iommu_ops = {
5611 .capable = intel_iommu_capable,
5612 .domain_alloc = intel_iommu_domain_alloc,
5613 .domain_free = intel_iommu_domain_free,
5614 .attach_dev = intel_iommu_attach_device,
5615 .detach_dev = intel_iommu_detach_device,
5616 .aux_attach_dev = intel_iommu_aux_attach_device,
5617 .aux_detach_dev = intel_iommu_aux_detach_device,
5618 .aux_get_pasid = intel_iommu_aux_get_pasid,
5619 .map = intel_iommu_map,
5620 .unmap = intel_iommu_unmap,
5621 .iova_to_phys = intel_iommu_iova_to_phys,
5622 .add_device = intel_iommu_add_device,
5623 .remove_device = intel_iommu_remove_device,
5624 .get_resv_regions = intel_iommu_get_resv_regions,
5625 .put_resv_regions = intel_iommu_put_resv_regions,
5626 .apply_resv_region = intel_iommu_apply_resv_region,
5627 .device_group = pci_device_group,
5628 .dev_has_feat = intel_iommu_dev_has_feat,
5629 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5630 .dev_enable_feat = intel_iommu_dev_enable_feat,
5631 .dev_disable_feat = intel_iommu_dev_disable_feat,
5632 .is_attach_deferred = intel_iommu_is_attach_deferred,
5633 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
5636 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5638 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5639 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5643 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5644 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5645 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5646 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5651 static void quirk_iommu_rwbf(struct pci_dev *dev)
5654 * Mobile 4 Series Chipset neglects to set RWBF capability,
5655 * but needs it. Same seems to hold for the desktop versions.
5657 pci_info(dev, "Forcing write-buffer flush capability\n");
5661 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5662 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5663 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5664 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
/*
 * GGC is the Graphics Memory Control register in the IGD's PCI config
 * space; bits 11:8 encode the shadow GTT allocation and VT enablement
 * consumed by quirk_calpella_no_shadow_gtt() below.
 */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5679 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5683 if (pci_read_config_word(dev, GGC, &ggc))
5686 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5687 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5689 } else if (dmar_map_gfx) {
5690 /* we have to ensure the gfx device is idle before we flush */
5691 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5692 intel_iommu_strict = 1;
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5697 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5698 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5700 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5701 ISOCH DMAR unit for the Azalia sound device, but not give it any
5702 TLB entries, which causes it to deadlock. Check for that. We do
5703 this in a function called from init_dmars(), instead of in a PCI
5704 quirk, because we don't want to print the obnoxious "BIOS broken"
5705 message if VT-d is actually disabled.
5707 static void __init check_tylersburg_isoch(void)
5709 struct pci_dev *pdev;
5710 uint32_t vtisochctrl;
5712 /* If there's no Azalia in the system anyway, forget it. */
5713 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5718 /* System Management Registers. Might be hidden, in which case
5719 we can't do the sanity check. But that's OK, because the
5720 known-broken BIOSes _don't_ actually hide it, so far. */
5721 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5725 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5732 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5733 if (vtisochctrl & 1)
5736 /* Drop all bits other than the number of TLB entries */
5737 vtisochctrl &= 0x1c;
5739 /* If we have the recommended number of TLB entries (16), fine. */
5740 if (vtisochctrl == 0x10)
5743 /* Zero TLB entries? You get to ride the short bus to school. */
5745 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5746 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5747 dmi_get_system_info(DMI_BIOS_VENDOR),
5748 dmi_get_system_info(DMI_BIOS_VERSION),
5749 dmi_get_system_info(DMI_PRODUCT_VERSION));
5750 iommu_identity_mapping |= IDENTMAP_AZALIA;
5754 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",