1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
68 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
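/*
 * Worked example: with a 48-bit guest address width and 4KiB VT-d pages
 * (VTD_PAGE_SHIFT == 12), __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1.
 * DOMAIN_MAX_PFN(48) keeps that value on 64-bit builds and would clamp it
 * to ULONG_MAX on a 32-bit build, so PFN arithmetic can safely stay in
 * 'unsigned long'.
 */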
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87 * This bitmap is used to advertise the page sizes our hardware supports
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is an order of a 4KiB page and that the
94 * mapping has natural alignment.
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are an order of 4KiB.
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
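/*
 * In this bitmap a set bit N means "pages of size 2^N bytes are
 * supported". ~0xFFFUL therefore sets every bit from 12 upwards,
 * advertising all power-of-two sizes from 4KiB up, which matches the
 * legacy behaviour described above.
 */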
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 static inline unsigned long level_mask(int level)
131 return -1UL << level_to_offset_bits(level);
134 static inline unsigned long level_size(int level)
136 return 1UL << level_to_offset_bits(level);
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
141 return (pfn + level_size(level) - 1) & level_mask(level);
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
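/*
 * Example of the 9-bit-per-level arithmetic above: a level 1 entry maps a
 * 4KiB page, a level 2 entry covers 1 << 9 = 512 of those (2MiB), and a
 * level 3 entry covers 512 * 512 pages (1GiB), mirroring x86 paging.
 */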
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
162 return mm_to_dma_pfn(page_to_pfn(pg));
164 static inline unsigned long virt_to_dma_pfn(void *p)
166 return page_to_dma_pfn(virt_to_page(p));
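/*
 * Note: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the mm/dma
 * PFN conversions above are identity operations; they only do real work
 * if MM pages are larger than the 4KiB VT-d page size.
 */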
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
176 * set to 1 to panic the kernel if VT-d can't be enabled successfully
177 * (used when the kernel is launched with TXT)
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
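/*
 * A root entry is 16 bytes (the lo/hi pair used below), so ROOT_ENTRY_NR
 * is 4096 / 16 = 256: one root entry per PCI bus number.
 */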
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
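/*
 * Summary of the legacy context entry layout encoded by the helpers above:
 * lo bit 0 is Present, bit 1 is Fault Processing Disable, bits 2-3 are the
 * Translation Type and bits 12-63 hold the page-table root; hi bits 0-2
 * are the Address Width and bits 8-23 the Domain ID.
 */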
282 static inline void context_clear_entry(struct context_entry *context)
289 * This domain is a static identity mapping domain.
290 * 1. This domain creates a static 1:1 mapping to all usable memory.
291 * 2. It maps to each iommu if successful.
292 * 3. Each iommu maps to this domain if successful.
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301 * This is a DMA domain allocated through the iommu domain allocation
302 * interface. But one or more devices belonging to this domain have
303 * been chosen to use a private domain. We should avoid using the
304 * map/unmap/iova_to_phys APIs on it.
306 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* number of registered intel_iommus; used to size and index g_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343 struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
364 #define IDENTMAP_ALL 1
365 #define IDENTMAP_GFX 2
366 #define IDENTMAP_AZALIA 4
368 int intel_iommu_gfx_mapped;
369 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
372 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
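/*
 * dev->archdata.iommu normally points to a struct device_domain_info.
 * The two sentinels above mark devices that the driver ignores entirely
 * (DUMMY) and devices whose domain lookup is deferred until first use
 * (DEFER); see iommu_dummy() and find_domain() below.
 */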
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
377 * Iterate over elements in device_domain_list and call the specified
378 * callback @fn against each element.
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381 void *data), void *data)
385 struct device_domain_info *info;
387 spin_lock_irqsave(&device_domain_lock, flags);
388 list_for_each_entry(info, &device_domain_list, global) {
389 ret = fn(info, data);
391 spin_unlock_irqrestore(&device_domain_lock, flags);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
400 const struct iommu_ops intel_iommu_ops;
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
404 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
409 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 static void init_translation_status(struct intel_iommu *iommu)
416 gsts = readl(iommu->reg + DMAR_GSTS_REG);
417 if (gsts & DMA_GSTS_TES)
418 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
424 return container_of(dom, struct dmar_domain, domain);
427 static int __init intel_iommu_setup(char *str)
432 if (!strncmp(str, "on", 2)) {
434 pr_info("IOMMU enabled\n");
435 } else if (!strncmp(str, "off", 3)) {
437 no_platform_optin = 1;
438 pr_info("IOMMU disabled\n");
439 } else if (!strncmp(str, "igfx_off", 8)) {
441 pr_info("Disable GFX device mapping\n");
442 } else if (!strncmp(str, "forcedac", 8)) {
443 pr_info("Forcing DAC for PCI devices\n");
445 } else if (!strncmp(str, "strict", 6)) {
446 pr_info("Disable batched IOTLB flush\n");
447 intel_iommu_strict = 1;
448 } else if (!strncmp(str, "sp_off", 6)) {
449 pr_info("Disable supported super page\n");
450 intel_iommu_superpage = 0;
451 } else if (!strncmp(str, "sm_on", 5)) {
452 pr_info("Intel-IOMMU: scalable mode supported\n");
454 } else if (!strncmp(str, "tboot_noforce", 13)) {
456 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 intel_iommu_tboot_noforce = 1;
460 str += strcspn(str, ",");
466 __setup("intel_iommu=", intel_iommu_setup);
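/*
 * Example kernel command line usage (options are comma separated and
 * consumed by the strcspn() loop above):
 *
 *	intel_iommu=on,strict,sp_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * superpage support.
 */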
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
471 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 struct dmar_domain **domains;
476 domains = iommu->domains[idx];
480 return domains[did & 0xff];
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 struct dmar_domain *domain)
486 struct dmar_domain **domains;
489 if (!iommu->domains[idx]) {
490 size_t size = 256 * sizeof(struct dmar_domain *);
491 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494 domains = iommu->domains[idx];
495 if (WARN_ON(!domains))
498 domains[did & 0xff] = domain;
501 void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
512 void free_pgtable_page(void *vaddr)
514 free_page((unsigned long)vaddr);
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void *alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_si(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
545 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
547 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
555 sagaw = cap_sagaw(iommu->cap);
556 for (agaw = width_to_agaw(max_gaw);
558 if (test_bit(agaw, &sagaw))
566 * Calculate max SAGAW for each iommu.
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
570 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
574 * Calculate agaw for each iommu.
575 * "SAGAW" may differ across iommus; use a default agaw, and
576 * fall back to a smaller supported agaw for iommus that don't support the default.
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 /* This function only returns a single iommu in a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
588 /* si_domain and vm domain should not get here. */
589 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592 for_each_domain_iommu(iommu_id, domain)
595 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598 return g_iommus[iommu_id];
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 struct dmar_drhd_unit *drhd;
604 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_domain_iommu(i, domain) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686 struct root_entry *root = &iommu->root_entry[bus];
687 struct context_entry *context;
691 if (sm_supported(iommu)) {
699 context = phys_to_virt(*entry & VTD_PAGE_MASK);
701 unsigned long phy_addr;
705 context = alloc_pgtable_page(iommu->node);
709 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710 phy_addr = virt_to_phys((void *)context);
711 *entry = phy_addr | 1;
712 __iommu_flush_cache(iommu, entry, sizeof(*entry));
714 return &context[devfn];
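/*
 * Layout note: in legacy mode each root entry points to a single 4KiB
 * context table with 256 entries indexed by devfn. In scalable mode the
 * root entry is split into lower and upper halves, each pointing to a
 * separate context table for devfn 0-127 and 128-255 respectively (hence
 * the devfn 0x80 lookup in free_context_table() below).
 */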
717 static int iommu_dummy(struct device *dev)
719 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
723 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
724 * sub-hierarchy of a candidate PCI-PCI bridge
725 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
726 * @bridge: the candidate PCI-PCI bridge
728 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
733 struct pci_dev *pdev, *pbridge;
735 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738 pdev = to_pci_dev(dev);
739 pbridge = to_pci_dev(bridge);
741 if (pbridge->subordinate &&
742 pbridge->subordinate->number <= pdev->bus->number &&
743 pbridge->subordinate->busn_res.end >= pdev->bus->number)
749 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 struct dmar_drhd_unit *drhd = NULL;
752 struct intel_iommu *iommu;
754 struct pci_dev *pdev = NULL;
758 if (iommu_dummy(dev))
761 if (dev_is_pci(dev)) {
762 struct pci_dev *pf_pdev;
764 pdev = to_pci_dev(dev);
767 /* VMD child devices currently cannot be handled individually */
768 if (is_vmd(pdev->bus))
772 /* VFs aren't listed in scope tables; we need to look up
773 * the PF instead to find the IOMMU. */
774 pf_pdev = pci_physfn(pdev);
776 segment = pci_domain_nr(pdev->bus);
777 } else if (has_acpi_companion(dev))
778 dev = &ACPI_COMPANION(dev)->dev;
781 for_each_active_iommu(iommu, drhd) {
782 if (pdev && segment != drhd->segment)
785 for_each_active_dev_scope(drhd->devices,
786 drhd->devices_cnt, i, tmp) {
788 /* For a VF use its original BDF# not that of the PF
789 * which we used for the IOMMU lookup. Strictly speaking
790 * we could do this for all PCI devices; we only need to
791 * get the BDF# from the scope table for ACPI matches. */
792 if (pdev && pdev->is_virtfn)
795 *bus = drhd->devices[i].bus;
796 *devfn = drhd->devices[i].devfn;
800 if (is_downstream_to_pci_bridge(dev, tmp))
804 if (pdev && drhd->include_all) {
806 *bus = pdev->bus->number;
807 *devfn = pdev->devfn;
818 static void domain_flush_cache(struct dmar_domain *domain,
819 void *addr, int size)
821 if (!domain->iommu_coherency)
822 clflush_cache_range(addr, size);
825 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
827 struct context_entry *context;
831 spin_lock_irqsave(&iommu->lock, flags);
832 context = iommu_context_addr(iommu, bus, devfn, 0);
834 ret = context_present(context);
835 spin_unlock_irqrestore(&iommu->lock, flags);
839 static void free_context_table(struct intel_iommu *iommu)
843 struct context_entry *context;
845 spin_lock_irqsave(&iommu->lock, flags);
846 if (!iommu->root_entry) {
849 for (i = 0; i < ROOT_ENTRY_NR; i++) {
850 context = iommu_context_addr(iommu, i, 0, 0);
852 free_pgtable_page(context);
854 if (!sm_supported(iommu))
857 context = iommu_context_addr(iommu, i, 0x80, 0);
859 free_pgtable_page(context);
862 free_pgtable_page(iommu->root_entry);
863 iommu->root_entry = NULL;
865 spin_unlock_irqrestore(&iommu->lock, flags);
868 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
869 unsigned long pfn, int *target_level)
871 struct dma_pte *parent, *pte;
872 int level = agaw_to_level(domain->agaw);
875 BUG_ON(!domain->pgd);
877 if (!domain_pfn_supported(domain, pfn))
878 /* Address beyond IOMMU's addressing capabilities. */
881 parent = domain->pgd;
886 offset = pfn_level_offset(pfn, level);
887 pte = &parent[offset];
888 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
890 if (level == *target_level)
893 if (!dma_pte_present(pte)) {
896 tmp_page = alloc_pgtable_page(domain->nid);
901 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
902 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
903 if (cmpxchg64(&pte->val, 0ULL, pteval))
904 /* Someone else set it while we were thinking; use theirs. */
905 free_pgtable_page(tmp_page);
907 domain_flush_cache(domain, pte, sizeof(*pte));
912 parent = phys_to_virt(dma_pte_addr(pte));
917 *target_level = level;
922 /* return the pte for an address at a specific level */
923 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
925 int level, int *large_page)
927 struct dma_pte *parent, *pte;
928 int total = agaw_to_level(domain->agaw);
931 parent = domain->pgd;
932 while (level <= total) {
933 offset = pfn_level_offset(pfn, total);
934 pte = &parent[offset];
938 if (!dma_pte_present(pte)) {
943 if (dma_pte_superpage(pte)) {
948 parent = phys_to_virt(dma_pte_addr(pte));
954 /* clear last-level ptes; a tlb flush should follow */
955 static void dma_pte_clear_range(struct dmar_domain *domain,
956 unsigned long start_pfn,
957 unsigned long last_pfn)
959 unsigned int large_page;
960 struct dma_pte *first_pte, *pte;
962 BUG_ON(!domain_pfn_supported(domain, start_pfn));
963 BUG_ON(!domain_pfn_supported(domain, last_pfn));
964 BUG_ON(start_pfn > last_pfn);
966 /* we don't need lock here; nobody else touches the iova range */
969 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
971 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
976 start_pfn += lvl_to_nr_pages(large_page);
978 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
980 domain_flush_cache(domain, first_pte,
981 (void *)pte - (void *)first_pte);
983 } while (start_pfn && start_pfn <= last_pfn);
986 static void dma_pte_free_level(struct dmar_domain *domain, int level,
987 int retain_level, struct dma_pte *pte,
988 unsigned long pfn, unsigned long start_pfn,
989 unsigned long last_pfn)
991 pfn = max(start_pfn, pfn);
992 pte = &pte[pfn_level_offset(pfn, level)];
995 unsigned long level_pfn;
996 struct dma_pte *level_pte;
998 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001 level_pfn = pfn & level_mask(level);
1002 level_pte = phys_to_virt(dma_pte_addr(pte));
1005 dma_pte_free_level(domain, level - 1, retain_level,
1006 level_pte, level_pfn, start_pfn,
1011 * Free the page table if we're below the level we want to
1012 * retain and the range covers the entire table.
1014 if (level < retain_level && !(start_pfn > level_pfn ||
1015 last_pfn < level_pfn + level_size(level) - 1)) {
1017 domain_flush_cache(domain, pte, sizeof(*pte));
1018 free_pgtable_page(level_pte);
1021 pfn += level_size(level);
1022 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1026 * clear last level (leaf) ptes and free page table pages below the
1027 * level we wish to keep intact.
1029 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1030 unsigned long start_pfn,
1031 unsigned long last_pfn,
1034 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1035 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1036 BUG_ON(start_pfn > last_pfn);
1038 dma_pte_clear_range(domain, start_pfn, last_pfn);
1040 /* We don't need lock here; nobody else touches the iova range */
1041 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1042 domain->pgd, 0, start_pfn, last_pfn);
1045 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1046 free_pgtable_page(domain->pgd);
1051 /* When a page at a given level is being unlinked from its parent, we don't
1052 need to *modify* it at all. All we need to do is make a list of all the
1053 pages which can be freed just as soon as we've flushed the IOTLB and we
1054 know the hardware page-walk will no longer touch them.
1055 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1057 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1058 int level, struct dma_pte *pte,
1059 struct page *freelist)
1063 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1064 pg->freelist = freelist;
1070 pte = page_address(pg);
1072 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1073 freelist = dma_pte_list_pagetables(domain, level - 1,
1076 } while (!first_pte_in_page(pte));
1081 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1082 struct dma_pte *pte, unsigned long pfn,
1083 unsigned long start_pfn,
1084 unsigned long last_pfn,
1085 struct page *freelist)
1087 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1089 pfn = max(start_pfn, pfn);
1090 pte = &pte[pfn_level_offset(pfn, level)];
1093 unsigned long level_pfn;
1095 if (!dma_pte_present(pte))
1098 level_pfn = pfn & level_mask(level);
1100 /* If range covers entire pagetable, free it */
1101 if (start_pfn <= level_pfn &&
1102 last_pfn >= level_pfn + level_size(level) - 1) {
1103 /* These subordinate page tables are going away entirely. Don't
1104 bother to clear them; we're just going to *free* them. */
1105 if (level > 1 && !dma_pte_superpage(pte))
1106 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1112 } else if (level > 1) {
1113 /* Recurse down into a level that isn't *entirely* obsolete */
1114 freelist = dma_pte_clear_level(domain, level - 1,
1115 phys_to_virt(dma_pte_addr(pte)),
1116 level_pfn, start_pfn, last_pfn,
1120 pfn += level_size(level);
1121 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124 domain_flush_cache(domain, first_pte,
1125 (void *)++last_pte - (void *)first_pte);
1130 /* We can't just free the pages because the IOMMU may still be walking
1131 the page tables, and may have cached the intermediate levels. The
1132 pages can only be freed after the IOTLB flush has been done. */
1133 static struct page *domain_unmap(struct dmar_domain *domain,
1134 unsigned long start_pfn,
1135 unsigned long last_pfn)
1137 struct page *freelist;
1139 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1140 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1141 BUG_ON(start_pfn > last_pfn);
1143 /* we don't need lock here; nobody else touches the iova range */
1144 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1145 domain->pgd, 0, start_pfn, last_pfn, NULL);
1148 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1149 struct page *pgd_page = virt_to_page(domain->pgd);
1150 pgd_page->freelist = freelist;
1151 freelist = pgd_page;
1159 static void dma_free_pagelist(struct page *freelist)
1163 while ((pg = freelist)) {
1164 freelist = pg->freelist;
1165 free_pgtable_page(page_address(pg));
1169 static void iova_entry_free(unsigned long data)
1171 struct page *freelist = (struct page *)data;
1173 dma_free_pagelist(freelist);
1176 /* iommu handling */
1177 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1179 struct root_entry *root;
1180 unsigned long flags;
1182 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1184 pr_err("Allocating root entry for %s failed\n",
1189 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1191 spin_lock_irqsave(&iommu->lock, flags);
1192 iommu->root_entry = root;
1193 spin_unlock_irqrestore(&iommu->lock, flags);
1198 static void iommu_set_root_entry(struct intel_iommu *iommu)
1204 addr = virt_to_phys(iommu->root_entry);
1205 if (sm_supported(iommu))
1206 addr |= DMA_RTADDR_SMT;
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1211 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1213 /* Make sure hardware completes it */
1214 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1215 readl, (sts & DMA_GSTS_RTPS), sts);
1217 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1225 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1231 /* Make sure hardware completes it */
1232 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233 readl, (!(val & DMA_GSTS_WBFS)), val);
1235 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 /* return value determines if we need a write buffer flush */
1239 static void __iommu_flush_context(struct intel_iommu *iommu,
1240 u16 did, u16 source_id, u8 function_mask,
1247 case DMA_CCMD_GLOBAL_INVL:
1248 val = DMA_CCMD_GLOBAL_INVL;
1250 case DMA_CCMD_DOMAIN_INVL:
1251 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1253 case DMA_CCMD_DEVICE_INVL:
1254 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1255 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1260 val |= DMA_CCMD_ICC;
1262 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1263 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1265 /* Make sure hardware completes it */
1266 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1267 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 /* return value determines if we need a write buffer flush */
1273 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1274 u64 addr, unsigned int size_order, u64 type)
1276 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1277 u64 val = 0, val_iva = 0;
1281 case DMA_TLB_GLOBAL_FLUSH:
1282 /* a global flush doesn't need to set IVA_REG */
1283 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1285 case DMA_TLB_DSI_FLUSH:
1286 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 case DMA_TLB_PSI_FLUSH:
1289 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 /* IH bit is passed in as part of address */
1291 val_iva = size_order | addr;
1296 /* Note: set drain read/write */
1299 * This is probably meant to be extra safe. It looks like we can
1300 * ignore it without any impact.
1302 if (cap_read_drain(iommu->cap))
1303 val |= DMA_TLB_READ_DRAIN;
1305 if (cap_write_drain(iommu->cap))
1306 val |= DMA_TLB_WRITE_DRAIN;
1308 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1309 /* Note: Only uses first TLB reg currently */
1311 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1312 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1314 /* Make sure hardware completes it */
1315 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1316 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1320 /* check IOTLB invalidation granularity */
1321 if (DMA_TLB_IAIG(val) == 0)
1322 pr_err("Flush IOTLB failed\n");
1323 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1324 pr_debug("TLB flush request %Lx, actual %Lx\n",
1325 (unsigned long long)DMA_TLB_IIRG(type),
1326 (unsigned long long)DMA_TLB_IAIG(val));
1329 static struct device_domain_info *
1330 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333 struct device_domain_info *info;
1335 assert_spin_locked(&device_domain_lock);
1340 list_for_each_entry(info, &domain->devices, link)
1341 if (info->iommu == iommu && info->bus == bus &&
1342 info->devfn == devfn) {
1343 if (info->ats_supported && info->dev)
1351 static void domain_update_iotlb(struct dmar_domain *domain)
1353 struct device_domain_info *info;
1354 bool has_iotlb_device = false;
1356 assert_spin_locked(&device_domain_lock);
1358 list_for_each_entry(info, &domain->devices, link) {
1359 struct pci_dev *pdev;
1361 if (!info->dev || !dev_is_pci(info->dev))
1364 pdev = to_pci_dev(info->dev);
1365 if (pdev->ats_enabled) {
1366 has_iotlb_device = true;
1371 domain->has_iotlb_device = has_iotlb_device;
1374 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1376 struct pci_dev *pdev;
1378 assert_spin_locked(&device_domain_lock);
1380 if (!info || !dev_is_pci(info->dev))
1383 pdev = to_pci_dev(info->dev);
1384 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1385 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1386 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1387 * reserved, which should be set to 0.
1389 if (!ecap_dit(info->iommu->ecap))
1392 struct pci_dev *pf_pdev;
1394 /* pci_physfn() returns pdev itself if the device is not a VF */
1395 pf_pdev = pci_physfn(pdev);
1396 info->pfsid = pci_dev_id(pf_pdev);
1399 #ifdef CONFIG_INTEL_IOMMU_SVM
1400 /* The PCIe spec, in its wisdom, declares that the behaviour of
1401 the device if you enable PASID support after ATS support is
1402 undefined. So always enable PASID support on devices which
1403 have it, even if we can't yet know if we're ever going to
1405 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1406 info->pasid_enabled = 1;
1408 if (info->pri_supported &&
1409 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1410 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1411 info->pri_enabled = 1;
1413 if (!pdev->untrusted && info->ats_supported &&
1414 pci_ats_page_aligned(pdev) &&
1415 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1416 info->ats_enabled = 1;
1417 domain_update_iotlb(info->domain);
1418 info->ats_qdep = pci_ats_queue_depth(pdev);
1422 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1424 struct pci_dev *pdev;
1426 assert_spin_locked(&device_domain_lock);
1428 if (!dev_is_pci(info->dev))
1431 pdev = to_pci_dev(info->dev);
1433 if (info->ats_enabled) {
1434 pci_disable_ats(pdev);
1435 info->ats_enabled = 0;
1436 domain_update_iotlb(info->domain);
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439 if (info->pri_enabled) {
1440 pci_disable_pri(pdev);
1441 info->pri_enabled = 0;
1443 if (info->pasid_enabled) {
1444 pci_disable_pasid(pdev);
1445 info->pasid_enabled = 0;
1450 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1451 u64 addr, unsigned mask)
1454 unsigned long flags;
1455 struct device_domain_info *info;
1457 if (!domain->has_iotlb_device)
1460 spin_lock_irqsave(&device_domain_lock, flags);
1461 list_for_each_entry(info, &domain->devices, link) {
1462 if (!info->ats_enabled)
1465 sid = info->bus << 8 | info->devfn;
1466 qdep = info->ats_qdep;
1467 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
1473 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1474 struct dmar_domain *domain,
1475 unsigned long pfn, unsigned int pages,
1478 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1479 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1480 u16 did = domain->iommu_did[iommu->seq_id];
1487 * Fall back to a domain-selective flush if no PSI support or the size is
1489 * PSI requires page size to be 2 ^ x, and the base address is naturally
1490 * aligned to the size
1492 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1493 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1500 * In caching mode, changes of pages from non-present to present require
1501 * flush. However, device IOTLB doesn't need to be flushed in this case.
1503 if (!cap_caching_mode(iommu->cap) || !map)
1504 iommu_flush_dev_iotlb(domain, addr, mask);
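/*
 * Example: flushing 512 pages yields mask = 9, i.e. a single PSI covering
 * a naturally aligned 2MiB range. If the required mask exceeds the
 * hardware's MAMV (cap_max_amask_val) or PSI is unsupported, the code
 * above falls back to a domain-selective flush instead.
 */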
1507 /* Notification for newly created mappings */
1508 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1509 struct dmar_domain *domain,
1510 unsigned long pfn, unsigned int pages)
1512 /* It's a non-present to present mapping. Only flush if caching mode */
1513 if (cap_caching_mode(iommu->cap))
1514 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1516 iommu_flush_write_buffer(iommu);
1519 static void iommu_flush_iova(struct iova_domain *iovad)
1521 struct dmar_domain *domain;
1524 domain = container_of(iovad, struct dmar_domain, iovad);
1526 for_each_domain_iommu(idx, domain) {
1527 struct intel_iommu *iommu = g_iommus[idx];
1528 u16 did = domain->iommu_did[iommu->seq_id];
1530 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1532 if (!cap_caching_mode(iommu->cap))
1533 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1534 0, MAX_AGAW_PFN_WIDTH);
1538 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 unsigned long flags;
1543 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1547 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1548 pmen &= ~DMA_PMEN_EPM;
1549 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1551 /* wait for the protected region status bit to clear */
1552 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1553 readl, !(pmen & DMA_PMEN_PRS), pmen);
1555 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 unsigned long flags;
1563 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1564 iommu->gcmd |= DMA_GCMD_TE;
1565 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1567 /* Make sure hardware completes it */
1568 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1569 readl, (sts & DMA_GSTS_TES), sts);
1571 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 static void iommu_disable_translation(struct intel_iommu *iommu)
1579 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1580 iommu->gcmd &= ~DMA_GCMD_TE;
1581 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1583 /* Make sure hardware completes it */
1584 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1585 readl, (!(sts & DMA_GSTS_TES)), sts);
1587 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 static int iommu_init_domains(struct intel_iommu *iommu)
1592 u32 ndomains, nlongs;
1595 ndomains = cap_ndoms(iommu->cap);
1596 pr_debug("%s: Number of Domains supported <%d>\n",
1597 iommu->name, ndomains);
1598 nlongs = BITS_TO_LONGS(ndomains);
1600 spin_lock_init(&iommu->lock);
1602 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1603 if (!iommu->domain_ids) {
1604 pr_err("%s: Allocating domain id array failed\n",
1609 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1610 iommu->domains = kzalloc(size, GFP_KERNEL);
1612 if (iommu->domains) {
1613 size = 256 * sizeof(struct dmar_domain *);
1614 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617 if (!iommu->domains || !iommu->domains[0]) {
1618 pr_err("%s: Allocating domain array failed\n",
1620 kfree(iommu->domain_ids);
1621 kfree(iommu->domains);
1622 iommu->domain_ids = NULL;
1623 iommu->domains = NULL;
1628 * If Caching mode is set, then invalid translations are tagged
1629 * with domain-id 0, hence we need to pre-allocate it. We also
1630 * use domain-id 0 as a marker for non-allocated domain-id, so
1631 * make sure it is not used for a real domain.
1633 set_bit(0, iommu->domain_ids);
1636 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1637 * entry for first-level or pass-through translation modes should
1638 * be programmed with a domain id different from those used for
1639 * second-level or nested translation. We reserve a domain id for
1642 if (sm_supported(iommu))
1643 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1648 static void disable_dmar_iommu(struct intel_iommu *iommu)
1650 struct device_domain_info *info, *tmp;
1651 unsigned long flags;
1653 if (!iommu->domains || !iommu->domain_ids)
1656 spin_lock_irqsave(&device_domain_lock, flags);
1657 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1658 if (info->iommu != iommu)
1661 if (!info->dev || !info->domain)
1664 __dmar_remove_one_dev_info(info);
1666 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 if (iommu->gcmd & DMA_GCMD_TE)
1669 iommu_disable_translation(iommu);
1672 static void free_dmar_iommu(struct intel_iommu *iommu)
1674 if ((iommu->domains) && (iommu->domain_ids)) {
1675 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678 for (i = 0; i < elems; i++)
1679 kfree(iommu->domains[i]);
1680 kfree(iommu->domains);
1681 kfree(iommu->domain_ids);
1682 iommu->domains = NULL;
1683 iommu->domain_ids = NULL;
1686 g_iommus[iommu->seq_id] = NULL;
1688 /* free context mapping */
1689 free_context_table(iommu);
1691 #ifdef CONFIG_INTEL_IOMMU_SVM
1692 if (pasid_supported(iommu)) {
1693 if (ecap_prs(iommu->ecap))
1694 intel_svm_finish_prq(iommu);
1699 static struct dmar_domain *alloc_domain(int flags)
1701 struct dmar_domain *domain;
1703 domain = alloc_domain_mem();
1707 memset(domain, 0, sizeof(*domain));
1708 domain->nid = NUMA_NO_NODE;
1709 domain->flags = flags;
1710 domain->has_iotlb_device = false;
1711 INIT_LIST_HEAD(&domain->devices);
1716 /* Must be called with iommu->lock */
1717 static int domain_attach_iommu(struct dmar_domain *domain,
1718 struct intel_iommu *iommu)
1720 unsigned long ndomains;
1723 assert_spin_locked(&device_domain_lock);
1724 assert_spin_locked(&iommu->lock);
1726 domain->iommu_refcnt[iommu->seq_id] += 1;
1727 domain->iommu_count += 1;
1728 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1729 ndomains = cap_ndoms(iommu->cap);
1730 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1732 if (num >= ndomains) {
1733 pr_err("%s: No free domain ids\n", iommu->name);
1734 domain->iommu_refcnt[iommu->seq_id] -= 1;
1735 domain->iommu_count -= 1;
1739 set_bit(num, iommu->domain_ids);
1740 set_iommu_domain(iommu, num, domain);
1742 domain->iommu_did[iommu->seq_id] = num;
1743 domain->nid = iommu->node;
1745 domain_update_iommu_cap(domain);
1751 static int domain_detach_iommu(struct dmar_domain *domain,
1752 struct intel_iommu *iommu)
1756 assert_spin_locked(&device_domain_lock);
1757 assert_spin_locked(&iommu->lock);
1759 domain->iommu_refcnt[iommu->seq_id] -= 1;
1760 count = --domain->iommu_count;
1761 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1762 num = domain->iommu_did[iommu->seq_id];
1763 clear_bit(num, iommu->domain_ids);
1764 set_iommu_domain(iommu, num, NULL);
1766 domain_update_iommu_cap(domain);
1767 domain->iommu_did[iommu->seq_id] = 0;
1773 static struct iova_domain reserved_iova_list;
1774 static struct lock_class_key reserved_rbtree_key;
1776 static int dmar_init_reserved_ranges(void)
1778 struct pci_dev *pdev = NULL;
1782 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1784 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1785 &reserved_rbtree_key);
1787 /* IOAPIC ranges shouldn't be accessed by DMA */
1788 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1789 IOVA_PFN(IOAPIC_RANGE_END));
1791 pr_err("Reserve IOAPIC range failed\n");
1795 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1796 for_each_pci_dev(pdev) {
1799 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1800 r = &pdev->resource[i];
1801 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1803 iova = reserve_iova(&reserved_iova_list,
1807 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1815 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1817 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 static inline int guestwidth_to_adjustwidth(int gaw)
1823 int r = (gaw - 12) % 9;
1834 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837 int adjust_width, agaw;
1838 unsigned long sagaw;
1841 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1843 err = init_iova_flush_queue(&domain->iovad,
1844 iommu_flush_iova, iova_entry_free);
1848 domain_reserve_special_ranges(domain);
1850 /* calculate AGAW */
1851 if (guest_width > cap_mgaw(iommu->cap))
1852 guest_width = cap_mgaw(iommu->cap);
1853 domain->gaw = guest_width;
1854 adjust_width = guestwidth_to_adjustwidth(guest_width);
1855 agaw = width_to_agaw(adjust_width);
1856 sagaw = cap_sagaw(iommu->cap);
1857 if (!test_bit(agaw, &sagaw)) {
1858 /* hardware doesn't support it, choose a bigger one */
1859 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1860 agaw = find_next_bit(&sagaw, 5, agaw);
1864 domain->agaw = agaw;
1866 if (ecap_coherent(iommu->ecap))
1867 domain->iommu_coherency = 1;
1869 domain->iommu_coherency = 0;
1871 if (ecap_sc_support(iommu->ecap))
1872 domain->iommu_snooping = 1;
1874 domain->iommu_snooping = 0;
1876 if (intel_iommu_superpage)
1877 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1879 domain->iommu_superpage = 0;
1881 domain->nid = iommu->node;
1883 /* always allocate the top pgd */
1884 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1891 static void domain_exit(struct dmar_domain *domain)
1893 struct page *freelist;
1895 /* Remove associated devices and clear attached or cached domains */
1896 domain_remove_dev_info(domain);
1899 put_iova_domain(&domain->iovad);
1901 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1903 dma_free_pagelist(freelist);
1905 free_domain_mem(domain);
1909 * Get the PASID directory size for a scalable mode context entry.
1910 * The value of X in the PDTS field of a scalable mode context entry
1911 * indicates a PASID directory with 2^(X + 7) entries.
1913 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1917 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1918 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
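/*
 * Example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASIDs per directory
 * entry): a 20-bit PASID space needs max_pde = 1 << 14 directory entries,
 * and by the 2^(X + 7) rule above the PDTS value computed here would be
 * 14 - 7 = 7.
 */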
1926 * Set the RID_PASID field of a scalable mode context entry. The
1927 * IOMMU hardware will use the PASID value set in this field for
1928 * DMA translations of DMA requests without PASID.
1931 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1933 context->hi |= pasid & ((1 << 20) - 1);
1934 context->hi |= (1 << 20);
1938 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1941 static inline void context_set_sm_dte(struct context_entry *context)
1943 context->lo |= (1 << 2);
1947 * Set the PRE(Page Request Enable) field of a scalable mode context
1950 static inline void context_set_sm_pre(struct context_entry *context)
1952 context->lo |= (1 << 4);
1955 /* Convert value to context PASID directory size field coding. */
1956 #define context_pdts(pds) (((pds) & 0x7) << 9)
1958 static int domain_context_mapping_one(struct dmar_domain *domain,
1959 struct intel_iommu *iommu,
1960 struct pasid_table *table,
1963 u16 did = domain->iommu_did[iommu->seq_id];
1964 int translation = CONTEXT_TT_MULTI_LEVEL;
1965 struct device_domain_info *info = NULL;
1966 struct context_entry *context;
1967 unsigned long flags;
1972 if (hw_pass_through && domain_type_is_si(domain))
1973 translation = CONTEXT_TT_PASS_THROUGH;
1975 pr_debug("Set context mapping for %02x:%02x.%d\n",
1976 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1978 BUG_ON(!domain->pgd);
1980 spin_lock_irqsave(&device_domain_lock, flags);
1981 spin_lock(&iommu->lock);
1984 context = iommu_context_addr(iommu, bus, devfn, 1);
1989 if (context_present(context))
1993 * For kdump cases, old valid entries may be cached due to the
1994 * in-flight DMA and copied pgtable, but there is no unmapping
1995 * behaviour for them, thus we need an explicit cache flush for
1996 * the newly-mapped device. For kdump, at this point, the device
1997 * is supposed to finish reset at its driver probe stage, so no
1998 * in-flight DMA will exist, and we don't need to worry anymore
2001 if (context_copied(context)) {
2002 u16 did_old = context_domain_id(context);
2004 if (did_old < cap_ndoms(iommu->cap)) {
2005 iommu->flush.flush_context(iommu, did_old,
2006 (((u16)bus) << 8) | devfn,
2007 DMA_CCMD_MASK_NOBIT,
2008 DMA_CCMD_DEVICE_INVL);
2009 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2014 context_clear_entry(context);
2016 if (sm_supported(iommu)) {
2021 /* Setup the PASID DIR pointer: */
2022 pds = context_get_sm_pds(table);
2023 context->lo = (u64)virt_to_phys(table->table) |
2026 /* Setup the RID_PASID field: */
2027 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2030 * Setup the Device-TLB enable bit and Page request
2033 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2034 if (info && info->ats_supported)
2035 context_set_sm_dte(context);
2036 if (info && info->pri_supported)
2037 context_set_sm_pre(context);
2039 struct dma_pte *pgd = domain->pgd;
2042 context_set_domain_id(context, did);
2044 if (translation != CONTEXT_TT_PASS_THROUGH) {
2046 * Skip top levels of the page tables for an iommu which has
2047 * a smaller agaw than the default. Unnecessary for PT mode.
2049 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2051 pgd = phys_to_virt(dma_pte_addr(pgd));
2052 if (!dma_pte_present(pgd))
2056 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2057 if (info && info->ats_supported)
2058 translation = CONTEXT_TT_DEV_IOTLB;
2060 translation = CONTEXT_TT_MULTI_LEVEL;
2062 context_set_address_root(context, virt_to_phys(pgd));
2063 context_set_address_width(context, agaw);
2066 * In pass through mode, AW must be programmed to
2067 * indicate the largest AGAW value supported by
2068 * hardware. And ASR is ignored by hardware.
2070 context_set_address_width(context, iommu->msagaw);
2073 context_set_translation_type(context, translation);
2076 context_set_fault_enable(context);
2077 context_set_present(context);
2078 domain_flush_cache(domain, context, sizeof(*context));
2081 * It's a non-present to present mapping. If the hardware doesn't cache
2082 * non-present entries we only need to flush the write-buffer. If it
2083 * _does_ cache non-present entries, then it does so in the special
2084 * domain #0, which we have to flush:
2086 if (cap_caching_mode(iommu->cap)) {
2087 iommu->flush.flush_context(iommu, 0,
2088 (((u16)bus) << 8) | devfn,
2089 DMA_CCMD_MASK_NOBIT,
2090 DMA_CCMD_DEVICE_INVL);
2091 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2093 iommu_flush_write_buffer(iommu);
2095 iommu_enable_dev_iotlb(info);
2100 spin_unlock(&iommu->lock);
2101 spin_unlock_irqrestore(&device_domain_lock, flags);
2107 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2109 struct pasid_table *table;
2110 struct intel_iommu *iommu;
2113 iommu = device_to_iommu(dev, &bus, &devfn);
2117 table = intel_pasid_get_table(dev);
2118 return domain_context_mapping_one(domain, iommu, table, bus, devfn);
2121 static int domain_context_mapped_cb(struct pci_dev *pdev,
2122 u16 alias, void *opaque)
2124 struct intel_iommu *iommu = opaque;
2126 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2129 static int domain_context_mapped(struct device *dev)
2131 struct intel_iommu *iommu;
2134 iommu = device_to_iommu(dev, &bus, &devfn);
2138 if (!dev_is_pci(dev))
2139 return device_context_mapped(iommu, bus, devfn);
2141 return !pci_for_each_dma_alias(to_pci_dev(dev),
2142 domain_context_mapped_cb, iommu);
2145 /* Returns a number of VTD pages, but aligned to MM page size */
2146 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2149 host_addr &= ~PAGE_MASK;
2150 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
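/*
 * Example (assuming 4KiB MM pages): host_addr = 0x1234 and size = 0x2000
 * leave an in-page offset of 0x234, so PAGE_ALIGN(0x234 + 0x2000) is
 * 0x3000 and the function returns 3 VT-d pages.
 */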
2153 /* Return largest possible superpage level for a given mapping */
2154 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2155 unsigned long iov_pfn,
2156 unsigned long phy_pfn,
2157 unsigned long pages)
2159 int support, level = 1;
2160 unsigned long pfnmerge;
2162 support = domain->iommu_superpage;
2164 /* To use a large page, the virtual *and* physical addresses
2165 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2166 of them will mean we have to use smaller pages. So just
2167 merge them and check both at once. */
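/*
 * Example: if both iov_pfn and phy_pfn are 2MiB aligned (low 9 bits zero)
 * and at least 512 pages are being mapped, the merged-pfn check below
 * allows level 2, i.e. a 2MiB superpage, provided the domain's
 * iommu_superpage capability permits it.
 */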
2168 pfnmerge = iov_pfn | phy_pfn;
2170 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2171 pages >>= VTD_STRIDE_SHIFT;
2174 pfnmerge >>= VTD_STRIDE_SHIFT;
2181 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2182 struct scatterlist *sg, unsigned long phys_pfn,
2183 unsigned long nr_pages, int prot)
2185 struct dma_pte *first_pte = NULL, *pte = NULL;
2186 phys_addr_t uninitialized_var(pteval);
2187 unsigned long sg_res = 0;
2188 unsigned int largepage_lvl = 0;
2189 unsigned long lvl_pages = 0;
2191 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2193 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2196 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2200 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2203 while (nr_pages > 0) {
2207 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2209 sg_res = aligned_nrpages(sg->offset, sg->length);
2210 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2211 sg->dma_length = sg->length;
2212 pteval = (sg_phys(sg) - pgoff) | prot;
2213 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2217 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2219 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2222 /* It is a large page */
2223 if (largepage_lvl > 1) {
2224 unsigned long nr_superpages, end_pfn;
2226 pteval |= DMA_PTE_LARGE_PAGE;
2227 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2229 nr_superpages = sg_res / lvl_pages;
2230 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2233 * Ensure that old small page tables are
2234 * removed to make room for superpage(s).
2235 * We're adding new large pages, so make sure
2236 * we don't remove their parent tables.
2238 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2241 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2245 /* We don't need lock here, nobody else
2246 * touches the iova range
2248 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2250 static int dumps = 5;
2251 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2252 iov_pfn, tmp, (unsigned long long)pteval);
2255 debug_dma_dump_mappings(NULL);
2260 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2262 BUG_ON(nr_pages < lvl_pages);
2263 BUG_ON(sg_res < lvl_pages);
2265 nr_pages -= lvl_pages;
2266 iov_pfn += lvl_pages;
2267 phys_pfn += lvl_pages;
2268 pteval += lvl_pages * VTD_PAGE_SIZE;
2269 sg_res -= lvl_pages;
2271 /* If the next PTE would be the first in a new page, then we
2272 need to flush the cache on the entries we've just written.
2273 And then we'll need to recalculate 'pte', so clear it and
2274 let it get set again in the if (!pte) block above.
2276 If we're done (!nr_pages) we need to flush the cache too.
2278 Also if we've been setting superpages, we may need to
2279 recalculate 'pte' and switch back to smaller pages for the
2280 end of the mapping, if the trailing size is not enough to
2281 use another superpage (i.e. sg_res < lvl_pages). */
2283 if (!nr_pages || first_pte_in_page(pte) ||
2284 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2285 domain_flush_cache(domain, first_pte,
2286 (void *)pte - (void *)first_pte);
2290 if (!sg_res && nr_pages)
2296 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2297 struct scatterlist *sg, unsigned long phys_pfn,
2298 unsigned long nr_pages, int prot)
2301 struct intel_iommu *iommu;
2303 /* Do the real mapping first */
2304 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2308 for_each_domain_iommu(iommu_id, domain) {
2309 iommu = g_iommus[iommu_id];
2310 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2316 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2317 struct scatterlist *sg, unsigned long nr_pages,
2320 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2323 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2324 unsigned long phys_pfn, unsigned long nr_pages,
2327 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2330 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2332 unsigned long flags;
2333 struct context_entry *context;
2339 spin_lock_irqsave(&iommu->lock, flags);
2340 context = iommu_context_addr(iommu, bus, devfn, 0);
2342 spin_unlock_irqrestore(&iommu->lock, flags);
2345 did_old = context_domain_id(context);
2346 context_clear_entry(context);
2347 __iommu_flush_cache(iommu, context, sizeof(*context));
2348 spin_unlock_irqrestore(&iommu->lock, flags);
2349 iommu->flush.flush_context(iommu,
2351 (((u16)bus) << 8) | devfn,
2352 DMA_CCMD_MASK_NOBIT,
2353 DMA_CCMD_DEVICE_INVL);
2354 iommu->flush.flush_iotlb(iommu,
2361 static inline void unlink_domain_info(struct device_domain_info *info)
2363 assert_spin_locked(&device_domain_lock);
2364 list_del(&info->link);
2365 list_del(&info->global);
2367 info->dev->archdata.iommu = NULL;
2370 static void domain_remove_dev_info(struct dmar_domain *domain)
2372 struct device_domain_info *info, *tmp;
2373 unsigned long flags;
2375 spin_lock_irqsave(&device_domain_lock, flags);
2376 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2377 __dmar_remove_one_dev_info(info);
2378 spin_unlock_irqrestore(&device_domain_lock, flags);
2383 * Note: we use struct device->archdata.iommu to store the info
2385 static struct dmar_domain *find_domain(struct device *dev)
2387 struct device_domain_info *info;
2389 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2390 struct iommu_domain *domain;
2392 dev->archdata.iommu = NULL;
2393 domain = iommu_get_domain_for_dev(dev);
2395 intel_iommu_attach_device(domain, dev);
2398 /* No lock here, assumes no domain exit in normal case */
2399 info = dev->archdata.iommu;
2402 return info->domain;
2406 static inline struct device_domain_info *
2407 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2409 struct device_domain_info *info;
2411 list_for_each_entry(info, &device_domain_list, global)
2412 if (info->iommu->segment == segment && info->bus == bus &&
2413 info->devfn == devfn)
2419 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2422 struct dmar_domain *domain)
2424 struct dmar_domain *found = NULL;
2425 struct device_domain_info *info;
2426 unsigned long flags;
2429 info = alloc_devinfo_mem();
2434 info->devfn = devfn;
2435 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2436 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2439 info->domain = domain;
2440 info->iommu = iommu;
2441 info->pasid_table = NULL;
2442 info->auxd_enabled = 0;
2443 INIT_LIST_HEAD(&info->auxiliary_domains);
2445 if (dev && dev_is_pci(dev)) {
2446 struct pci_dev *pdev = to_pci_dev(info->dev);
2448 if (!pdev->untrusted &&
2449 !pci_ats_disabled() &&
2450 ecap_dev_iotlb_support(iommu->ecap) &&
2451 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2452 dmar_find_matched_atsr_unit(pdev))
2453 info->ats_supported = 1;
2455 if (sm_supported(iommu)) {
2456 if (pasid_supported(iommu)) {
2457 int features = pci_pasid_features(pdev);
2459 info->pasid_supported = features | 1;
2462 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2463 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2464 info->pri_supported = 1;
2468 spin_lock_irqsave(&device_domain_lock, flags);
2470 found = find_domain(dev);
2473 struct device_domain_info *info2;
2474 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2476 found = info2->domain;
2482 spin_unlock_irqrestore(&device_domain_lock, flags);
2483 free_devinfo_mem(info);
2484 /* Caller must free the original domain */
2488 spin_lock(&iommu->lock);
2489 ret = domain_attach_iommu(domain, iommu);
2490 spin_unlock(&iommu->lock);
2493 spin_unlock_irqrestore(&device_domain_lock, flags);
2494 free_devinfo_mem(info);
2498 list_add(&info->link, &domain->devices);
2499 list_add(&info->global, &device_domain_list);
2501 dev->archdata.iommu = info;
2502 spin_unlock_irqrestore(&device_domain_lock, flags);
2504 /* PASID table is mandatory for a PCI device in scalable mode. */
2505 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2506 ret = intel_pasid_alloc_table(dev);
2508 dev_err(dev, "PASID table allocation failed\n");
2509 dmar_remove_one_dev_info(dev);
2513 /* Setup the PASID entry for requests without PASID: */
2514 spin_lock(&iommu->lock);
2515 if (hw_pass_through && domain_type_is_si(domain))
2516 ret = intel_pasid_setup_pass_through(iommu, domain,
2517 dev, PASID_RID2PASID);
2519 ret = intel_pasid_setup_second_level(iommu, domain,
2520 dev, PASID_RID2PASID);
2521 spin_unlock(&iommu->lock);
2523 dev_err(dev, "Setup RID2PASID failed\n");
2524 dmar_remove_one_dev_info(dev);
2529 if (dev && domain_context_mapping(domain, dev)) {
2530 dev_err(dev, "Domain context map failed\n");
2531 dmar_remove_one_dev_info(dev);
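/*
 * Illustrative sketch (an assumption about usage, not a definitive flow):
 * dmar_insert_one_dev_info() returns the domain that actually got attached,
 * which may be a pre-existing one found under device_domain_lock, or NULL on
 * failure. Callers such as domain_add_dev_info() below therefore compare the
 * result against the domain they passed in:
 *
 *	ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
 *	if (!ndomain || ndomain != domain) {
 *		// attach failed, or the device is already bound to a
 *		// different domain; "domain" does not own the device
 *	}
 */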
2538 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2540 *(u16 *)opaque = alias;
2544 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2546 struct device_domain_info *info;
2547 struct dmar_domain *domain = NULL;
2548 struct intel_iommu *iommu;
2550 unsigned long flags;
2553 iommu = device_to_iommu(dev, &bus, &devfn);
2557 if (dev_is_pci(dev)) {
2558 struct pci_dev *pdev = to_pci_dev(dev);
2560 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2562 spin_lock_irqsave(&device_domain_lock, flags);
2563 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2564 PCI_BUS_NUM(dma_alias),
2567 iommu = info->iommu;
2568 domain = info->domain;
2570 spin_unlock_irqrestore(&device_domain_lock, flags);
2572 /* DMA alias already has a domain, use it */
2577 /* Allocate and initialize new domain for the device */
2578 domain = alloc_domain(0);
2581 if (domain_init(domain, iommu, gaw)) {
2582 domain_exit(domain);
2590 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2591 struct dmar_domain *domain)
2593 struct intel_iommu *iommu;
2594 struct dmar_domain *tmp;
2595 u16 req_id, dma_alias;
2598 iommu = device_to_iommu(dev, &bus, &devfn);
2602 req_id = ((u16)bus << 8) | devfn;
2604 if (dev_is_pci(dev)) {
2605 struct pci_dev *pdev = to_pci_dev(dev);
2607 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2609 /* register PCI DMA alias device */
2610 if (req_id != dma_alias) {
2611 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2612 dma_alias & 0xff, NULL, domain);
2614 if (!tmp || tmp != domain)
2619 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2620 if (!tmp || tmp != domain)
2626 static int iommu_domain_identity_map(struct dmar_domain *domain,
2627 unsigned long long start,
2628 unsigned long long end)
2630 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2631 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2633 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2634 dma_to_mm_pfn(last_vpfn))) {
2635 pr_err("Reserving iova failed\n");
2639 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2641 * RMRR range might have overlap with physical memory range,
2644 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2646 return __domain_mapping(domain, first_vpfn, NULL,
2647 first_vpfn, last_vpfn - first_vpfn + 1,
2648 DMA_PTE_READ|DMA_PTE_WRITE);
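/*
 * Worked example for the PFN arithmetic above (illustrative numbers): an
 * RMRR spanning [0xbf000000, 0xbf1fffff] yields first_vpfn = 0xbf000 and
 * last_vpfn = 0xbf1ff after the VTD_PAGE_SHIFT (4KiB) shift, so
 * __domain_mapping() is asked to map last_vpfn - first_vpfn + 1 = 0x200
 * pages (2MiB) one to one, with DMA_PTE_READ | DMA_PTE_WRITE.
 */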
2651 static int domain_prepare_identity_map(struct device *dev,
2652 struct dmar_domain *domain,
2653 unsigned long long start,
2654 unsigned long long end)
2656 /* For _hardware_ passthrough, don't bother. But for software
2657 passthrough, we do it anyway -- it may indicate a memory
2658 range which is reserved in E820 and therefore didn't get set
2659 up to start with in si_domain */
2660 if (domain == si_domain && hw_pass_through) {
2661 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2666 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2669 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2670 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2671 dmi_get_system_info(DMI_BIOS_VENDOR),
2672 dmi_get_system_info(DMI_BIOS_VERSION),
2673 dmi_get_system_info(DMI_PRODUCT_VERSION));
2677 if (end >> agaw_to_width(domain->agaw)) {
2678 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2679 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2680 agaw_to_width(domain->agaw),
2681 dmi_get_system_info(DMI_BIOS_VENDOR),
2682 dmi_get_system_info(DMI_BIOS_VERSION),
2683 dmi_get_system_info(DMI_PRODUCT_VERSION));
2687 return iommu_domain_identity_map(domain, start, end);
2690 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2692 static int __init si_domain_init(int hw)
2694 struct dmar_rmrr_unit *rmrr;
2698 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2702 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2703 domain_exit(si_domain);
2710 for_each_online_node(nid) {
2711 unsigned long start_pfn, end_pfn;
2714 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2715 ret = iommu_domain_identity_map(si_domain,
2716 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2723 * Normally we use DMA domains for devices which have RMRRs. But we
2724 * relax this requirement for graphics and USB devices. Identity map
2725 * the RMRRs for graphics and USB devices so that they can use the
2728 for_each_rmrr_units(rmrr) {
2729 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2731 unsigned long long start = rmrr->base_address;
2732 unsigned long long end = rmrr->end_address;
2734 if (device_is_rmrr_locked(dev))
2737 if (WARN_ON(end < start ||
2738 end >> agaw_to_width(si_domain->agaw)))
2741 ret = iommu_domain_identity_map(si_domain, start, end);
2750 static int identity_mapping(struct device *dev)
2752 struct device_domain_info *info;
2754 info = dev->archdata.iommu;
2755 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2756 return (info->domain == si_domain);
2761 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2763 struct dmar_domain *ndomain;
2764 struct intel_iommu *iommu;
2767 iommu = device_to_iommu(dev, &bus, &devfn);
2771 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2772 if (ndomain != domain)
2778 static bool device_has_rmrr(struct device *dev)
2780 struct dmar_rmrr_unit *rmrr;
2785 for_each_rmrr_units(rmrr) {
2787 * Return TRUE if this RMRR contains the device that
2790 for_each_active_dev_scope(rmrr->devices,
2791 rmrr->devices_cnt, i, tmp)
2793 is_downstream_to_pci_bridge(dev, tmp)) {
2803 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2804 * is relaxable (ie. is allowed to be not enforced under some conditions)
2805 * @dev: device handle
2807 * We assume that PCI USB devices with RMRRs have them largely
2808 * for historical reasons and that the RMRR space is not actively used post
2809 * boot. This exclusion may change if vendors begin to abuse it.
2811 * The same exception is made for graphics devices, with the requirement that
2812 * any use of the RMRR regions will be torn down before assigning the device
2815 * Return: true if the RMRR is relaxable, false otherwise
2817 static bool device_rmrr_is_relaxable(struct device *dev)
2819 struct pci_dev *pdev;
2821 if (!dev_is_pci(dev))
2824 pdev = to_pci_dev(dev);
2825 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2832 * There are a couple cases where we need to restrict the functionality of
2833 * devices associated with RMRRs. The first is when evaluating a device for
2834 * identity mapping because problems exist when devices are moved in and out
2835 * of domains and their respective RMRR information is lost. This means that
2836 * a device with associated RMRRs will never be in a "passthrough" domain.
2837 * The second is use of the device through the IOMMU API. This interface
2838 * expects to have full control of the IOVA space for the device. We cannot
2839 * satisfy both the requirement that RMRR access is maintained and have an
2840 * unencumbered IOVA space. We also have no ability to quiesce the device's
2841 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2842 * We therefore prevent devices associated with an RMRR from participating in
2843 * the IOMMU API, which eliminates them from device assignment.
2845 * In both cases, devices which have relaxable RMRRs are not concerned by this
2846 * restriction. See device_rmrr_is_relaxable comment.
2848 static bool device_is_rmrr_locked(struct device *dev)
2850 if (!device_has_rmrr(dev))
2853 if (device_rmrr_is_relaxable(dev))
2860 * Return the required default domain type for a specific device.
2862 * @dev: the device in query
2863 * @startup: true if this is during early boot
2866 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2867 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2868 * - 0: both identity and dynamic domains work for this device
2870 static int device_def_domain_type(struct device *dev)
2872 if (dev_is_pci(dev)) {
2873 struct pci_dev *pdev = to_pci_dev(dev);
2875 if (device_is_rmrr_locked(dev))
2876 return IOMMU_DOMAIN_DMA;
2879 * Prevent any device marked as untrusted from getting
2880 * placed into the statically identity mapping domain.
2882 if (pdev->untrusted)
2883 return IOMMU_DOMAIN_DMA;
2885 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2886 return IOMMU_DOMAIN_IDENTITY;
2888 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2889 return IOMMU_DOMAIN_IDENTITY;
2892 * We want to start off with all devices in the 1:1 domain, and
2893 * take them out later if we find they can't access all of memory.
2895 * However, we can't do this for PCI devices behind bridges,
2896 * because all PCI devices behind the same bridge will end up
2897 * with the same source-id on their transactions.
2899 * Practically speaking, we can't change things around for these
2900 * devices at run-time, because we can't be sure there'll be no
2901 * DMA transactions in flight for any of their siblings.
2903 * So PCI devices (unless they're on the root bus) as well as
2904 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2905 * the 1:1 domain, just in _case_ one of their siblings turns out
2906 * not to be able to map all of memory.
2908 if (!pci_is_pcie(pdev)) {
2909 if (!pci_is_root_bus(pdev->bus))
2910 return IOMMU_DOMAIN_DMA;
2911 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2912 return IOMMU_DOMAIN_DMA;
2913 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2914 return IOMMU_DOMAIN_DMA;
2916 if (device_has_rmrr(dev))
2917 return IOMMU_DOMAIN_DMA;
2920 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2921 IOMMU_DOMAIN_IDENTITY : 0;
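/*
 * Illustrative outcomes of the policy above (assuming the corresponding
 * iommu_identity_mapping bits were set by the command line or quirks):
 *
 *	untrusted (external-facing) PCI device        -> IOMMU_DOMAIN_DMA
 *	device with a non-relaxable RMRR              -> IOMMU_DOMAIN_DMA
 *	Azalia audio when IDENTMAP_AZALIA is set      -> IOMMU_DOMAIN_IDENTITY
 *	graphics device when IDENTMAP_GFX is set      -> IOMMU_DOMAIN_IDENTITY
 *	conventional PCI device behind a PCI bridge   -> IOMMU_DOMAIN_DMA
 *	anything else when IDENTMAP_ALL is set        -> IOMMU_DOMAIN_IDENTITY
 */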
2924 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2927 * Start from the sane iommu hardware state.
2928 * If the queued invalidation was already initialized by us
2929 * (for example, while enabling interrupt remapping) then
2930 * things are already rolling from a sane state.
2934 * Clear any previous faults.
2936 dmar_fault(-1, iommu);
2938 * Disable queued invalidation if supported and already enabled
2939 * before OS handover.
2941 dmar_disable_qi(iommu);
2944 if (dmar_enable_qi(iommu)) {
2946 * Queued Invalidate not enabled, use Register Based Invalidate
2948 iommu->flush.flush_context = __iommu_flush_context;
2949 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2950 pr_info("%s: Using Register based invalidation\n",
2953 iommu->flush.flush_context = qi_flush_context;
2954 iommu->flush.flush_iotlb = qi_flush_iotlb;
2955 pr_info("%s: Using Queued invalidation\n", iommu->name);
2959 static int copy_context_table(struct intel_iommu *iommu,
2960 struct root_entry *old_re,
2961 struct context_entry **tbl,
2964 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2965 struct context_entry *new_ce = NULL, ce;
2966 struct context_entry *old_ce = NULL;
2967 struct root_entry re;
2968 phys_addr_t old_ce_phys;
2970 tbl_idx = ext ? bus * 2 : bus;
2971 memcpy(&re, old_re, sizeof(re));
2973 for (devfn = 0; devfn < 256; devfn++) {
2974 /* First calculate the correct index */
2975 idx = (ext ? devfn * 2 : devfn) % 256;
2978 /* First save what we may have and clean up */
2980 tbl[tbl_idx] = new_ce;
2981 __iommu_flush_cache(iommu, new_ce,
2991 old_ce_phys = root_entry_lctp(&re);
2993 old_ce_phys = root_entry_uctp(&re);
2996 if (ext && devfn == 0) {
2997 /* No LCTP, try UCTP */
3006 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3011 new_ce = alloc_pgtable_page(iommu->node);
3018 /* Now copy the context entry */
3019 memcpy(&ce, old_ce + idx, sizeof(ce));
3021 if (!__context_present(&ce))
3024 did = context_domain_id(&ce);
3025 if (did >= 0 && did < cap_ndoms(iommu->cap))
3026 set_bit(did, iommu->domain_ids);
3029 * We need a marker for copied context entries. This
3030 * marker needs to work for the old format as well as
3031 * for extended context entries.
3033 * Bit 67 of the context entry is used. In the old
3034 * format this bit is available to software, in the
3035 * extended format it is the PGE bit, but PGE is ignored
3036 * by HW if PASIDs are disabled (and thus still
3039 * So disable PASIDs first and then mark the entry
3040 * copied. This means that we don't copy PASID
3041 * translations from the old kernel, but this is fine as
3042 * faults there are not fatal.
3044 context_clear_pasid_enable(&ce);
3045 context_set_copied(&ce);
3050 tbl[tbl_idx + pos] = new_ce;
3052 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
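/*
 * Worked example for the index math above (illustrative values): with
 * extended context entries (ext == true) each bus consumes two copied
 * tables, so tbl_idx = bus * 2. For bus 3, devfn 0x41 this gives
 * tbl_idx = 6 and idx = (0x41 * 2) % 256 = 0x82; for devfn 0x80 the
 * doubled index wraps to 0 and the entry lands in the next table via
 * tbl[tbl_idx + pos]. In legacy mode the expressions collapse to
 * tbl_idx = bus and idx = devfn.
 */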
3061 static int copy_translation_tables(struct intel_iommu *iommu)
3063 struct context_entry **ctxt_tbls;
3064 struct root_entry *old_rt;
3065 phys_addr_t old_rt_phys;
3066 int ctxt_table_entries;
3067 unsigned long flags;
3072 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3073 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3074 new_ext = !!ecap_ecs(iommu->ecap);
3077 * The RTT bit can only be changed when translation is disabled,
3078 * but disabling translation means to open a window for data
3079 * corruption. So bail out and don't copy anything if we would
3080 * have to change the bit.
3085 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3089 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3093 /* This is too big for the stack - allocate it from slab */
3094 ctxt_table_entries = ext ? 512 : 256;
3096 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3100 for (bus = 0; bus < 256; bus++) {
3101 ret = copy_context_table(iommu, &old_rt[bus],
3102 ctxt_tbls, bus, ext);
3104 pr_err("%s: Failed to copy context table for bus %d\n",
3110 spin_lock_irqsave(&iommu->lock, flags);
3112 /* Context tables are copied, now write them to the root_entry table */
3113 for (bus = 0; bus < 256; bus++) {
3114 int idx = ext ? bus * 2 : bus;
3117 if (ctxt_tbls[idx]) {
3118 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3119 iommu->root_entry[bus].lo = val;
3122 if (!ext || !ctxt_tbls[idx + 1])
3125 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3126 iommu->root_entry[bus].hi = val;
3129 spin_unlock_irqrestore(&iommu->lock, flags);
3133 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
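/*
 * Illustrative note on the root-entry writes above: a copied context table
 * is installed by storing its physical address with the present bit (bit 0)
 * set, so a table at physical address 0x12345000 is programmed as
 * root_entry.lo = 0x12345001. With the extended layout, the second table of
 * each bus is written to root_entry.hi in the same way.
 */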
3143 static int __init init_dmars(void)
3145 struct dmar_drhd_unit *drhd;
3146 struct intel_iommu *iommu;
3152 * initialize and program root entry to not present
3155 for_each_drhd_unit(drhd) {
3157 * lock not needed as this is only incremented in the single-threaded
3158 * kernel __init code path; all other accesses are read-only
3161 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3165 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3168 /* Preallocate enough resources for IOMMU hot-addition */
3169 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3170 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3172 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3175 pr_err("Allocating global iommu array failed\n");
3180 for_each_iommu(iommu, drhd) {
3181 if (drhd->ignored) {
3182 iommu_disable_translation(iommu);
3187 * Find the max pasid size of all IOMMU's in the system.
3188 * We need to ensure the system pasid table is no bigger
3189 * than the smallest supported.
3191 if (pasid_supported(iommu)) {
3192 u32 temp = 2 << ecap_pss(iommu->ecap);
3194 intel_pasid_max_id = min_t(u32, temp,
3195 intel_pasid_max_id);
3198 g_iommus[iommu->seq_id] = iommu;
3200 intel_iommu_init_qi(iommu);
3202 ret = iommu_init_domains(iommu);
3206 init_translation_status(iommu);
3208 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3209 iommu_disable_translation(iommu);
3210 clear_translation_pre_enabled(iommu);
3211 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3217 * we could share the same root & context tables
3218 * among all IOMMUs. Need to split it later.
3220 ret = iommu_alloc_root_entry(iommu);
3224 if (translation_pre_enabled(iommu)) {
3225 pr_info("Translation already enabled - trying to copy translation structures\n");
3227 ret = copy_translation_tables(iommu);
3230 * We found the IOMMU with translation
3231 * enabled - but failed to copy over the
3232 * old root-entry table. Try to proceed
3233 * by disabling translation now and
3234 * allocating a clean root-entry table.
3235 * This might cause DMAR faults, but
3236 * probably the dump will still succeed.
3238 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3240 iommu_disable_translation(iommu);
3241 clear_translation_pre_enabled(iommu);
3243 pr_info("Copied translation tables from previous kernel for %s\n",
3248 if (!ecap_pass_through(iommu->ecap))
3249 hw_pass_through = 0;
3250 #ifdef CONFIG_INTEL_IOMMU_SVM
3251 if (pasid_supported(iommu))
3252 intel_svm_init(iommu);
3257 * Now that qi is enabled on all iommus, set the root entry and flush
3258 * caches. This is required on some Intel X58 chipsets, otherwise the
3259 * flush_context function will loop forever and the boot hangs.
3261 for_each_active_iommu(iommu, drhd) {
3262 iommu_flush_write_buffer(iommu);
3263 iommu_set_root_entry(iommu);
3264 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3265 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3268 if (iommu_pass_through)
3269 iommu_identity_mapping |= IDENTMAP_ALL;
3271 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3276 iommu_identity_mapping |= IDENTMAP_GFX;
3278 check_tylersburg_isoch();
3280 ret = si_domain_init(hw_pass_through);
3287 * global invalidate context cache
3288 * global invalidate iotlb
3289 * enable translation
3291 for_each_iommu(iommu, drhd) {
3292 if (drhd->ignored) {
3294 * we always have to disable PMRs or DMA may fail on
3298 iommu_disable_protect_mem_regions(iommu);
3302 iommu_flush_write_buffer(iommu);
3304 #ifdef CONFIG_INTEL_IOMMU_SVM
3305 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3307 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3308 * could cause a lock race condition.
3310 up_write(&dmar_global_lock);
3311 ret = intel_svm_enable_prq(iommu);
3312 down_write(&dmar_global_lock);
3317 ret = dmar_set_interrupt(iommu);
3325 for_each_active_iommu(iommu, drhd) {
3326 disable_dmar_iommu(iommu);
3327 free_dmar_iommu(iommu);
3336 /* This takes a number of _MM_ pages, not VTD pages */
3337 static unsigned long intel_alloc_iova(struct device *dev,
3338 struct dmar_domain *domain,
3339 unsigned long nrpages, uint64_t dma_mask)
3341 unsigned long iova_pfn;
3343 /* Restrict dma_mask to the width that the iommu can handle */
3344 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3345 /* Ensure we reserve the whole size-aligned region */
3346 nrpages = __roundup_pow_of_two(nrpages);
3348 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3350 * First try to allocate an io virtual address in
3351 * DMA_BIT_MASK(32) and if that fails then try allocating
3354 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3355 IOVA_PFN(DMA_BIT_MASK(32)), false);
3359 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3360 IOVA_PFN(dma_mask), true);
3361 if (unlikely(!iova_pfn)) {
3362 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
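/*
 * Illustrative sketch of the allocation policy above (hypothetical sizes):
 * a request for 5 pages is rounded up to a power of two (8) so that the
 * whole size-aligned region is reserved. For a device with a 64-bit DMA
 * mask (and dmar_forcedac off), the IOVA is first attempted below 4GiB and
 * only falls back to the full mask if that fails:
 *
 *	nrpages = __roundup_pow_of_two(5);		// -> 8
 *	iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
 *				   IOVA_PFN(DMA_BIT_MASK(32)), false);
 *	if (!iova_pfn)
 *		iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
 *					   IOVA_PFN(dma_mask), true);
 */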
3369 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3371 struct dmar_domain *domain, *tmp;
3372 struct dmar_rmrr_unit *rmrr;
3373 struct device *i_dev;
3376 /* Device shouldn't be attached by any domains. */
3377 domain = find_domain(dev);
3381 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3385 /* We have a new domain - setup possible RMRRs for the device */
3387 for_each_rmrr_units(rmrr) {
3388 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3393 ret = domain_prepare_identity_map(dev, domain,
3397 dev_err(dev, "Mapping reserved region failed\n");
3402 tmp = set_domain_for_dev(dev, domain);
3403 if (!tmp || domain != tmp) {
3404 domain_exit(domain);
3410 dev_err(dev, "Allocating domain failed\n");
3412 domain->domain.type = IOMMU_DOMAIN_DMA;
3417 /* Check if the dev needs to go through the non-identity map and unmap process. */
3418 static bool iommu_need_mapping(struct device *dev)
3422 if (iommu_dummy(dev))
3425 ret = identity_mapping(dev);
3427 u64 dma_mask = *dev->dma_mask;
3429 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3430 dma_mask = dev->coherent_dma_mask;
3432 if (dma_mask >= dma_get_required_mask(dev))
3436 * The 32-bit DMA device is removed from si_domain and falls back
3437 * to a non-identity mapping.
3439 dmar_remove_one_dev_info(dev);
3440 ret = iommu_request_dma_domain_for_dev(dev);
3442 struct iommu_domain *domain;
3443 struct dmar_domain *dmar_domain;
3445 domain = iommu_get_domain_for_dev(dev);
3447 dmar_domain = to_dmar_domain(domain);
3448 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3450 get_private_domain_for_dev(dev);
3453 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3459 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3460 size_t size, int dir, u64 dma_mask)
3462 struct dmar_domain *domain;
3463 phys_addr_t start_paddr;
3464 unsigned long iova_pfn;
3467 struct intel_iommu *iommu;
3468 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3470 BUG_ON(dir == DMA_NONE);
3472 domain = find_domain(dev);
3474 return DMA_MAPPING_ERROR;
3476 iommu = domain_get_iommu(domain);
3477 size = aligned_nrpages(paddr, size);
3479 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3484 * Check if DMAR supports zero-length reads on write only
3487 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3488 !cap_zlr(iommu->cap))
3489 prot |= DMA_PTE_READ;
3490 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3491 prot |= DMA_PTE_WRITE;
3493 * paddr..(paddr + size) might cover only part of a page, so map the whole
3494 * page. Note: if two parts of one page are mapped separately, we
3495 * might have two guest_addr mappings to the same host paddr, but this
3496 * is not a big problem
3498 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3499 mm_to_dma_pfn(paddr_pfn), size, prot);
3503 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3504 start_paddr += paddr & ~PAGE_MASK;
3509 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3510 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3511 size, (unsigned long long)paddr, dir);
3512 return DMA_MAPPING_ERROR;
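/*
 * Worked example for the mapping above (illustrative values): mapping
 * paddr = 0x12345678 with size = 0x100 gives paddr_pfn = 0x12345 and
 * aligned_nrpages() = 1, since offset 0x678 + 0x100 still fits in one
 * 4KiB page. If the allocator returns iova_pfn = 0x5000, the handle
 * handed back is start_paddr = (0x5000 << PAGE_SHIFT) + 0x678 =
 * 0x5000678, i.e. the page-aligned IOVA plus the original sub-page
 * offset.
 */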
3515 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3516 unsigned long offset, size_t size,
3517 enum dma_data_direction dir,
3518 unsigned long attrs)
3520 if (iommu_need_mapping(dev))
3521 return __intel_map_single(dev, page_to_phys(page) + offset,
3522 size, dir, *dev->dma_mask);
3523 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3526 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3527 size_t size, enum dma_data_direction dir,
3528 unsigned long attrs)
3530 if (iommu_need_mapping(dev))
3531 return __intel_map_single(dev, phys_addr, size, dir,
3533 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3536 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3538 struct dmar_domain *domain;
3539 unsigned long start_pfn, last_pfn;
3540 unsigned long nrpages;
3541 unsigned long iova_pfn;
3542 struct intel_iommu *iommu;
3543 struct page *freelist;
3544 struct pci_dev *pdev = NULL;
3546 domain = find_domain(dev);
3549 iommu = domain_get_iommu(domain);
3551 iova_pfn = IOVA_PFN(dev_addr);
3553 nrpages = aligned_nrpages(dev_addr, size);
3554 start_pfn = mm_to_dma_pfn(iova_pfn);
3555 last_pfn = start_pfn + nrpages - 1;
3557 if (dev_is_pci(dev))
3558 pdev = to_pci_dev(dev);
3560 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3562 freelist = domain_unmap(domain, start_pfn, last_pfn);
3564 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3565 !has_iova_flush_queue(&domain->iovad)) {
3566 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3567 nrpages, !freelist, 0);
3569 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3570 dma_free_pagelist(freelist);
3572 queue_iova(&domain->iovad, iova_pfn, nrpages,
3573 (unsigned long)freelist);
3575 * queue up the release of the unmap to save roughly 1/6th of the
3576 * CPU time used up by the iotlb flush operation...
3581 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3582 size_t size, enum dma_data_direction dir,
3583 unsigned long attrs)
3585 if (iommu_need_mapping(dev))
3586 intel_unmap(dev, dev_addr, size);
3588 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3591 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3592 size_t size, enum dma_data_direction dir, unsigned long attrs)
3594 if (iommu_need_mapping(dev))
3595 intel_unmap(dev, dev_addr, size);
3598 static void *intel_alloc_coherent(struct device *dev, size_t size,
3599 dma_addr_t *dma_handle, gfp_t flags,
3600 unsigned long attrs)
3602 struct page *page = NULL;
3605 if (!iommu_need_mapping(dev))
3606 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3608 size = PAGE_ALIGN(size);
3609 order = get_order(size);
3611 if (gfpflags_allow_blocking(flags)) {
3612 unsigned int count = size >> PAGE_SHIFT;
3614 page = dma_alloc_from_contiguous(dev, count, order,
3615 flags & __GFP_NOWARN);
3619 page = alloc_pages(flags, order);
3622 memset(page_address(page), 0, size);
3624 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3626 dev->coherent_dma_mask);
3627 if (*dma_handle != DMA_MAPPING_ERROR)
3628 return page_address(page);
3629 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3630 __free_pages(page, order);
3635 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3636 dma_addr_t dma_handle, unsigned long attrs)
3639 struct page *page = virt_to_page(vaddr);
3641 if (!iommu_need_mapping(dev))
3642 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3644 size = PAGE_ALIGN(size);
3645 order = get_order(size);
3647 intel_unmap(dev, dma_handle, size);
3648 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3649 __free_pages(page, order);
3652 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3653 int nelems, enum dma_data_direction dir,
3654 unsigned long attrs)
3656 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3657 unsigned long nrpages = 0;
3658 struct scatterlist *sg;
3661 if (!iommu_need_mapping(dev))
3662 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3664 for_each_sg(sglist, sg, nelems, i) {
3665 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3668 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3671 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3672 enum dma_data_direction dir, unsigned long attrs)
3675 struct dmar_domain *domain;
3678 unsigned long iova_pfn;
3680 struct scatterlist *sg;
3681 unsigned long start_vpfn;
3682 struct intel_iommu *iommu;
3684 BUG_ON(dir == DMA_NONE);
3685 if (!iommu_need_mapping(dev))
3686 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3688 domain = find_domain(dev);
3692 iommu = domain_get_iommu(domain);
3694 for_each_sg(sglist, sg, nelems, i)
3695 size += aligned_nrpages(sg->offset, sg->length);
3697 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3700 sglist->dma_length = 0;
3705 * Check if DMAR supports zero-length reads on write only
3708 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3709 !cap_zlr(iommu->cap))
3710 prot |= DMA_PTE_READ;
3711 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3712 prot |= DMA_PTE_WRITE;
3714 start_vpfn = mm_to_dma_pfn(iova_pfn);
3716 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3717 if (unlikely(ret)) {
3718 dma_pte_free_pagetable(domain, start_vpfn,
3719 start_vpfn + size - 1,
3720 agaw_to_level(domain->agaw) + 1);
3721 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3728 static const struct dma_map_ops intel_dma_ops = {
3729 .alloc = intel_alloc_coherent,
3730 .free = intel_free_coherent,
3731 .map_sg = intel_map_sg,
3732 .unmap_sg = intel_unmap_sg,
3733 .map_page = intel_map_page,
3734 .unmap_page = intel_unmap_page,
3735 .map_resource = intel_map_resource,
3736 .unmap_resource = intel_unmap_resource,
3737 .dma_supported = dma_direct_supported,
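/*
 * Illustrative note (usage, not new behaviour): this table becomes the
 * global dma_map_ops once intel_iommu_init() decides translation is in
 * use, roughly:
 *
 *	// in intel_iommu_init(), after init_dmars() succeeds
 *	dma_ops = &intel_dma_ops;
 *
 * Devices that keep an identity mapping are still routed to the dma-direct
 * helpers by the iommu_need_mapping() check at the top of each callback.
 */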
3740 static inline int iommu_domain_cache_init(void)
3744 iommu_domain_cache = kmem_cache_create("iommu_domain",
3745 sizeof(struct dmar_domain),
3750 if (!iommu_domain_cache) {
3751 pr_err("Couldn't create iommu_domain cache\n");
3758 static inline int iommu_devinfo_cache_init(void)
3762 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3763 sizeof(struct device_domain_info),
3767 if (!iommu_devinfo_cache) {
3768 pr_err("Couldn't create devinfo cache\n");
3775 static int __init iommu_init_mempool(void)
3778 ret = iova_cache_get();
3782 ret = iommu_domain_cache_init();
3786 ret = iommu_devinfo_cache_init();
3790 kmem_cache_destroy(iommu_domain_cache);
3797 static void __init iommu_exit_mempool(void)
3799 kmem_cache_destroy(iommu_devinfo_cache);
3800 kmem_cache_destroy(iommu_domain_cache);
3804 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3806 struct dmar_drhd_unit *drhd;
3810 /* We know that this device on this chipset has its own IOMMU.
3811 * If we find it under a different IOMMU, then the BIOS is lying
3812 * to us. Hope that the IOMMU for this device is actually
3813 * disabled, and it needs no translation...
3815 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3817 /* "can't" happen */
3818 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3821 vtbar &= 0xffff0000;
3823 /* we know that this iommu should be at offset 0xa000 from vtbar */
3824 drhd = dmar_find_matched_drhd_unit(pdev);
3825 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3826 TAINT_FIRMWARE_WORKAROUND,
3827 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3828 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3830 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3832 static void __init init_no_remapping_devices(void)
3834 struct dmar_drhd_unit *drhd;
3838 for_each_drhd_unit(drhd) {
3839 if (!drhd->include_all) {
3840 for_each_active_dev_scope(drhd->devices,
3841 drhd->devices_cnt, i, dev)
3843 /* ignore DMAR unit if no devices exist */
3844 if (i == drhd->devices_cnt)
3849 for_each_active_drhd_unit(drhd) {
3850 if (drhd->include_all)
3853 for_each_active_dev_scope(drhd->devices,
3854 drhd->devices_cnt, i, dev)
3855 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3857 if (i < drhd->devices_cnt)
3860 /* This IOMMU has *only* gfx devices. Either bypass it or
3861 set the gfx_mapped flag, as appropriate */
3862 if (!dmar_map_gfx) {
3864 for_each_active_dev_scope(drhd->devices,
3865 drhd->devices_cnt, i, dev)
3866 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3871 #ifdef CONFIG_SUSPEND
3872 static int init_iommu_hw(void)
3874 struct dmar_drhd_unit *drhd;
3875 struct intel_iommu *iommu = NULL;
3877 for_each_active_iommu(iommu, drhd)
3879 dmar_reenable_qi(iommu);
3881 for_each_iommu(iommu, drhd) {
3882 if (drhd->ignored) {
3884 * we always have to disable PMRs or DMA may fail on
3888 iommu_disable_protect_mem_regions(iommu);
3892 iommu_flush_write_buffer(iommu);
3894 iommu_set_root_entry(iommu);
3896 iommu->flush.flush_context(iommu, 0, 0, 0,
3897 DMA_CCMD_GLOBAL_INVL);
3898 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3899 iommu_enable_translation(iommu);
3900 iommu_disable_protect_mem_regions(iommu);
3906 static void iommu_flush_all(void)
3908 struct dmar_drhd_unit *drhd;
3909 struct intel_iommu *iommu;
3911 for_each_active_iommu(iommu, drhd) {
3912 iommu->flush.flush_context(iommu, 0, 0, 0,
3913 DMA_CCMD_GLOBAL_INVL);
3914 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3915 DMA_TLB_GLOBAL_FLUSH);
3919 static int iommu_suspend(void)
3921 struct dmar_drhd_unit *drhd;
3922 struct intel_iommu *iommu = NULL;
3925 for_each_active_iommu(iommu, drhd) {
3926 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3928 if (!iommu->iommu_state)
3934 for_each_active_iommu(iommu, drhd) {
3935 iommu_disable_translation(iommu);
3937 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3939 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3940 readl(iommu->reg + DMAR_FECTL_REG);
3941 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3942 readl(iommu->reg + DMAR_FEDATA_REG);
3943 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3944 readl(iommu->reg + DMAR_FEADDR_REG);
3945 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3946 readl(iommu->reg + DMAR_FEUADDR_REG);
3948 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3953 for_each_active_iommu(iommu, drhd)
3954 kfree(iommu->iommu_state);
3959 static void iommu_resume(void)
3961 struct dmar_drhd_unit *drhd;
3962 struct intel_iommu *iommu = NULL;
3965 if (init_iommu_hw()) {
3967 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3969 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3973 for_each_active_iommu(iommu, drhd) {
3975 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3977 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3978 iommu->reg + DMAR_FECTL_REG);
3979 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3980 iommu->reg + DMAR_FEDATA_REG);
3981 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3982 iommu->reg + DMAR_FEADDR_REG);
3983 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3984 iommu->reg + DMAR_FEUADDR_REG);
3986 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3989 for_each_active_iommu(iommu, drhd)
3990 kfree(iommu->iommu_state);
3993 static struct syscore_ops iommu_syscore_ops = {
3994 .resume = iommu_resume,
3995 .suspend = iommu_suspend,
3998 static void __init init_iommu_pm_ops(void)
4000 register_syscore_ops(&iommu_syscore_ops);
4004 static inline void init_iommu_pm_ops(void) {}
4005 #endif /* CONFIG_PM */
4007 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4009 struct acpi_dmar_reserved_memory *rmrr;
4010 struct dmar_rmrr_unit *rmrru;
4012 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4016 rmrru->hdr = header;
4017 rmrr = (struct acpi_dmar_reserved_memory *)header;
4018 rmrru->base_address = rmrr->base_address;
4019 rmrru->end_address = rmrr->end_address;
4021 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4022 ((void *)rmrr) + rmrr->header.length,
4023 &rmrru->devices_cnt);
4024 if (rmrru->devices_cnt && rmrru->devices == NULL)
4027 list_add(&rmrru->list, &dmar_rmrr_units);
4036 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4038 struct dmar_atsr_unit *atsru;
4039 struct acpi_dmar_atsr *tmp;
4041 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4042 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4043 if (atsr->segment != tmp->segment)
4045 if (atsr->header.length != tmp->header.length)
4047 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4054 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4056 struct acpi_dmar_atsr *atsr;
4057 struct dmar_atsr_unit *atsru;
4059 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4062 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4063 atsru = dmar_find_atsr(atsr);
4067 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4072 * If memory is allocated from slab by ACPI _DSM method, we need to
4073 * copy the memory content because the memory buffer will be freed
4076 atsru->hdr = (void *)(atsru + 1);
4077 memcpy(atsru->hdr, hdr, hdr->length);
4078 atsru->include_all = atsr->flags & 0x1;
4079 if (!atsru->include_all) {
4080 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4081 (void *)atsr + atsr->header.length,
4082 &atsru->devices_cnt);
4083 if (atsru->devices_cnt && atsru->devices == NULL) {
4089 list_add_rcu(&atsru->list, &dmar_atsr_units);
4094 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4096 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4100 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4102 struct acpi_dmar_atsr *atsr;
4103 struct dmar_atsr_unit *atsru;
4105 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4106 atsru = dmar_find_atsr(atsr);
4108 list_del_rcu(&atsru->list);
4110 intel_iommu_free_atsr(atsru);
4116 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4120 struct acpi_dmar_atsr *atsr;
4121 struct dmar_atsr_unit *atsru;
4123 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4124 atsru = dmar_find_atsr(atsr);
4128 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4129 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4137 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4140 struct intel_iommu *iommu = dmaru->iommu;
4142 if (g_iommus[iommu->seq_id])
4145 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4146 pr_warn("%s: Doesn't support hardware pass through.\n",
4150 if (!ecap_sc_support(iommu->ecap) &&
4151 domain_update_iommu_snooping(iommu)) {
4152 pr_warn("%s: Doesn't support snooping.\n",
4156 sp = domain_update_iommu_superpage(iommu) - 1;
4157 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4158 pr_warn("%s: Doesn't support large page.\n",
4164 * Disable translation if already enabled prior to OS handover.
4166 if (iommu->gcmd & DMA_GCMD_TE)
4167 iommu_disable_translation(iommu);
4169 g_iommus[iommu->seq_id] = iommu;
4170 ret = iommu_init_domains(iommu);
4172 ret = iommu_alloc_root_entry(iommu);
4176 #ifdef CONFIG_INTEL_IOMMU_SVM
4177 if (pasid_supported(iommu))
4178 intel_svm_init(iommu);
4181 if (dmaru->ignored) {
4183 * we always have to disable PMRs or DMA may fail on this device
4186 iommu_disable_protect_mem_regions(iommu);
4190 intel_iommu_init_qi(iommu);
4191 iommu_flush_write_buffer(iommu);
4193 #ifdef CONFIG_INTEL_IOMMU_SVM
4194 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4195 ret = intel_svm_enable_prq(iommu);
4200 ret = dmar_set_interrupt(iommu);
4204 iommu_set_root_entry(iommu);
4205 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4206 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4207 iommu_enable_translation(iommu);
4209 iommu_disable_protect_mem_regions(iommu);
4213 disable_dmar_iommu(iommu);
4215 free_dmar_iommu(iommu);
4219 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4222 struct intel_iommu *iommu = dmaru->iommu;
4224 if (!intel_iommu_enabled)
4230 ret = intel_iommu_add(dmaru);
4232 disable_dmar_iommu(iommu);
4233 free_dmar_iommu(iommu);
4239 static void intel_iommu_free_dmars(void)
4241 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4242 struct dmar_atsr_unit *atsru, *atsr_n;
4244 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4245 list_del(&rmrru->list);
4246 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4250 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4251 list_del(&atsru->list);
4252 intel_iommu_free_atsr(atsru);
4256 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4259 struct pci_bus *bus;
4260 struct pci_dev *bridge = NULL;
4262 struct acpi_dmar_atsr *atsr;
4263 struct dmar_atsr_unit *atsru;
4265 dev = pci_physfn(dev);
4266 for (bus = dev->bus; bus; bus = bus->parent) {
4268 /* If it's an integrated device, allow ATS */
4271 /* Connected via non-PCIe: no ATS */
4272 if (!pci_is_pcie(bridge) ||
4273 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4275 /* If we found the root port, look it up in the ATSR */
4276 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4281 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4282 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4283 if (atsr->segment != pci_domain_nr(dev->bus))
4286 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4287 if (tmp == &bridge->dev)
4290 if (atsru->include_all)
4300 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4303 struct dmar_rmrr_unit *rmrru;
4304 struct dmar_atsr_unit *atsru;
4305 struct acpi_dmar_atsr *atsr;
4306 struct acpi_dmar_reserved_memory *rmrr;
4308 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4311 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4312 rmrr = container_of(rmrru->hdr,
4313 struct acpi_dmar_reserved_memory, header);
4314 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4315 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4316 ((void *)rmrr) + rmrr->header.length,
4317 rmrr->segment, rmrru->devices,
4318 rmrru->devices_cnt);
4321 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4322 dmar_remove_dev_scope(info, rmrr->segment,
4323 rmrru->devices, rmrru->devices_cnt);
4327 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4328 if (atsru->include_all)
4331 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4332 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4333 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4334 (void *)atsr + atsr->header.length,
4335 atsr->segment, atsru->devices,
4336 atsru->devices_cnt);
4341 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4342 if (dmar_remove_dev_scope(info, atsr->segment,
4343 atsru->devices, atsru->devices_cnt))
4351 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4352 unsigned long val, void *v)
4354 struct memory_notify *mhp = v;
4355 unsigned long long start, end;
4356 unsigned long start_vpfn, last_vpfn;
4359 case MEM_GOING_ONLINE:
4360 start = mhp->start_pfn << PAGE_SHIFT;
4361 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4362 if (iommu_domain_identity_map(si_domain, start, end)) {
4363 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4370 case MEM_CANCEL_ONLINE:
4371 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4372 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4373 while (start_vpfn <= last_vpfn) {
4375 struct dmar_drhd_unit *drhd;
4376 struct intel_iommu *iommu;
4377 struct page *freelist;
4379 iova = find_iova(&si_domain->iovad, start_vpfn);
4381 pr_debug("Failed get IOVA for PFN %lx\n",
4386 iova = split_and_remove_iova(&si_domain->iovad, iova,
4387 start_vpfn, last_vpfn);
4389 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4390 start_vpfn, last_vpfn);
4394 freelist = domain_unmap(si_domain, iova->pfn_lo,
4398 for_each_active_iommu(iommu, drhd)
4399 iommu_flush_iotlb_psi(iommu, si_domain,
4400 iova->pfn_lo, iova_size(iova),
4403 dma_free_pagelist(freelist);
4405 start_vpfn = iova->pfn_hi + 1;
4406 free_iova_mem(iova);
4414 static struct notifier_block intel_iommu_memory_nb = {
4415 .notifier_call = intel_iommu_memory_notifier,
4419 static void free_all_cpu_cached_iovas(unsigned int cpu)
4423 for (i = 0; i < g_num_of_iommus; i++) {
4424 struct intel_iommu *iommu = g_iommus[i];
4425 struct dmar_domain *domain;
4431 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4432 domain = get_iommu_domain(iommu, (u16)did);
4436 free_cpu_cached_iovas(cpu, &domain->iovad);
4441 static int intel_iommu_cpu_dead(unsigned int cpu)
4443 free_all_cpu_cached_iovas(cpu);
4447 static void intel_disable_iommus(void)
4449 struct intel_iommu *iommu = NULL;
4450 struct dmar_drhd_unit *drhd;
4452 for_each_iommu(iommu, drhd)
4453 iommu_disable_translation(iommu);
4456 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4458 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4460 return container_of(iommu_dev, struct intel_iommu, iommu);
4463 static ssize_t intel_iommu_show_version(struct device *dev,
4464 struct device_attribute *attr,
4467 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4468 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4469 return sprintf(buf, "%d:%d\n",
4470 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4472 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4474 static ssize_t intel_iommu_show_address(struct device *dev,
4475 struct device_attribute *attr,
4478 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4479 return sprintf(buf, "%llx\n", iommu->reg_phys);
4481 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4483 static ssize_t intel_iommu_show_cap(struct device *dev,
4484 struct device_attribute *attr,
4487 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4488 return sprintf(buf, "%llx\n", iommu->cap);
4490 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4492 static ssize_t intel_iommu_show_ecap(struct device *dev,
4493 struct device_attribute *attr,
4496 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4497 return sprintf(buf, "%llx\n", iommu->ecap);
4499 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4501 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4502 struct device_attribute *attr,
4505 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4506 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4508 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4510 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4511 struct device_attribute *attr,
4514 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4515 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4516 cap_ndoms(iommu->cap)));
4518 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4520 static struct attribute *intel_iommu_attrs[] = {
4521 &dev_attr_version.attr,
4522 &dev_attr_address.attr,
4524 &dev_attr_ecap.attr,
4525 &dev_attr_domains_supported.attr,
4526 &dev_attr_domains_used.attr,
4530 static struct attribute_group intel_iommu_group = {
4531 .name = "intel-iommu",
4532 .attrs = intel_iommu_attrs,
4535 const struct attribute_group *intel_iommu_groups[] = {
4540 static int __init platform_optin_force_iommu(void)
4542 struct pci_dev *pdev = NULL;
4543 bool has_untrusted_dev = false;
4545 if (!dmar_platform_optin() || no_platform_optin)
4548 for_each_pci_dev(pdev) {
4549 if (pdev->untrusted) {
4550 has_untrusted_dev = true;
4555 if (!has_untrusted_dev)
4558 if (no_iommu || dmar_disabled)
4559 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4562 * If Intel-IOMMU is disabled by default, we will apply identity
4563 * map for all devices except those marked as being untrusted.
4566 iommu_identity_mapping |= IDENTMAP_ALL;
4569 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4577 static int __init probe_acpi_namespace_devices(void)
4579 struct dmar_drhd_unit *drhd;
4580 /* To avoid a -Wunused-but-set-variable warning. */
4581 struct intel_iommu *iommu __maybe_unused;
4585 for_each_active_iommu(iommu, drhd) {
4586 for_each_active_dev_scope(drhd->devices,
4587 drhd->devices_cnt, i, dev) {
4588 struct acpi_device_physical_node *pn;
4589 struct iommu_group *group;
4590 struct acpi_device *adev;
4592 if (dev->bus != &acpi_bus_type)
4595 adev = to_acpi_device(dev);
4596 mutex_lock(&adev->physical_node_lock);
4597 list_for_each_entry(pn,
4598 &adev->physical_node_list, node) {
4599 group = iommu_group_get(pn->dev);
4601 iommu_group_put(group);
4605 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4606 ret = iommu_probe_device(pn->dev);
4610 mutex_unlock(&adev->physical_node_lock);
4620 int __init intel_iommu_init(void)
4623 struct dmar_drhd_unit *drhd;
4624 struct intel_iommu *iommu;
4627 * Intel IOMMU is required for a TXT/tboot launch or platform
4628 * opt in, so enforce that.
4630 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4632 if (iommu_init_mempool()) {
4634 panic("tboot: Failed to initialize iommu memory\n");
4638 down_write(&dmar_global_lock);
4639 if (dmar_table_init()) {
4641 panic("tboot: Failed to initialize DMAR table\n");
4645 if (dmar_dev_scope_init() < 0) {
4647 panic("tboot: Failed to initialize DMAR device scope\n");
4651 up_write(&dmar_global_lock);
4654 * The bus notifier takes the dmar_global_lock, so lockdep will
4655 * complain later when we register it under the lock.
4657 dmar_register_bus_notifier();
4659 down_write(&dmar_global_lock);
4661 if (no_iommu || dmar_disabled) {
4663 * We exit the function here to ensure the IOMMUs' remapping and
4664 * mempool aren't set up, which means that the IOMMUs' PMRs
4665 * won't be disabled via the call to init_dmars(). So disable
4666 * it explicitly here. The PMRs were setup by tboot prior to
4667 * calling SENTER, but the kernel is expected to reset/tear
4670 if (intel_iommu_tboot_noforce) {
4671 for_each_iommu(iommu, drhd)
4672 iommu_disable_protect_mem_regions(iommu);
4676 * Make sure the IOMMUs are switched off, even when we
4677 * boot into a kexec kernel and the previous kernel left
4680 intel_disable_iommus();
4684 if (list_empty(&dmar_rmrr_units))
4685 pr_info("No RMRR found\n");
4687 if (list_empty(&dmar_atsr_units))
4688 pr_info("No ATSR found\n");
4690 if (dmar_init_reserved_ranges()) {
4692 panic("tboot: Failed to reserve iommu ranges\n");
4693 goto out_free_reserved_range;
4697 intel_iommu_gfx_mapped = 1;
4699 init_no_remapping_devices();
4704 panic("tboot: Failed to initialize DMARs\n");
4705 pr_err("Initialization failed\n");
4706 goto out_free_reserved_range;
4708 up_write(&dmar_global_lock);
4710 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4713 dma_ops = &intel_dma_ops;
4715 init_iommu_pm_ops();
4717 for_each_active_iommu(iommu, drhd) {
4718 iommu_device_sysfs_add(&iommu->iommu, NULL,
4721 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4722 iommu_device_register(&iommu->iommu);
4725 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4726 if (si_domain && !hw_pass_through)
4727 register_memory_notifier(&intel_iommu_memory_nb);
4728 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4729 intel_iommu_cpu_dead);
4731 down_read(&dmar_global_lock);
4732 if (probe_acpi_namespace_devices())
4733 pr_warn("ACPI name space devices didn't probe correctly\n");
4734 up_read(&dmar_global_lock);
4736 /* Finally, we enable the DMA remapping hardware. */
4737 for_each_iommu(iommu, drhd) {
4738 if (!drhd->ignored && !translation_pre_enabled(iommu))
4739 iommu_enable_translation(iommu);
4741 iommu_disable_protect_mem_regions(iommu);
4743 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4745 intel_iommu_enabled = 1;
4746 intel_iommu_debugfs_init();
4750 out_free_reserved_range:
4751 put_iova_domain(&reserved_iova_list);
4753 intel_iommu_free_dmars();
4754 up_write(&dmar_global_lock);
4755 iommu_exit_mempool();
4759 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4761 struct dmar_domain *domain;
4762 struct intel_iommu *iommu;
4763 unsigned long flags;
4765 assert_spin_locked(&device_domain_lock);
4770 iommu = info->iommu;
4771 domain = info->domain;
4774 if (dev_is_pci(info->dev) && sm_supported(iommu))
4775 intel_pasid_tear_down_entry(iommu, info->dev,
4778 iommu_disable_dev_iotlb(info);
4779 domain_context_clear_one(iommu, info->bus, info->devfn);
4780 intel_pasid_free_table(info->dev);
4783 unlink_domain_info(info);
4785 spin_lock_irqsave(&iommu->lock, flags);
4786 domain_detach_iommu(domain, iommu);
4787 spin_unlock_irqrestore(&iommu->lock, flags);
4789 /* free the private domain */
4790 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4791 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4792 domain_exit(info->domain);
4794 free_devinfo_mem(info);
4797 static void dmar_remove_one_dev_info(struct device *dev)
4799 struct device_domain_info *info;
4800 unsigned long flags;
4802 spin_lock_irqsave(&device_domain_lock, flags);
4803 info = dev->archdata.iommu;
4804 __dmar_remove_one_dev_info(info);
4805 spin_unlock_irqrestore(&device_domain_lock, flags);
4808 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4812 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4813 domain_reserve_special_ranges(domain);
4815 /* calculate AGAW */
4816 domain->gaw = guest_width;
4817 adjust_width = guestwidth_to_adjustwidth(guest_width);
4818 domain->agaw = width_to_agaw(adjust_width);
4820 domain->iommu_coherency = 0;
4821 domain->iommu_snooping = 0;
4822 domain->iommu_superpage = 0;
4823 domain->max_addr = 0;
4825 /* always allocate the top pgd */
4826 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4829 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	switch (type) {
	case IOMMU_DOMAIN_DMA:
	/* fallthrough */
	case IOMMU_DOMAIN_UNMANAGED:
		dmar_domain = alloc_domain(0);
		if (!dmar_domain) {
			pr_err("Can't allocate dmar_domain\n");
			return NULL;
		}
		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
			pr_err("Domain initialization failed\n");
			domain_exit(dmar_domain);
			return NULL;
		}

		if (type == IOMMU_DOMAIN_DMA &&
		    init_iova_flush_queue(&dmar_domain->iovad,
					  iommu_flush_iova, iova_entry_free)) {
			pr_warn("iova flush queue initialization failed\n");
			intel_iommu_strict = 1;
		}

		domain_update_iommu_cap(dmar_domain);

		domain = &dmar_domain->domain;
		domain->geometry.aperture_start = 0;
		domain->geometry.aperture_end =
				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
		domain->geometry.force_aperture = true;

		return domain;
	case IOMMU_DOMAIN_IDENTITY:
		return &si_domain->domain;
	default:
		return NULL;
	}

	return NULL;
}

static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	if (domain != &si_domain->domain)
		domain_exit(to_dmar_domain(domain));
}
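/*
 * Illustrative sketch, not used by this driver: roughly how an external
 * consumer (a VFIO-style user, for instance) reaches the
 * intel_iommu_domain_alloc()/intel_iommu_domain_free() callbacks above
 * through the generic IOMMU core API. The function name below is made up
 * purely for illustration.
 */
static __maybe_unused struct iommu_domain *example_alloc_unmanaged_domain(void)
{
	struct iommu_domain *domain;

	/* Allocates an IOMMU_DOMAIN_UNMANAGED domain for the PCI bus. */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain)
		return NULL;

	/*
	 * The aperture reported by the driver bounds the IOVA range the
	 * caller may map into this domain.
	 */
	pr_debug("domain aperture: [0x%llx - 0x%llx]\n",
		 (u64)domain->geometry.aperture_start,
		 (u64)domain->geometry.aperture_end);

	return domain;	/* released later with iommu_domain_free() */
}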
/*
 * Check whether a @domain could be attached to the @dev through the
 * aux-domain attach/detach APIs.
 */
static inline bool
is_aux_domain(struct device *dev, struct iommu_domain *domain)
{
	struct device_domain_info *info = dev->archdata.iommu;

	return info && info->auxd_enabled &&
			domain->type == IOMMU_DOMAIN_UNMANAGED;
}
static void auxiliary_link_device(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info = dev->archdata.iommu;

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	domain->auxd_refcnt++;
	list_add(&domain->auxd, &info->auxiliary_domains);
}

static void auxiliary_unlink_device(struct dmar_domain *domain,
				    struct device *dev)
{
	struct device_domain_info *info = dev->archdata.iommu;

	assert_spin_locked(&device_domain_lock);
	if (WARN_ON(!info))
		return;

	list_del(&domain->auxd);
	domain->auxd_refcnt--;

	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		intel_pasid_free_id(domain->default_pasid);
}
static int aux_domain_add_dev(struct dmar_domain *domain,
			      struct device *dev)
{
	int ret;
	u8 bus, devfn;
	unsigned long flags;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	if (domain->default_pasid <= 0) {
		int pasid;

		pasid = intel_pasid_alloc_id(domain, PASID_MIN,
					     pci_max_pasids(to_pci_dev(dev)),
					     GFP_KERNEL);
		if (pasid <= 0) {
			pr_err("Can't allocate default pasid\n");
			return -ENODEV;
		}
		domain->default_pasid = pasid;
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	/*
	 * iommu->lock must be held to attach domain to iommu and setup the
	 * pasid entry for second level translation.
	 */
	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	if (ret)
		goto attach_failed;

	/* Setup the PASID entry for mediated devices: */
	ret = intel_pasid_setup_second_level(iommu, domain, dev,
					     domain->default_pasid);
	if (ret)
		goto table_failed;
	spin_unlock(&iommu->lock);

	auxiliary_link_device(domain, dev);

	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;

table_failed:
	domain_detach_iommu(domain, iommu);
attach_failed:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);
	if (!domain->auxd_refcnt && domain->default_pasid > 0)
		intel_pasid_free_id(domain->default_pasid);

	return ret;
}
static void aux_domain_remove_dev(struct dmar_domain *domain,
				  struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;

	if (!is_aux_domain(dev, &domain->domain))
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	iommu = info->iommu;

	auxiliary_unlink_device(domain, dev);

	spin_lock(&iommu->lock);
	intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
	domain_detach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static int prepare_domain_attach_device(struct iommu_domain *domain,
					struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return 0;
}
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	int ret;

	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
	    device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	if (is_aux_domain(dev, domain))
		return -EPERM;

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain)
			dmar_remove_one_dev_info(dev);
	}

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return domain_add_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
					 struct device *dev)
{
	int ret;

	if (!is_aux_domain(dev, domain))
		return -EPERM;

	ret = prepare_domain_attach_device(domain, dev);
	if (ret)
		return ret;

	return aux_domain_add_dev(to_dmar_domain(domain), dev);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(dev);
}

static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
					  struct device *dev)
{
	aux_domain_remove_dev(to_dmar_domain(domain), dev);
}
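/*
 * Illustrative sketch, not part of the driver: the expected calling
 * sequence for the aux-domain callbacks above, as seen from a mediated
 * device (mdev) parent driver going through the generic IOMMU core API.
 * The function name and the error-handling policy are made up for
 * illustration; the iommu_* calls themselves are the real core API.
 */
static int __maybe_unused example_aux_domain_flow(struct device *dev)
{
	struct iommu_domain *domain;
	int pasid, ret;

	/* 1. Opt the physical device in to auxiliary domain support. */
	ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
	if (ret)
		return ret;

	/* 2. Allocate an unmanaged domain to back one mediated device. */
	domain = iommu_domain_alloc(&pci_bus_type);
	if (!domain) {
		iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
		return -ENOMEM;
	}

	/* 3. Attach it as an aux domain; this lands in aux_domain_add_dev(). */
	ret = iommu_aux_attach_device(domain, dev);
	if (ret)
		goto out_free;

	/* 4. The default PASID tags this domain's DMA on the wire. */
	pasid = iommu_aux_get_pasid(domain, dev);
	pr_debug("aux domain bound to %s with PASID %d\n",
		 dev_name(dev), pasid);

	/* 5. Tear-down mirrors the setup. */
	iommu_aux_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
	return ret;
}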
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return 0;

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
		return 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
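/*
 * Illustrative sketch, not part of the driver: a minimal map/lookup/unmap
 * round trip through the generic IOMMU API, which exercises the
 * intel_iommu_map(), intel_iommu_iova_to_phys() and intel_iommu_unmap()
 * callbacks above. The chosen IOVA is made up for illustration; real
 * callers size and align these to their own needs.
 */
static int __maybe_unused example_map_one_page(struct iommu_domain *domain,
					       phys_addr_t paddr)
{
	unsigned long iova = 0x100000;	/* arbitrary, page aligned */
	int ret;

	ret = iommu_map(domain, iova, paddr, VTD_PAGE_SIZE,
			IOMMU_READ | IOMMU_WRITE);
	if (ret)
		return ret;

	/* The walk in intel_iommu_iova_to_phys() should return @paddr. */
	WARN_ON(iommu_iova_to_phys(domain, iova) != paddr);

	/* Unmapping may cover more than requested for superpage mappings. */
	iommu_unmap(domain, iova, VTD_PAGE_SIZE);
	return 0;
}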
static inline bool scalable_mode_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!sm_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static inline bool iommu_pasid_support(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool ret = true;

	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!pasid_supported(iommu)) {
			ret = false;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
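/*
 * Illustrative sketch, not part of the driver: callers query the
 * capabilities reported by intel_iommu_capable() through the bus-level
 * helper. Using the answer to decide on explicit cache maintenance is
 * just an example policy, not something this driver mandates.
 */
static bool __maybe_unused example_needs_explicit_cache_flush(void)
{
	/* True when DMA snooping cannot be guaranteed by the IOMMU. */
	return !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
}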
static int intel_iommu_add_device(struct device *dev)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	if (translation_pre_enabled(iommu))
		dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;

	group = iommu_group_get_for_dev(dev);

	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);

	domain = iommu_get_domain_for_dev(dev);
	dmar_domain = to_dmar_domain(domain);
	if (domain->type == IOMMU_DOMAIN_DMA) {
		if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
			ret = iommu_request_dm_for_dev(dev);
			if (ret) {
				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
				domain_add_dev_info(si_domain, dev);
				dev_info(dev,
					 "Device uses a private identity domain.\n");
			}
		}
	} else {
		if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
			ret = iommu_request_dma_domain_for_dev(dev);
			if (ret) {
				dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
				if (!get_private_domain_for_dev(dev)) {
					dev_warn(dev,
						 "Failed to get a private domain.\n");
					return -ENOMEM;
				}

				dev_info(dev,
					 "Device uses a private dma domain.\n");
			}
		}
	}

	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	down_read(&dmar_global_lock);
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			struct iommu_resv_region *resv;
			enum iommu_resv_type type;
			size_t length;

			if (i_dev != device &&
			    !is_downstream_to_pci_bridge(device, i_dev))
				continue;

			length = rmrr->end_address - rmrr->base_address + 1;

			type = device_rmrr_is_relaxable(device) ?
				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;

			resv = iommu_alloc_resv_region(rmrr->base_address,
						       length, prot, type);
			if (!resv)
				break;

			list_add_tail(&resv->list, head);
		}
	}
	up_read(&dmar_global_lock);

#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
	if (dev_is_pci(device)) {
		struct pci_dev *pdev = to_pci_dev(device);

		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
			reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
						      IOMMU_RESV_DIRECT_RELAXABLE);
			if (reg)
				list_add_tail(&reg->list, head);
		}
	}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list)
		kfree(entry);
}
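/*
 * Illustrative sketch, not part of the driver: how a caller enumerates the
 * reserved regions published by intel_iommu_get_resv_regions() above, for
 * example to carve them out of its own IOVA allocator. The logging is made
 * up for illustration.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
	struct iommu_resv_region *region;
	LIST_HEAD(resv_regions);

	iommu_get_resv_regions(dev, &resv_regions);
	list_for_each_entry(region, &resv_regions, list)
		dev_info(dev, "reserved: [0x%llx - 0x%llx] type %d\n",
			 (u64)region->start,
			 (u64)(region->start + region->length - 1),
			 region->type);
	iommu_put_resv_regions(dev, &resv_regions);
}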
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = find_domain(dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu,
					   domain->iommu_did[iommu->seq_id],
					   PCI_DEVID(info->bus, info->devfn),
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
static void intel_iommu_apply_resv_region(struct device *dev,
					  struct iommu_domain *domain,
					  struct iommu_resv_region *region)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	unsigned long start, end;

	start = IOVA_PFN(region->start);
	end = IOVA_PFN(region->start + region->length - 1);

	WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
}
#ifdef CONFIG_INTEL_IOMMU_SVM
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
static int intel_iommu_enable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	u8 bus, devfn;
	int ret;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu || dmar_disabled)
		return -EINVAL;

	if (!sm_supported(iommu) || !pasid_supported(iommu))
		return -EINVAL;

	ret = intel_iommu_enable_pasid(iommu, dev);
	if (ret)
		return -ENODEV;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	info->auxd_enabled = 1;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}

static int intel_iommu_disable_auxd(struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	if (!WARN_ON(!info))
		info->auxd_enabled = 0;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
/*
 * A PCI express designated vendor specific extended capability is defined
 * in section 3.7 of the Intel scalable I/O virtualization technical spec
 * for system software and tools to detect endpoint devices supporting the
 * Intel scalable IO virtualization without host driver dependency.
 *
 * Returns the address of the matching extended capability structure within
 * the device's PCI configuration space or 0 if the device does not support
 * it.
 */
static int siov_find_pci_dvsec(struct pci_dev *pdev)
{
	int pos;
	u16 vendor, id;

	/* Walk every DVSEC (extended capability ID 0x23) in config space. */
	pos = pci_find_next_ext_capability(pdev, 0, 0x23);
	while (pos) {
		/* DVSEC header 1 carries the vendor ID, header 2 the DVSEC ID. */
		pci_read_config_word(pdev, pos + 4, &vendor);
		pci_read_config_word(pdev, pos + 8, &id);
		if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
			return pos;

		pos = pci_find_next_ext_capability(pdev, pos, 0x23);
	}

	return 0;
}
static bool
intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX) {
		int ret;

		if (!dev_is_pci(dev) || dmar_disabled ||
		    !scalable_mode_support() || !iommu_pasid_support())
			return false;

		ret = pci_pasid_features(to_pci_dev(dev));
		if (ret < 0)
			return false;

		return !!siov_find_pci_dvsec(to_pci_dev(dev));
	}

	return false;
}

static int
intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_enable_auxd(dev);

	return -ENODEV;
}

static int
intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
{
	if (feat == IOMMU_DEV_FEAT_AUX)
		return intel_iommu_disable_auxd(dev);

	return -ENODEV;
}

static bool
intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
{
	struct device_domain_info *info = dev->archdata.iommu;

	if (feat == IOMMU_DEV_FEAT_AUX)
		return scalable_mode_support() && info && info->auxd_enabled;

	return false;
}

static int
intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);

	return dmar_domain->default_pasid > 0 ?
			dmar_domain->default_pasid : -EINVAL;
}

static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
					   struct device *dev)
{
	return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
}
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.aux_attach_dev		= intel_iommu_aux_attach_device,
	.aux_detach_dev		= intel_iommu_aux_detach_device,
	.aux_get_pasid		= intel_iommu_aux_get_pasid,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.apply_resv_region	= intel_iommu_apply_resv_region,
	.device_group		= pci_device_group,
	.dev_has_feat		= intel_iommu_dev_has_feat,
	.dev_feat_enabled	= intel_iommu_dev_feat_enabled,
	.dev_enable_feat	= intel_iommu_dev_enable_feat,
	.dev_disable_feat	= intel_iommu_dev_disable_feat,
	.is_attach_deferred	= intel_iommu_is_attach_deferred,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);

static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",