1 // SPDX-License-Identifier: GPL-2.0-only
3 * Copyright © 2006-2014 Intel Corporation.
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
13 #define pr_fmt(fmt) "DMAR: " fmt
14 #define dev_fmt(fmt) pr_fmt(fmt)
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
51 #define ROOT_SIZE VTD_PAGE_SIZE
52 #define CONTEXT_SIZE VTD_PAGE_SIZE
/*
 * PCI device classification helpers.  @pdev is any pointer expression to a
 * device object exposing ->class / ->vendor / ->device; it is parenthesized
 * in every macro so that arguments such as &dev expand correctly.
 */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_USB_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_SERIAL_USB)
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* True for the device with vendor ID 0x8086 and device ID 0x3a3e. */
#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
59 #define IOAPIC_RANGE_START (0xfee00000)
60 #define IOAPIC_RANGE_END (0xfeefffff)
61 #define IOVA_START_ADDR (0x1000)
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
/*
 * Largest page frame number / largest byte address expressible with an
 * address width of @gaw bits, computed in 64-bit arithmetic.  @gaw is
 * parenthesized so that compound arguments (e.g. a conditional expression)
 * are used as the shift count as a whole.
 */
#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
74 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN (1)
80 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
82 /* page table handling */
83 #define LEVEL_STRIDE (9)
84 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
87 * This bitmap is used to advertise the page sizes our hardware supports
88 * to the IOMMU core, which will then use this information to split
89 * physically contiguous memory regions it is mapping into page sizes
92 * Traditionally the IOMMU core just handed us the mappings directly,
93 * after making sure the size is an order of a 4KiB page and that the
94 * mapping has natural alignment.
96 * To retain this behavior, we currently advertise that we support
97 * all page sizes that are an order of 4KiB.
99 * If at some point we'd like to utilize the IOMMU core's new behavior,
100 * we could change this to advertise the real page sizes we support.
102 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
104 static inline int agaw_to_level(int agaw)
109 static inline int agaw_to_width(int agaw)
111 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 static inline int width_to_agaw(int width)
116 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 static inline unsigned int level_to_offset_bits(int level)
121 return (level - 1) * LEVEL_STRIDE;
124 static inline int pfn_level_offset(unsigned long pfn, int level)
126 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
/* Mask that clears the pfn bits below the index field of @level. */
static inline unsigned long level_mask(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return ~0UL << shift;
}
/* Number of pfns spanned by a single entry at page-table @level. */
static inline unsigned long level_size(int level)
{
	unsigned int shift = level_to_offset_bits(level);

	return 1UL << shift;
}
/* Round @pfn up to the next multiple of level_size(@level). */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long round_up = level_size(level) - 1;

	return (pfn + round_up) & level_mask(level);
}
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
146 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
150 are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
153 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
158 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
/* VT-d page frame number of the first VT-d page inside @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	unsigned long mm_pfn = page_to_pfn(pg);

	return mm_to_dma_pfn(mm_pfn);
}
/* VT-d page frame number for the page backing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	struct page *pg = virt_to_page(p);

	return page_to_dma_pfn(pg);
}
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
176 * set to 1 to panic the kernel if VT-d can't be enabled successfully
177 * (used when kernel is launched w/ TXT)
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
194 return re->lo & VTD_PAGE_MASK;
198 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
206 return re->hi & VTD_PAGE_MASK;
209 static inline void context_clear_pasid_enable(struct context_entry *context)
211 context->lo &= ~(1ULL << 11);
214 static inline bool context_pasid_enabled(struct context_entry *context)
216 return !!(context->lo & (1ULL << 11));
219 static inline void context_set_copied(struct context_entry *context)
221 context->hi |= (1ull << 3);
224 static inline bool context_copied(struct context_entry *context)
226 return !!(context->hi & (1ULL << 3));
229 static inline bool __context_present(struct context_entry *context)
231 return (context->lo & 1);
234 bool context_present(struct context_entry *context)
236 return context_pasid_enabled(context) ?
237 __context_present(context) :
238 __context_present(context) && !context_copied(context);
241 static inline void context_set_present(struct context_entry *context)
246 static inline void context_set_fault_enable(struct context_entry *context)
248 context->lo &= (((u64)-1) << 2) | 1;
251 static inline void context_set_translation_type(struct context_entry *context,
254 context->lo &= (((u64)-1) << 4) | 3;
255 context->lo |= (value & 3) << 2;
258 static inline void context_set_address_root(struct context_entry *context,
261 context->lo &= ~VTD_PAGE_MASK;
262 context->lo |= value & VTD_PAGE_MASK;
265 static inline void context_set_address_width(struct context_entry *context,
268 context->hi |= value & 7;
271 static inline void context_set_domain_id(struct context_entry *context,
274 context->hi |= (value & ((1 << 16) - 1)) << 8;
277 static inline int context_domain_id(struct context_entry *c)
279 return((c->hi >> 8) & 0xffff);
282 static inline void context_clear_entry(struct context_entry *context)
289 * This domain is a statically identity mapping domain.
290 * 1. This domain creates a static 1:1 mapping to all usable memory.
291 * 2. It maps to each iommu if successful.
292 * 3. Each iommu maps to this domain if successful.
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY BIT(0)
301 * This is a DMA domain allocated through the iommu domain allocation
302 * interface. But one or more devices belonging to this domain have
303 * been chosen to use a private domain. We should avoid using the
304 * map/unmap/iova_to_phys APIs on it.
306 #define DOMAIN_FLAG_LOSE_CHILDREN BIT(1)
308 #define for_each_domain_iommu(idx, domain) \
309 for (idx = 0; idx < g_num_of_iommus; idx++) \
310 if (domain->iommu_refcnt[idx])
312 struct dmar_rmrr_unit {
313 struct list_head list; /* list of rmrr units */
314 struct acpi_dmar_header *hdr; /* ACPI header */
315 u64 base_address; /* reserved base address*/
316 u64 end_address; /* reserved end address */
317 struct dmar_dev_scope *devices; /* target devices */
318 int devices_cnt; /* target device count */
321 struct dmar_atsr_unit {
322 struct list_head list; /* list of ATSR units */
323 struct acpi_dmar_header *hdr; /* ACPI header */
324 struct dmar_dev_scope *devices; /* target devices */
325 int devices_cnt; /* target device count */
326 u8 include_all:1; /* include all ports */
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
332 #define for_each_rmrr_units(rmrr) \
333 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
335 /* number of intel_iommus; upper bound when indexing g_iommus */
336 static int g_num_of_iommus;
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static int domain_detach_iommu(struct dmar_domain *domain,
343 struct intel_iommu *iommu);
344 static bool device_is_rmrr_locked(struct device *dev);
345 static int intel_iommu_attach_device(struct iommu_domain *domain,
348 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
349 int dmar_disabled = 0;
351 int dmar_disabled = 1;
352 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355 int intel_iommu_enabled = 0;
356 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
358 static int dmar_map_gfx = 1;
359 static int dmar_forcedac;
360 static int intel_iommu_strict;
361 static int intel_iommu_superpage = 1;
362 static int iommu_identity_mapping;
364 #define IDENTMAP_ALL 1
365 #define IDENTMAP_GFX 2
366 #define IDENTMAP_AZALIA 4
368 int intel_iommu_gfx_mapped;
369 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
371 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
372 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
373 static DEFINE_SPINLOCK(device_domain_lock);
374 static LIST_HEAD(device_domain_list);
377 * Iterate over elements in device_domain_list and call the specified
378 * callback @fn against each element.
380 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
381 void *data), void *data)
385 struct device_domain_info *info;
387 spin_lock_irqsave(&device_domain_lock, flags);
388 list_for_each_entry(info, &device_domain_list, global) {
389 ret = fn(info, data);
391 spin_unlock_irqrestore(&device_domain_lock, flags);
395 spin_unlock_irqrestore(&device_domain_lock, flags);
400 const struct iommu_ops intel_iommu_ops;
402 static bool translation_pre_enabled(struct intel_iommu *iommu)
404 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
409 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 static void init_translation_status(struct intel_iommu *iommu)
416 gsts = readl(iommu->reg + DMAR_GSTS_REG);
417 if (gsts & DMA_GSTS_TES)
418 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 /* Convert generic struct iommu_domain to private struct dmar_domain */
422 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
424 return container_of(dom, struct dmar_domain, domain);
427 static int __init intel_iommu_setup(char *str)
432 if (!strncmp(str, "on", 2)) {
434 pr_info("IOMMU enabled\n");
435 } else if (!strncmp(str, "off", 3)) {
437 no_platform_optin = 1;
438 pr_info("IOMMU disabled\n");
439 } else if (!strncmp(str, "igfx_off", 8)) {
441 pr_info("Disable GFX device mapping\n");
442 } else if (!strncmp(str, "forcedac", 8)) {
443 pr_info("Forcing DAC for PCI devices\n");
445 } else if (!strncmp(str, "strict", 6)) {
446 pr_info("Disable batched IOTLB flush\n");
447 intel_iommu_strict = 1;
448 } else if (!strncmp(str, "sp_off", 6)) {
449 pr_info("Disable supported super page\n");
450 intel_iommu_superpage = 0;
451 } else if (!strncmp(str, "sm_on", 5)) {
452 pr_info("Intel-IOMMU: scalable mode supported\n");
454 } else if (!strncmp(str, "tboot_noforce", 13)) {
456 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
457 intel_iommu_tboot_noforce = 1;
460 str += strcspn(str, ",");
466 __setup("intel_iommu=", intel_iommu_setup);
468 static struct kmem_cache *iommu_domain_cache;
469 static struct kmem_cache *iommu_devinfo_cache;
471 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
473 struct dmar_domain **domains;
476 domains = iommu->domains[idx];
480 return domains[did & 0xff];
483 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
484 struct dmar_domain *domain)
486 struct dmar_domain **domains;
489 if (!iommu->domains[idx]) {
490 size_t size = 256 * sizeof(struct dmar_domain *);
491 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494 domains = iommu->domains[idx];
495 if (WARN_ON(!domains))
498 domains[did & 0xff] = domain;
501 void *alloc_pgtable_page(int node)
506 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
508 vaddr = page_address(page);
/* Release the single page at kernel virtual address @vaddr via free_page(). */
void free_pgtable_page(void *vaddr)
{
	unsigned long addr = (unsigned long)vaddr;

	free_page(addr);
}
517 static inline void *alloc_domain_mem(void)
519 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 static void free_domain_mem(void *vaddr)
524 kmem_cache_free(iommu_domain_cache, vaddr);
527 static inline void * alloc_devinfo_mem(void)
529 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 static inline void free_devinfo_mem(void *vaddr)
534 kmem_cache_free(iommu_devinfo_cache, vaddr);
537 static inline int domain_type_is_si(struct dmar_domain *domain)
539 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 static inline int domain_pfn_supported(struct dmar_domain *domain,
545 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
547 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
555 sagaw = cap_sagaw(iommu->cap);
556 for (agaw = width_to_agaw(max_gaw);
558 if (test_bit(agaw, &sagaw))
566 * Calculate max SAGAW for each iommu.
568 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
570 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
574 * calculate agaw for each iommu.
575 * "SAGAW" may be different across iommus, so use a default agaw, and
576 * fall back to a smaller supported agaw for iommus that don't support it.
578 int iommu_calculate_agaw(struct intel_iommu *iommu)
580 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 /* This function only returns a single iommu in a domain */
584 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
588 /* si_domain and vm domain should not get here. */
589 if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592 for_each_domain_iommu(iommu_id, domain)
595 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598 return g_iommus[iommu_id];
601 static void domain_update_iommu_coherency(struct dmar_domain *domain)
603 struct dmar_drhd_unit *drhd;
604 struct intel_iommu *iommu;
608 domain->iommu_coherency = 1;
610 for_each_domain_iommu(i, domain) {
612 if (!ecap_coherent(g_iommus[i]->ecap)) {
613 domain->iommu_coherency = 0;
620 /* No hardware attached; use lowest common denominator */
622 for_each_active_iommu(iommu, drhd) {
623 if (!ecap_coherent(iommu->ecap)) {
624 domain->iommu_coherency = 0;
631 static int domain_update_iommu_snooping(struct intel_iommu *skip)
633 struct dmar_drhd_unit *drhd;
634 struct intel_iommu *iommu;
638 for_each_active_iommu(iommu, drhd) {
640 if (!ecap_sc_support(iommu->ecap)) {
651 static int domain_update_iommu_superpage(struct intel_iommu *skip)
653 struct dmar_drhd_unit *drhd;
654 struct intel_iommu *iommu;
657 if (!intel_iommu_superpage) {
661 /* set iommu_superpage to the smallest common denominator */
663 for_each_active_iommu(iommu, drhd) {
665 mask &= cap_super_page_val(iommu->cap);
675 /* Some capabilities may be different across iommus */
676 static void domain_update_iommu_cap(struct dmar_domain *domain)
678 domain_update_iommu_coherency(domain);
679 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
680 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686 struct root_entry *root = &iommu->root_entry[bus];
687 struct context_entry *context;
691 if (sm_supported(iommu)) {
699 context = phys_to_virt(*entry & VTD_PAGE_MASK);
701 unsigned long phy_addr;
705 context = alloc_pgtable_page(iommu->node);
709 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
710 phy_addr = virt_to_phys((void *)context);
711 *entry = phy_addr | 1;
712 __iommu_flush_cache(iommu, entry, sizeof(*entry));
714 return &context[devfn];
717 static int iommu_dummy(struct device *dev)
719 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
723 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
724 * sub-hierarchy of a candidate PCI-PCI bridge
725 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
726 * @bridge: the candidate PCI-PCI bridge
728 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
733 struct pci_dev *pdev, *pbridge;
735 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738 pdev = to_pci_dev(dev);
739 pbridge = to_pci_dev(bridge);
741 if (pbridge->subordinate &&
742 pbridge->subordinate->number <= pdev->bus->number &&
743 pbridge->subordinate->busn_res.end >= pdev->bus->number)
749 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
751 struct dmar_drhd_unit *drhd = NULL;
752 struct intel_iommu *iommu;
754 struct pci_dev *pdev = NULL;
758 if (iommu_dummy(dev))
761 if (dev_is_pci(dev)) {
762 struct pci_dev *pf_pdev;
764 pdev = to_pci_dev(dev);
767 /* VMD child devices currently cannot be handled individually */
768 if (is_vmd(pdev->bus))
772 /* VFs aren't listed in scope tables; we need to look up
773 * the PF instead to find the IOMMU. */
774 pf_pdev = pci_physfn(pdev);
776 segment = pci_domain_nr(pdev->bus);
777 } else if (has_acpi_companion(dev))
778 dev = &ACPI_COMPANION(dev)->dev;
781 for_each_active_iommu(iommu, drhd) {
782 if (pdev && segment != drhd->segment)
785 for_each_active_dev_scope(drhd->devices,
786 drhd->devices_cnt, i, tmp) {
788 /* For a VF use its original BDF# not that of the PF
789 * which we used for the IOMMU lookup. Strictly speaking
790 * we could do this for all PCI devices; we only need to
791 * get the BDF# from the scope table for ACPI matches. */
792 if (pdev && pdev->is_virtfn)
795 *bus = drhd->devices[i].bus;
796 *devfn = drhd->devices[i].devfn;
800 if (is_downstream_to_pci_bridge(dev, tmp))
804 if (pdev && drhd->include_all) {
806 *bus = pdev->bus->number;
807 *devfn = pdev->devfn;
818 static void domain_flush_cache(struct dmar_domain *domain,
819 void *addr, int size)
821 if (!domain->iommu_coherency)
822 clflush_cache_range(addr, size);
825 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
827 struct context_entry *context;
831 spin_lock_irqsave(&iommu->lock, flags);
832 context = iommu_context_addr(iommu, bus, devfn, 0);
834 ret = context_present(context);
835 spin_unlock_irqrestore(&iommu->lock, flags);
839 static void free_context_table(struct intel_iommu *iommu)
843 struct context_entry *context;
845 spin_lock_irqsave(&iommu->lock, flags);
846 if (!iommu->root_entry) {
849 for (i = 0; i < ROOT_ENTRY_NR; i++) {
850 context = iommu_context_addr(iommu, i, 0, 0);
852 free_pgtable_page(context);
854 if (!sm_supported(iommu))
857 context = iommu_context_addr(iommu, i, 0x80, 0);
859 free_pgtable_page(context);
862 free_pgtable_page(iommu->root_entry);
863 iommu->root_entry = NULL;
865 spin_unlock_irqrestore(&iommu->lock, flags);
868 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
869 unsigned long pfn, int *target_level)
871 struct dma_pte *parent, *pte;
872 int level = agaw_to_level(domain->agaw);
875 BUG_ON(!domain->pgd);
877 if (!domain_pfn_supported(domain, pfn))
878 /* Address beyond IOMMU's addressing capabilities. */
881 parent = domain->pgd;
886 offset = pfn_level_offset(pfn, level);
887 pte = &parent[offset];
888 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
890 if (level == *target_level)
893 if (!dma_pte_present(pte)) {
896 tmp_page = alloc_pgtable_page(domain->nid);
901 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
902 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
903 if (cmpxchg64(&pte->val, 0ULL, pteval))
904 /* Someone else set it while we were thinking; use theirs. */
905 free_pgtable_page(tmp_page);
907 domain_flush_cache(domain, pte, sizeof(*pte));
912 parent = phys_to_virt(dma_pte_addr(pte));
917 *target_level = level;
922 /* return address's pte at specific level */
923 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
925 int level, int *large_page)
927 struct dma_pte *parent, *pte;
928 int total = agaw_to_level(domain->agaw);
931 parent = domain->pgd;
932 while (level <= total) {
933 offset = pfn_level_offset(pfn, total);
934 pte = &parent[offset];
938 if (!dma_pte_present(pte)) {
943 if (dma_pte_superpage(pte)) {
948 parent = phys_to_virt(dma_pte_addr(pte));
954 /* clear last level pte; a TLB flush should follow */
955 static void dma_pte_clear_range(struct dmar_domain *domain,
956 unsigned long start_pfn,
957 unsigned long last_pfn)
959 unsigned int large_page;
960 struct dma_pte *first_pte, *pte;
962 BUG_ON(!domain_pfn_supported(domain, start_pfn));
963 BUG_ON(!domain_pfn_supported(domain, last_pfn));
964 BUG_ON(start_pfn > last_pfn);
966 /* we don't need lock here; nobody else touches the iova range */
969 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
971 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
976 start_pfn += lvl_to_nr_pages(large_page);
978 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
980 domain_flush_cache(domain, first_pte,
981 (void *)pte - (void *)first_pte);
983 } while (start_pfn && start_pfn <= last_pfn);
986 static void dma_pte_free_level(struct dmar_domain *domain, int level,
987 int retain_level, struct dma_pte *pte,
988 unsigned long pfn, unsigned long start_pfn,
989 unsigned long last_pfn)
991 pfn = max(start_pfn, pfn);
992 pte = &pte[pfn_level_offset(pfn, level)];
995 unsigned long level_pfn;
996 struct dma_pte *level_pte;
998 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001 level_pfn = pfn & level_mask(level);
1002 level_pte = phys_to_virt(dma_pte_addr(pte));
1005 dma_pte_free_level(domain, level - 1, retain_level,
1006 level_pte, level_pfn, start_pfn,
1011 * Free the page table if we're below the level we want to
1012 * retain and the range covers the entire table.
1014 if (level < retain_level && !(start_pfn > level_pfn ||
1015 last_pfn < level_pfn + level_size(level) - 1)) {
1017 domain_flush_cache(domain, pte, sizeof(*pte));
1018 free_pgtable_page(level_pte);
1021 pfn += level_size(level);
1022 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1026 * clear last level (leaf) ptes and free page table pages below the
1027 * level we wish to keep intact.
1029 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1030 unsigned long start_pfn,
1031 unsigned long last_pfn,
1034 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1035 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1036 BUG_ON(start_pfn > last_pfn);
1038 dma_pte_clear_range(domain, start_pfn, last_pfn);
1040 /* We don't need lock here; nobody else touches the iova range */
1041 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1042 domain->pgd, 0, start_pfn, last_pfn);
1045 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1046 free_pgtable_page(domain->pgd);
1051 /* When a page at a given level is being unlinked from its parent, we don't
1052 need to *modify* it at all. All we need to do is make a list of all the
1053 pages which can be freed just as soon as we've flushed the IOTLB and we
1054 know the hardware page-walk will no longer touch them.
1055 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1057 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1058 int level, struct dma_pte *pte,
1059 struct page *freelist)
1063 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1064 pg->freelist = freelist;
1070 pte = page_address(pg);
1072 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1073 freelist = dma_pte_list_pagetables(domain, level - 1,
1076 } while (!first_pte_in_page(pte));
1081 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1082 struct dma_pte *pte, unsigned long pfn,
1083 unsigned long start_pfn,
1084 unsigned long last_pfn,
1085 struct page *freelist)
1087 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1089 pfn = max(start_pfn, pfn);
1090 pte = &pte[pfn_level_offset(pfn, level)];
1093 unsigned long level_pfn;
1095 if (!dma_pte_present(pte))
1098 level_pfn = pfn & level_mask(level);
1100 /* If range covers entire pagetable, free it */
1101 if (start_pfn <= level_pfn &&
1102 last_pfn >= level_pfn + level_size(level) - 1) {
1103 /* These subordinate page tables are going away entirely. Don't
1104 bother to clear them; we're just going to *free* them. */
1105 if (level > 1 && !dma_pte_superpage(pte))
1106 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1112 } else if (level > 1) {
1113 /* Recurse down into a level that isn't *entirely* obsolete */
1114 freelist = dma_pte_clear_level(domain, level - 1,
1115 phys_to_virt(dma_pte_addr(pte)),
1116 level_pfn, start_pfn, last_pfn,
1120 pfn += level_size(level);
1121 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124 domain_flush_cache(domain, first_pte,
1125 (void *)++last_pte - (void *)first_pte);
1130 /* We can't just free the pages because the IOMMU may still be walking
1131 the page tables, and may have cached the intermediate levels. The
1132 pages can only be freed after the IOTLB flush has been done. */
1133 static struct page *domain_unmap(struct dmar_domain *domain,
1134 unsigned long start_pfn,
1135 unsigned long last_pfn)
1137 struct page *freelist;
1139 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1140 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1141 BUG_ON(start_pfn > last_pfn);
1143 /* we don't need lock here; nobody else touches the iova range */
1144 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1145 domain->pgd, 0, start_pfn, last_pfn, NULL);
1148 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1149 struct page *pgd_page = virt_to_page(domain->pgd);
1150 pgd_page->freelist = freelist;
1151 freelist = pgd_page;
1159 static void dma_free_pagelist(struct page *freelist)
1163 while ((pg = freelist)) {
1164 freelist = pg->freelist;
1165 free_pgtable_page(page_address(pg));
/*
 * @data is a struct page pointer passed as an unsigned long: the head of
 * a page freelist, which is handed to dma_free_pagelist().
 */
static void iova_entry_free(unsigned long data)
{
	dma_free_pagelist((struct page *)data);
}
1176 /* iommu handling */
1177 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1179 struct root_entry *root;
1180 unsigned long flags;
1182 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1184 pr_err("Allocating root entry for %s failed\n",
1189 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1191 spin_lock_irqsave(&iommu->lock, flags);
1192 iommu->root_entry = root;
1193 spin_unlock_irqrestore(&iommu->lock, flags);
1198 static void iommu_set_root_entry(struct intel_iommu *iommu)
1204 addr = virt_to_phys(iommu->root_entry);
1205 if (sm_supported(iommu))
1206 addr |= DMA_RTADDR_SMT;
1208 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1209 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1211 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1213 /* Make sure hardware completes it */
1214 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1215 readl, (sts & DMA_GSTS_RTPS), sts);
1217 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1225 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1229 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1231 /* Make sure hardware completes it */
1232 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1233 readl, (!(val & DMA_GSTS_WBFS)), val);
1235 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 /* Issue a context-cache invalidation command and wait for it to complete; nothing is returned */
1239 static void __iommu_flush_context(struct intel_iommu *iommu,
1240 u16 did, u16 source_id, u8 function_mask,
1247 case DMA_CCMD_GLOBAL_INVL:
1248 val = DMA_CCMD_GLOBAL_INVL;
1250 case DMA_CCMD_DOMAIN_INVL:
1251 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1253 case DMA_CCMD_DEVICE_INVL:
1254 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1255 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1260 val |= DMA_CCMD_ICC;
1262 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1263 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1265 /* Make sure hardware completes it */
1266 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1267 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1269 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 /* Issue an IOTLB flush of the given type and wait for it to complete; nothing is returned */
1273 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1274 u64 addr, unsigned int size_order, u64 type)
1276 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1277 u64 val = 0, val_iva = 0;
1281 case DMA_TLB_GLOBAL_FLUSH:
1282 /* global flush doesn't need to set IVA_REG */
1283 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1285 case DMA_TLB_DSI_FLUSH:
1286 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 case DMA_TLB_PSI_FLUSH:
1289 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1290 /* IH bit is passed in as part of address */
1291 val_iva = size_order | addr;
1296 /* Note: set drain read/write */
1299 * This is probably to be super secure.. Looks like we can
1300 * ignore it without any impact.
1302 if (cap_read_drain(iommu->cap))
1303 val |= DMA_TLB_READ_DRAIN;
1305 if (cap_write_drain(iommu->cap))
1306 val |= DMA_TLB_WRITE_DRAIN;
1308 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1309 /* Note: Only uses first TLB reg currently */
1311 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1312 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1314 /* Make sure hardware completes it */
1315 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1316 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1318 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1320 /* check IOTLB invalidation granularity */
1321 if (DMA_TLB_IAIG(val) == 0)
1322 pr_err("Flush IOTLB failed\n");
1323 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1324 pr_debug("TLB flush request %Lx, actual %Lx\n",
1325 (unsigned long long)DMA_TLB_IIRG(type),
1326 (unsigned long long)DMA_TLB_IAIG(val));
1329 static struct device_domain_info *
1330 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1333 struct device_domain_info *info;
1335 assert_spin_locked(&device_domain_lock);
1340 list_for_each_entry(info, &domain->devices, link)
1341 if (info->iommu == iommu && info->bus == bus &&
1342 info->devfn == devfn) {
1343 if (info->ats_supported && info->dev)
/*
 * Recompute domain->has_iotlb_device: it becomes true when at least one
 * PCI device on the domain's device list has pdev->ats_enabled set.
 * Entries without a struct device, or with a non-PCI device, are not
 * considered.
 *
 * The caller must hold device_domain_lock (asserted below).
 */
1351 static void domain_update_iotlb(struct dmar_domain *domain)
1353 struct device_domain_info *info;
1354 bool has_iotlb_device = false;
1356 assert_spin_locked(&device_domain_lock);
1358 list_for_each_entry(info, &domain->devices, link) {
1359 struct pci_dev *pdev;
1361 if (!info->dev || !dev_is_pci(info->dev))
1364 pdev = to_pci_dev(info->dev);
1365 if (pdev->ats_enabled) {
1366 has_iotlb_device = true;
1371 domain->has_iotlb_device = has_iotlb_device;
/*
 * Turn on the device-side translation features for the PCI device
 * described by @info:
 *  - record the physical function's source-id in info->pfsid (used when
 *    the IOMMU reports device-IOTLB throttling via ecap_dit());
 *  - with CONFIG_INTEL_IOMMU_SVM, enable PASID and then PRI, setting
 *    info->pasid_enabled / info->pri_enabled on success;
 *  - enable ATS for trusted devices, then refresh the domain's
 *    has_iotlb_device flag and cache the ATS queue depth.
 *
 * A NULL @info or a non-PCI device is filtered out up front.
 * The caller must hold device_domain_lock (asserted below).
 */
1374 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1376 struct pci_dev *pdev;
1378 assert_spin_locked(&device_domain_lock);
1380 if (!info || !dev_is_pci(info->dev))
1383 pdev = to_pci_dev(info->dev);
1384 /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1385 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1386 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1387 * reserved, which should be set to 0.
1389 if (!ecap_dit(info->iommu->ecap))
1392 struct pci_dev *pf_pdev;
1394 /* pdev will be returned if device is not a vf */
1395 pf_pdev = pci_physfn(pdev);
1396 info->pfsid = pci_dev_id(pf_pdev);
1399 #ifdef CONFIG_INTEL_IOMMU_SVM
1400 /* The PCIe spec, in its wisdom, declares that the behaviour of
1401 the device if you enable PASID support after ATS support is
1402 undefined. So always enable PASID support on devices which
1403 have it, even if we can't yet know if we're ever going to
1405 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1406 info->pasid_enabled = 1;
1408 if (info->pri_supported &&
1409 (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1) &&
1410 !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1411 info->pri_enabled = 1;
1413 if (!pdev->untrusted && info->ats_supported &&
1414 pci_ats_page_aligned(pdev) &&
1415 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1416 info->ats_enabled = 1;
1417 domain_update_iotlb(info->domain);
1418 info->ats_qdep = pci_ats_queue_depth(pdev);
/*
 * Counterpart of iommu_enable_dev_iotlb(): for a PCI device, disable
 * ATS (refreshing the domain's has_iotlb_device flag afterwards) and,
 * with CONFIG_INTEL_IOMMU_SVM, PRI and PASID. Each feature is only
 * touched when the matching info->*_enabled bit is set, and the bit is
 * cleared once the feature has been disabled.
 *
 * The caller must hold device_domain_lock (asserted below).
 */
1422 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1424 struct pci_dev *pdev;
1426 assert_spin_locked(&device_domain_lock);
1428 if (!dev_is_pci(info->dev))
1431 pdev = to_pci_dev(info->dev);
1433 if (info->ats_enabled) {
1434 pci_disable_ats(pdev);
1435 info->ats_enabled = 0;
1436 domain_update_iotlb(info->domain);
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439 if (info->pri_enabled) {
1440 pci_disable_pri(pdev);
1441 info->pri_enabled = 0;
1443 if (info->pasid_enabled) {
1444 pci_disable_pasid(pdev);
1445 info->pasid_enabled = 0;
/*
 * Invalidate the device IOTLB of the ATS-enabled devices attached to
 * @domain via qi_flush_dev_iotlb(). domain->has_iotlb_device is checked
 * first; the device list itself is walked with device_domain_lock held.
 * The source-id passed to the invalidation is built from the entry's
 * bus and devfn, and the cached ATS queue depth is read from the entry.
 */
1450 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1451 u64 addr, unsigned mask)
1454 unsigned long flags;
1455 struct device_domain_info *info;
1457 if (!domain->has_iotlb_device)
1460 spin_lock_irqsave(&device_domain_lock, flags);
1461 list_for_each_entry(info, &domain->devices, link) {
1462 if (!info->ats_enabled)
1465 sid = info->bus << 8 | info->devfn;
1466 qdep = info->ats_qdep;
1467 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Invalidate the IOTLB entries of @domain on @iommu that cover @pages
 * VT-d pages starting at @pfn.
 *
 * The invalidation order (mask) is log2 of @pages rounded up to a power
 * of two. The hardware callback is invoked either with address 0 and
 * mask 0 (when page-selective invalidation is unavailable or the mask
 * exceeds what the hardware reports) or with the address, the ih hint
 * OR-ed in, and the computed mask.
 *
 * The device IOTLB is flushed as well unless the hardware is in caching
 * mode and the flush is for a newly created mapping (map != 0).
 */
1473 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1474 struct dmar_domain *domain,
1475 unsigned long pfn, unsigned int pages,
1478 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1479 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1480 u16 did = domain->iommu_did[iommu->seq_id];
1487 * Fallback to domain selective flush if no PSI support or the size is
1489 * PSI requires page size to be 2 ^ x, and the base address is naturally
1490 * aligned to the size
1492 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1493 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1500 * In caching mode, changes of pages from non-present to present require
1501 * flush. However, device IOTLB doesn't need to be flushed in this case.
1503 if (!cap_caching_mode(iommu->cap) || !map)
1504 iommu_flush_dev_iotlb(domain, addr, mask);
1507 /* Notification for newly created mappings */
/*
 * Called once per IOMMU after a range of @pages pages at @pfn has been
 * mapped in @domain. On caching-mode hardware the range is invalidated
 * with iommu_flush_iotlb_psi() (ih = 0, map = 1); the write buffer is
 * flushed through iommu_flush_write_buffer().
 */
1508 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1509 struct dmar_domain *domain,
1510 unsigned long pfn, unsigned int pages)
1512 /* It's a non-present to present mapping. Only flush if caching mode */
1513 if (cap_caching_mode(iommu->cap))
1514 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1516 iommu_flush_write_buffer(iommu);
/*
 * Flush callback handed to init_iova_flush_queue() by domain_init().
 * Recovers the dmar_domain that embeds @iovad and, for every IOMMU the
 * domain is attached to, issues a domain-selective IOTLB flush. When the
 * IOMMU is not in caching mode the device IOTLBs are flushed too, using
 * address 0 and the maximum mask (MAX_AGAW_PFN_WIDTH).
 */
1519 static void iommu_flush_iova(struct iova_domain *iovad)
1521 struct dmar_domain *domain;
1524 domain = container_of(iovad, struct dmar_domain, iovad);
1526 for_each_domain_iommu(idx, domain) {
1527 struct intel_iommu *iommu = g_iommus[idx];
1528 u16 did = domain->iommu_did[iommu->seq_id];
1530 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1532 if (!cap_caching_mode(iommu->cap))
1533 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1534 0, MAX_AGAW_PFN_WIDTH);
/*
 * Clear the DMA_PMEN_EPM bit in the PMEN register and wait until the
 * DMA_PMEN_PRS status bit reads back as clear. The capability check on
 * cap_plmr()/cap_phmr() guards the register access, which is done with
 * iommu->register_lock held and interrupts disabled.
 */
1538 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 unsigned long flags;
1543 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1547 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1548 pmen &= ~DMA_PMEN_EPM;
1549 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1551 /* wait for the protected region status bit to clear */
1552 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1553 readl, !(pmen & DMA_PMEN_PRS), pmen);
1555 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Set DMA_GCMD_TE in the cached global command value, write it to the
 * GCMD register and wait until the GSTS register reports DMA_GSTS_TES.
 * Runs with iommu->register_lock held and interrupts disabled.
 */
1558 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 unsigned long flags;
1563 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1564 iommu->gcmd |= DMA_GCMD_TE;
1565 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1567 /* Make sure hardware complete it */
1568 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1569 readl, (sts & DMA_GSTS_TES), sts);
1571 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Clear DMA_GCMD_TE in the cached global command value, write it to the
 * GCMD register and wait until DMA_GSTS_TES is no longer set in GSTS.
 * Runs with iommu->register_lock held and interrupts disabled.
 */
1574 static void iommu_disable_translation(struct intel_iommu *iommu)
1579 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1580 iommu->gcmd &= ~DMA_GCMD_TE;
1581 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1583 /* Make sure hardware complete it */
1584 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1585 readl, (!(sts & DMA_GSTS_TES)), sts);
1587 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Set up the per-IOMMU domain-id bookkeeping:
 *  - iommu->domain_ids: a zeroed bitmap with one bit per domain id the
 *    hardware reports via cap_ndoms();
 *  - iommu->domains: a two-level table of dmar_domain pointers, split in
 *    chunks of 256 entries; only chunk 0 is allocated here.
 *
 * If either level of the table cannot be allocated, the bitmap and the
 * top-level array are freed and both pointers are reset to NULL.
 *
 * Domain id 0 is marked as used, and so is FLPT_DEFAULT_DID when the
 * IOMMU supports scalable mode. iommu->lock is initialised here as well.
 */
1590 static int iommu_init_domains(struct intel_iommu *iommu)
1592 u32 ndomains, nlongs;
1595 ndomains = cap_ndoms(iommu->cap);
1596 pr_debug("%s: Number of Domains supported <%d>\n",
1597 iommu->name, ndomains);
1598 nlongs = BITS_TO_LONGS(ndomains);
1600 spin_lock_init(&iommu->lock);
1602 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1603 if (!iommu->domain_ids) {
1604 pr_err("%s: Allocating domain id array failed\n",
1609 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1610 iommu->domains = kzalloc(size, GFP_KERNEL);
1612 if (iommu->domains) {
1613 size = 256 * sizeof(struct dmar_domain *);
1614 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617 if (!iommu->domains || !iommu->domains[0]) {
1618 pr_err("%s: Allocating domain array failed\n",
1620 kfree(iommu->domain_ids);
1621 kfree(iommu->domains);
1622 iommu->domain_ids = NULL;
1623 iommu->domains = NULL;
1628 * If Caching mode is set, then invalid translations are tagged
1629 * with domain-id 0, hence we need to pre-allocate it. We also
1630 * use domain-id 0 as a marker for non-allocated domain-id, so
1631 * make sure it is not used for a real domain.
1633 set_bit(0, iommu->domain_ids);
1636 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1637 * entry for first-level or pass-through translation modes should
1638 * be programmed with a domain id different from those used for
1639 * second-level or nested translation. We reserve a domain id for
1642 if (sm_supported(iommu))
1643 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
/*
 * Quiesce @iommu: with device_domain_lock held, walk the global
 * device_domain_list and hand entries that belong to this IOMMU (and
 * pass the dev/domain check) to __dmar_remove_one_dev_info(). The _safe
 * list iterator is used because entries are removed while walking.
 * Afterwards translation is switched off if DMA_GCMD_TE is set in the
 * cached command value.
 *
 * The domain bookkeeping pointers are checked first, before any of the
 * above is attempted.
 */
1648 static void disable_dmar_iommu(struct intel_iommu *iommu)
1650 struct device_domain_info *info, *tmp;
1651 unsigned long flags;
1653 if (!iommu->domains || !iommu->domain_ids)
1656 spin_lock_irqsave(&device_domain_lock, flags);
1657 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1658 if (info->iommu != iommu)
1661 if (!info->dev || !info->domain)
1664 __dmar_remove_one_dev_info(info);
1666 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 if (iommu->gcmd & DMA_GCMD_TE)
1669 iommu_disable_translation(iommu);
/*
 * Release what iommu_init_domains() and later setup allocated for
 * @iommu: every 256-entry chunk of the domains table, the table itself
 * and the domain_ids bitmap (pointers are reset to NULL). The IOMMU's
 * slot in g_iommus is cleared and the context table is freed. With
 * CONFIG_INTEL_IOMMU_SVM, the page request queue is torn down when the
 * IOMMU supports PASID and page requests.
 */
1672 static void free_dmar_iommu(struct intel_iommu *iommu)
1674 if ((iommu->domains) && (iommu->domain_ids)) {
1675 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678 for (i = 0; i < elems; i++)
1679 kfree(iommu->domains[i]);
1680 kfree(iommu->domains);
1681 kfree(iommu->domain_ids);
1682 iommu->domains = NULL;
1683 iommu->domain_ids = NULL;
1686 g_iommus[iommu->seq_id] = NULL;
1688 /* free context mapping */
1689 free_context_table(iommu);
1691 #ifdef CONFIG_INTEL_IOMMU_SVM
1692 if (pasid_supported(iommu)) {
1693 if (ecap_prs(iommu->ecap))
1694 intel_svm_finish_prq(iommu);
/*
 * Allocate a dmar_domain with alloc_domain_mem() and put it in a known
 * initial state: all fields zeroed, no NUMA node (NUMA_NO_NODE), the
 * caller-supplied @flags, no IOTLB-capable device and an empty device
 * list.
 */
1699 static struct dmar_domain *alloc_domain(int flags)
1701 struct dmar_domain *domain;
1703 domain = alloc_domain_mem();
1707 memset(domain, 0, sizeof(*domain));
1708 domain->nid = NUMA_NO_NODE;
1709 domain->flags = flags;
1710 domain->has_iotlb_device = false;
1711 INIT_LIST_HEAD(&domain->devices);
/*
 * Account one more user of @iommu in @domain. Both the per-IOMMU
 * reference count and the domain-wide iommu_count are incremented.
 *
 * When this is the first reference for this IOMMU, a free domain id is
 * taken from iommu->domain_ids, the domain is published in the IOMMU's
 * domain table under that id, the id and the IOMMU's NUMA node are
 * recorded in the domain, and the domain's capabilities are recomputed.
 * If no id is free, an error is logged and both counters are rolled back.
 *
 * Both device_domain_lock and iommu->lock must be held (asserted below).
 */
1716 /* Must be called with iommu->lock */
1717 static int domain_attach_iommu(struct dmar_domain *domain,
1718 struct intel_iommu *iommu)
1720 unsigned long ndomains;
1723 assert_spin_locked(&device_domain_lock);
1724 assert_spin_locked(&iommu->lock);
1726 domain->iommu_refcnt[iommu->seq_id] += 1;
1727 domain->iommu_count += 1;
1728 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1729 ndomains = cap_ndoms(iommu->cap);
1730 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1732 if (num >= ndomains) {
1733 pr_err("%s: No free domain ids\n", iommu->name);
1734 domain->iommu_refcnt[iommu->seq_id] -= 1;
1735 domain->iommu_count -= 1;
1739 set_bit(num, iommu->domain_ids);
1740 set_iommu_domain(iommu, num, domain);
1742 domain->iommu_did[iommu->seq_id] = num;
1743 domain->nid = iommu->node;
1745 domain_update_iommu_cap(domain);
/*
 * Drop one reference of @domain on @iommu. Both the per-IOMMU reference
 * count and the domain-wide iommu_count are decremented; the new value
 * of iommu_count is kept in 'count'.
 *
 * When the per-IOMMU count reaches zero, the domain id is released in
 * iommu->domain_ids, the IOMMU's domain table slot is cleared, the
 * domain's capabilities are recomputed and the recorded id is reset to 0.
 *
 * Both device_domain_lock and iommu->lock must be held (asserted below).
 */
1751 static int domain_detach_iommu(struct dmar_domain *domain,
1752 struct intel_iommu *iommu)
1756 assert_spin_locked(&device_domain_lock);
1757 assert_spin_locked(&iommu->lock);
1759 domain->iommu_refcnt[iommu->seq_id] -= 1;
1760 count = --domain->iommu_count;
1761 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1762 num = domain->iommu_did[iommu->seq_id];
1763 clear_bit(num, iommu->domain_ids);
1764 set_iommu_domain(iommu, num, NULL);
1766 domain_update_iommu_cap(domain);
1767 domain->iommu_did[iommu->seq_id] = 0;
/*
 * Global set of IOVA ranges that must never be handed out for DMA. It is
 * filled by dmar_init_reserved_ranges() and copied into each domain by
 * domain_reserve_special_ranges(). The lock class key gives the list's
 * rbtree lock its own lockdep class.
 */
1773 static struct iova_domain reserved_iova_list;
1774 static struct lock_class_key reserved_rbtree_key;
/*
 * Initialise reserved_iova_list and populate it with:
 *  - the IOAPIC MMIO window (IOAPIC_RANGE_START..IOAPIC_RANGE_END);
 *  - every memory resource (IORESOURCE_MEM) of every PCI device.
 * A failed reservation is reported with pr_err()/pci_err().
 */
1776 static int dmar_init_reserved_ranges(void)
1778 struct pci_dev *pdev = NULL;
1782 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1784 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1785 &reserved_rbtree_key);
1787 /* IOAPIC ranges shouldn't be accessed by DMA */
1788 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1789 IOVA_PFN(IOAPIC_RANGE_END));
1791 pr_err("Reserve IOAPIC range failed\n");
1795 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1796 for_each_pci_dev(pdev) {
1799 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1800 r = &pdev->resource[i];
1801 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1803 iova = reserve_iova(&reserved_iova_list,
1807 pci_err(pdev, "Reserve iova for %pR failed\n", r);
/*
 * Copy the globally reserved IOVA ranges (reserved_iova_list) into
 * @domain's own IOVA allocator.
 */
1815 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1817 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
/*
 * Derive an adjusted address width from guest address width @gaw.
 * 'r' is the remainder of (gaw - 12) divided by 9.
 * NOTE(review): 12 and 9 presumably stand for the 4KiB page offset bits
 * and the 9 address bits resolved per page-table level - confirm against
 * the VT-d page-table layout before changing them.
 */
1820 static inline int guestwidth_to_adjustwidth(int gaw)
1823 int r = (gaw - 12) % 9;
/*
 * Initialise a freshly allocated @domain for use behind @iommu:
 *  - set up the IOVA allocator and its flush queue (iommu_flush_iova is
 *    the flush callback) and copy in the reserved ranges;
 *  - clamp the requested guest width to cap_mgaw() and derive the AGAW;
 *    if the hardware does not list that AGAW in cap_sagaw(), the next
 *    supported larger one is looked up;
 *  - record coherency, snooping and superpage capabilities as well as
 *    the IOMMU's NUMA node;
 *  - allocate the top-level page directory and flush it from the CPU
 *    cache through __iommu_flush_cache().
 */
1834 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837 int adjust_width, agaw;
1838 unsigned long sagaw;
1841 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1843 err = init_iova_flush_queue(&domain->iovad,
1844 iommu_flush_iova, iova_entry_free);
1848 domain_reserve_special_ranges(domain);
1850 /* calculate AGAW */
1851 if (guest_width > cap_mgaw(iommu->cap))
1852 guest_width = cap_mgaw(iommu->cap);
1853 domain->gaw = guest_width;
1854 adjust_width = guestwidth_to_adjustwidth(guest_width);
1855 agaw = width_to_agaw(adjust_width);
1856 sagaw = cap_sagaw(iommu->cap);
1857 if (!test_bit(agaw, &sagaw)) {
1858 /* hardware doesn't support it, choose a bigger one */
1859 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1860 agaw = find_next_bit(&sagaw, 5, agaw);
1864 domain->agaw = agaw;
1866 if (ecap_coherent(iommu->ecap))
1867 domain->iommu_coherency = 1;
1869 domain->iommu_coherency = 0;
1871 if (ecap_sc_support(iommu->ecap))
1872 domain->iommu_snooping = 1;
1874 domain->iommu_snooping = 0;
1876 if (intel_iommu_superpage)
1877 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1879 domain->iommu_superpage = 0;
1881 domain->nid = iommu->node;
1883 /* always allocate the top pgd */
1884 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
/*
 * Tear down @domain: detach all devices, release the IOVA allocator,
 * unmap the whole address range (0..DOMAIN_MAX_PFN) and free the
 * page-table pages collected by domain_unmap(), then free the domain
 * structure itself.
 */
1891 static void domain_exit(struct dmar_domain *domain)
1894 /* Remove associated devices and clear attached or cached domains */
1895 domain_remove_dev_info(domain);
1898 put_iova_domain(&domain->iovad);
1901 struct page *freelist;
1903 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1904 dma_free_pagelist(freelist);
1907 free_domain_mem(domain);
1911 * Get the PASID directory size for scalable mode context entry.
1912 * Value of X in the PDTS field of a scalable mode context entry
1913 * indicates PASID directory with 2^(X + 7) entries.
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1919 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
/*
 * Store the low 20 bits of @pasid in the high quadword of @context, and
 * additionally set bit 20 of that quadword.
 * NOTE(review): the meaning of bit 20 is not documented here - confirm
 * against the scalable-mode context entry layout in the VT-d spec.
 */
1928 * Set the RID_PASID field of a scalable mode context entry. The
1929 * IOMMU hardware will use the PASID value set in this field for
1930 * DMA translations of DMA requests without PASID.
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1935 context->hi |= pasid & ((1 << 20) - 1);
1936 context->hi |= (1 << 20);
/* Sets bit 2 of the low quadword of @context (the DTE field). */
1940 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1943 static inline void context_set_sm_dte(struct context_entry *context)
1945 context->lo |= (1 << 2);
/* Sets bit 4 of the low quadword of @context (the PRE field). */
1949 * Set the PRE(Page Request Enable) field of a scalable mode context
1952 static inline void context_set_sm_pre(struct context_entry *context)
1954 context->lo |= (1 << 4);
1957 /* Convert value to context PASID directory size field coding. */
/* Only the low three bits of @pds are kept; they are placed at bits 11:9. */
1958 #define context_pdts(pds) (((pds) & 0x7) << 9)
/*
 * Program the context entry of the device at @bus/@devfn behind @iommu
 * so that its DMA is translated through @domain.
 *
 * Runs with device_domain_lock (interrupts disabled) and iommu->lock
 * held. Outline:
 *  - pass-through translation is selected when hardware pass-through is
 *    in use and @domain is the static identity domain;
 *  - the context entry is looked up (allocating the table if needed);
 *    the context_present() check guards the rest of the setup;
 *  - for an entry copied from a previous kernel (kdump), the caches
 *    tagged with the old domain id are flushed first;
 *  - the entry is cleared and rebuilt: in scalable mode it points at the
 *    PASID directory of @table and carries the RID2PASID value plus the
 *    Device-TLB/Page-request enable bits; otherwise it carries the
 *    domain id, the page-table root (skipping top levels the IOMMU
 *    cannot walk) and the address width, or the pass-through settings;
 *  - the entry is marked present and flushed from the CPU cache;
 *  - on caching-mode hardware the context cache (domain id 0) and the
 *    IOTLB are invalidated; iommu_flush_write_buffer() covers the write
 *    buffer;
 *  - finally the device-side ATS/PASID/PRI features are enabled through
 *    iommu_enable_dev_iotlb().
 */
1960 static int domain_context_mapping_one(struct dmar_domain *domain,
1961 struct intel_iommu *iommu,
1962 struct pasid_table *table,
1965 u16 did = domain->iommu_did[iommu->seq_id];
1966 int translation = CONTEXT_TT_MULTI_LEVEL;
1967 struct device_domain_info *info = NULL;
1968 struct context_entry *context;
1969 unsigned long flags;
1974 if (hw_pass_through && domain_type_is_si(domain))
1975 translation = CONTEXT_TT_PASS_THROUGH;
1977 pr_debug("Set context mapping for %02x:%02x.%d\n",
1978 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1980 BUG_ON(!domain->pgd);
1982 spin_lock_irqsave(&device_domain_lock, flags);
1983 spin_lock(&iommu->lock);
1986 context = iommu_context_addr(iommu, bus, devfn, 1);
1991 if (context_present(context))
1995 * For kdump cases, old valid entries may be cached due to the
1996 * in-flight DMA and copied pgtable, but there is no unmapping
1997 * behaviour for them, thus we need an explicit cache flush for
1998 * the newly-mapped device. For kdump, at this point, the device
1999 * is supposed to finish reset at its driver probe stage, so no
2000 * in-flight DMA will exist, and we don't need to worry anymore
2003 if (context_copied(context)) {
2004 u16 did_old = context_domain_id(context);
2006 if (did_old < cap_ndoms(iommu->cap)) {
2007 iommu->flush.flush_context(iommu, did_old,
2008 (((u16)bus) << 8) | devfn,
2009 DMA_CCMD_MASK_NOBIT,
2010 DMA_CCMD_DEVICE_INVL);
2011 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2016 context_clear_entry(context);
2018 if (sm_supported(iommu)) {
2023 /* Setup the PASID DIR pointer: */
2024 pds = context_get_sm_pds(table);
2025 context->lo = (u64)virt_to_phys(table->table) |
2028 /* Setup the RID_PASID field: */
2029 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032 * Setup the Device-TLB enable bit and Page request
2035 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2036 if (info && info->ats_supported)
2037 context_set_sm_dte(context);
2038 if (info && info->pri_supported)
2039 context_set_sm_pre(context);
2041 struct dma_pte *pgd = domain->pgd;
2044 context_set_domain_id(context, did);
2046 if (translation != CONTEXT_TT_PASS_THROUGH) {
2048 * Skip top levels of page tables for iommu which has
2049 * less agaw than default. Unnecessary for PT mode.
2051 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2053 pgd = phys_to_virt(dma_pte_addr(pgd));
2054 if (!dma_pte_present(pgd))
2058 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059 if (info && info->ats_supported)
2060 translation = CONTEXT_TT_DEV_IOTLB;
2062 translation = CONTEXT_TT_MULTI_LEVEL;
2064 context_set_address_root(context, virt_to_phys(pgd));
2065 context_set_address_width(context, agaw);
2068 * In pass through mode, AW must be programmed to
2069 * indicate the largest AGAW value supported by
2070 * hardware. And ASR is ignored by hardware.
2072 context_set_address_width(context, iommu->msagaw);
2075 context_set_translation_type(context, translation);
2078 context_set_fault_enable(context);
2079 context_set_present(context);
2080 domain_flush_cache(domain, context, sizeof(*context));
2083 * It's a non-present to present mapping. If hardware doesn't cache
2084 * non-present entry we only need to flush the write-buffer. If the
2085 * _does_ cache non-present entries, then it does so in the special
2086 * domain #0, which we have to flush:
2088 if (cap_caching_mode(iommu->cap)) {
2089 iommu->flush.flush_context(iommu, 0,
2090 (((u16)bus) << 8) | devfn,
2091 DMA_CCMD_MASK_NOBIT,
2092 DMA_CCMD_DEVICE_INVL);
2093 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2095 iommu_flush_write_buffer(iommu);
2097 iommu_enable_dev_iotlb(info);
2102 spin_unlock(&iommu->lock);
2103 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Set up the context entry for @dev in @domain: resolve the IOMMU and
 * the bus/devfn the device is seen as, fetch the device's PASID table
 * and delegate to domain_context_mapping_one(), whose result is
 * returned.
 */
2109 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2111 struct pasid_table *table;
2112 struct intel_iommu *iommu;
2115 iommu = device_to_iommu(dev, &bus, &devfn);
2119 table = intel_pasid_get_table(dev);
2120 return domain_context_mapping_one(domain, iommu, table, bus, devfn);
/*
 * pci_for_each_dma_alias() callback used by domain_context_mapped().
 * @opaque is the IOMMU; returns non-zero when the context entry for
 * @alias (bus in the high byte, devfn in the low byte) is NOT mapped.
 */
2123 static int domain_context_mapped_cb(struct pci_dev *pdev,
2124 u16 alias, void *opaque)
2126 struct intel_iommu *iommu = opaque;
2128 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * Report whether @dev has a context mapping. A non-PCI device is checked
 * directly at its own bus/devfn. For a PCI device every DMA alias is
 * checked through domain_context_mapped_cb(), and the negated result of
 * pci_for_each_dma_alias() is returned, so the answer is non-zero only
 * if no alias was reported as unmapped.
 */
2131 static int domain_context_mapped(struct device *dev)
2133 struct intel_iommu *iommu;
2136 iommu = device_to_iommu(dev, &bus, &devfn);
2140 if (!dev_is_pci(dev))
2141 return device_context_mapped(iommu, bus, devfn);
2143 return !pci_for_each_dma_alias(to_pci_dev(dev),
2144 domain_context_mapped_cb, iommu);
2147 /* Returns a number of VTD pages, but aligned to MM page size */
/*
 * Only the offset of @host_addr within its MM page matters: offset plus
 * size is rounded up to a whole number of MM pages (PAGE_ALIGN) and the
 * result is expressed in VT-d pages (>> VTD_PAGE_SHIFT).
 */
2148 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2151 host_addr &= ~PAGE_MASK;
2152 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2155 /* Return largest possible superpage level for a given mapping */
/*
 * Starting at level 1, the loop climbs one level per iteration for as
 * long as the domain still supports a larger page size ('support',
 * seeded from domain->iommu_superpage) and the merged IOVA/physical pfn
 * has no bits set below the next stride boundary. @pages and the merged
 * pfn are shifted right by VTD_STRIDE_SHIFT on each step.
 */
2156 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2157 unsigned long iov_pfn,
2158 unsigned long phy_pfn,
2159 unsigned long pages)
2161 int support, level = 1;
2162 unsigned long pfnmerge;
2164 support = domain->iommu_superpage;
2166 /* To use a large page, the virtual *and* physical addresses
2167 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2168 of them will mean we have to use smaller pages. So just
2169 merge them and check both at once. */
2170 pfnmerge = iov_pfn | phy_pfn;
2172 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2173 pages >>= VTD_STRIDE_SHIFT;
2176 pfnmerge >>= VTD_STRIDE_SHIFT;
/*
 * Install page-table entries in @domain that map @nr_pages VT-d pages
 * starting at @iov_pfn. The physical side comes either from the
 * scatterlist @sg (each segment also gets its dma_address/dma_length
 * filled in) or from the contiguous range starting at @phys_pfn.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE; only the READ,
 * WRITE and SNP bits are kept in the PTE value.
 *
 * Whenever a new PTE pointer is needed, the largest usable superpage
 * level is computed and the PTE is looked up with pfn_to_dma_pte(); for
 * a superpage, smaller page tables already covering the range are freed
 * first. Each PTE is written with cmpxchg64_local() against 0; a
 * non-zero previous value is reported with pr_crit().
 *
 * The CPU cache is flushed for the PTEs just written whenever the walk
 * is done, crosses into a new page-table page, or has to drop back from
 * superpages to smaller pages.
 */
2183 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2184 struct scatterlist *sg, unsigned long phys_pfn,
2185 unsigned long nr_pages, int prot)
2187 struct dma_pte *first_pte = NULL, *pte = NULL;
2188 phys_addr_t uninitialized_var(pteval);
2189 unsigned long sg_res = 0;
2190 unsigned int largepage_lvl = 0;
2191 unsigned long lvl_pages = 0;
2193 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2195 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2198 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2202 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2205 while (nr_pages > 0) {
2209 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2211 sg_res = aligned_nrpages(sg->offset, sg->length);
2212 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2213 sg->dma_length = sg->length;
2214 pteval = (sg_phys(sg) - pgoff) | prot;
2215 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2219 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2221 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2224 /* It is large page*/
2225 if (largepage_lvl > 1) {
2226 unsigned long nr_superpages, end_pfn;
2228 pteval |= DMA_PTE_LARGE_PAGE;
2229 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2231 nr_superpages = sg_res / lvl_pages;
2232 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2235 * Ensure that old small page tables are
2236 * removed to make room for superpage(s).
2237 * We're adding new large pages, so make sure
2238 * we don't remove their parent tables.
2240 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2243 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2247 /* We don't need lock here, nobody else
2248 * touches the iova range
2250 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2252 static int dumps = 5;
2253 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2254 iov_pfn, tmp, (unsigned long long)pteval);
2257 debug_dma_dump_mappings(NULL);
2262 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2264 BUG_ON(nr_pages < lvl_pages);
2265 BUG_ON(sg_res < lvl_pages);
2267 nr_pages -= lvl_pages;
2268 iov_pfn += lvl_pages;
2269 phys_pfn += lvl_pages;
2270 pteval += lvl_pages * VTD_PAGE_SIZE;
2271 sg_res -= lvl_pages;
2273 /* If the next PTE would be the first in a new page, then we
2274 need to flush the cache on the entries we've just written.
2275 And then we'll need to recalculate 'pte', so clear it and
2276 let it get set again in the if (!pte) block above.
2278 If we're done (!nr_pages) we need to flush the cache too.
2280 Also if we've been setting superpages, we may need to
2281 recalculate 'pte' and switch back to smaller pages for the
2282 end of the mapping, if the trailing size is not enough to
2283 use another superpage (i.e. sg_res < lvl_pages). */
2285 if (!nr_pages || first_pte_in_page(pte) ||
2286 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2287 domain_flush_cache(domain, first_pte,
2288 (void *)pte - (void *)first_pte);
2292 if (!sg_res && nr_pages)
/*
 * Map a range in @domain and tell the hardware about it: the page
 * tables are written by __domain_mapping(), then every IOMMU the domain
 * is attached to is notified through __mapping_notify_one().
 */
2298 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2299 struct scatterlist *sg, unsigned long phys_pfn,
2300 unsigned long nr_pages, int prot)
2303 struct intel_iommu *iommu;
2305 /* Do the real mapping first */
2306 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2310 for_each_domain_iommu(iommu_id, domain) {
2311 iommu = g_iommus[iommu_id];
2312 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
/*
 * Scatterlist flavour of domain_mapping(): the physical pages come from
 * @sg, so the phys_pfn argument is passed as 0.
 */
2318 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2319 struct scatterlist *sg, unsigned long nr_pages,
2322 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
/*
 * Contiguous-range flavour of domain_mapping(): maps @nr_pages pages
 * starting at @phys_pfn, with no scatterlist (sg is NULL).
 */
2325 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326 unsigned long phys_pfn, unsigned long nr_pages,
2329 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
/*
 * Remove the context entry for @bus/@devfn on @iommu. Under iommu->lock
 * the entry is looked up (without allocating), its domain id is saved,
 * and the entry is cleared and flushed from the CPU cache. After the
 * lock is dropped, the context cache for that device and the IOTLB are
 * invalidated.
 */
2332 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2334 unsigned long flags;
2335 struct context_entry *context;
2341 spin_lock_irqsave(&iommu->lock, flags);
2342 context = iommu_context_addr(iommu, bus, devfn, 0);
2344 spin_unlock_irqrestore(&iommu->lock, flags);
2347 did_old = context_domain_id(context);
2348 context_clear_entry(context);
2349 __iommu_flush_cache(iommu, context, sizeof(*context));
2350 spin_unlock_irqrestore(&iommu->lock, flags);
2351 iommu->flush.flush_context(iommu,
2353 (((u16)bus) << 8) | devfn,
2354 DMA_CCMD_MASK_NOBIT,
2355 DMA_CCMD_DEVICE_INVL);
2356 iommu->flush.flush_iotlb(iommu,
/*
 * Take @info off both lists it lives on (its domain's device list and
 * the global device_domain_list) and clear the back-pointer stored in
 * the device's archdata. The caller must hold device_domain_lock
 * (asserted below).
 */
2363 static inline void unlink_domain_info(struct device_domain_info *info)
2365 assert_spin_locked(&device_domain_lock);
2366 list_del(&info->link);
2367 list_del(&info->global);
2369 info->dev->archdata.iommu = NULL;
/*
 * Detach every device from @domain: with device_domain_lock held, each
 * entry on the domain's device list is passed to
 * __dmar_remove_one_dev_info(). The _safe iterator is used because
 * entries are removed while walking.
 */
2372 static void domain_remove_dev_info(struct dmar_domain *domain)
2374 struct device_domain_info *info, *tmp;
2375 unsigned long flags;
2377 spin_lock_irqsave(&device_domain_lock, flags);
2378 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2379 __dmar_remove_one_dev_info(info);
2380 spin_unlock_irqrestore(&device_domain_lock, flags);
2385 * Note: we use struct device->archdata.iommu stores the info
2387 static struct dmar_domain *find_domain(struct device *dev)
2389 struct device_domain_info *info;
2391 if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2392 struct iommu_domain *domain;
2394 dev->archdata.iommu = NULL;
2395 domain = iommu_get_domain_for_dev(dev);
2397 intel_iommu_attach_device(domain, dev);
2400 /* No lock here, assumes no domain exit in normal case */
2401 info = dev->archdata.iommu;
2404 return info->domain;
/*
 * Linear search of the global device_domain_list for the entry whose
 * IOMMU segment, bus and devfn all match the arguments.
 * NOTE(review): the list is walked without taking a lock here; the
 * visible callers hold device_domain_lock around the call.
 */
2408 static inline struct device_domain_info *
2409 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2411 struct device_domain_info *info;
2413 list_for_each_entry(info, &device_domain_list, global)
2414 if (info->iommu->segment == segment && info->bus == bus &&
2415 info->devfn == devfn)
/*
 * Create the device_domain_info that ties the device at @bus/@devfn
 * behind @iommu to @domain, and set up the hardware state for it.
 *
 * Outline:
 *  - allocate and initialise the info; for a PCI device, probe whether
 *    ATS, PASID and PRI can be used (results land in info->*_supported);
 *  - with device_domain_lock held, check whether the device (or, failing
 *    that, an entry with the same segment/bus/devfn) already has a
 *    domain; if so the new info is freed again and the caller remains
 *    responsible for freeing the domain it passed in;
 *  - otherwise attach the domain to the IOMMU, link the info into the
 *    domain's and the global list and store it in dev->archdata.iommu;
 *  - for a PCI device in scalable mode, allocate the PASID table and
 *    program the RID2PASID entry (pass-through or second-level);
 *  - finally set up the context mapping for the device.
 *
 * Failures after the info has been linked are undone with
 * dmar_remove_one_dev_info().
 */
2421 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2424 struct dmar_domain *domain)
2426 struct dmar_domain *found = NULL;
2427 struct device_domain_info *info;
2428 unsigned long flags;
2431 info = alloc_devinfo_mem();
2436 info->devfn = devfn;
2437 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2438 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2441 info->domain = domain;
2442 info->iommu = iommu;
2443 info->pasid_table = NULL;
2444 info->auxd_enabled = 0;
2445 INIT_LIST_HEAD(&info->auxiliary_domains);
2447 if (dev && dev_is_pci(dev)) {
2448 struct pci_dev *pdev = to_pci_dev(info->dev);
2450 if (!pdev->untrusted &&
2451 !pci_ats_disabled() &&
2452 ecap_dev_iotlb_support(iommu->ecap) &&
2453 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2454 dmar_find_matched_atsr_unit(pdev))
2455 info->ats_supported = 1;
2457 if (sm_supported(iommu)) {
2458 if (pasid_supported(iommu)) {
2459 int features = pci_pasid_features(pdev);
2461 info->pasid_supported = features | 1;
2464 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2465 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2466 info->pri_supported = 1;
2470 spin_lock_irqsave(&device_domain_lock, flags);
2472 found = find_domain(dev);
2475 struct device_domain_info *info2;
2476 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2478 found = info2->domain;
2484 spin_unlock_irqrestore(&device_domain_lock, flags);
2485 free_devinfo_mem(info);
2486 /* Caller must free the original domain */
2490 spin_lock(&iommu->lock);
2491 ret = domain_attach_iommu(domain, iommu);
2492 spin_unlock(&iommu->lock);
2495 spin_unlock_irqrestore(&device_domain_lock, flags);
2496 free_devinfo_mem(info);
2500 list_add(&info->link, &domain->devices);
2501 list_add(&info->global, &device_domain_list);
2503 dev->archdata.iommu = info;
2504 spin_unlock_irqrestore(&device_domain_lock, flags);
2506 /* PASID table is mandatory for a PCI device in scalable mode. */
2507 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2508 ret = intel_pasid_alloc_table(dev);
2510 dev_err(dev, "PASID table allocation failed\n");
2511 dmar_remove_one_dev_info(dev);
2515 /* Setup the PASID entry for requests without PASID: */
2516 spin_lock(&iommu->lock);
2517 if (hw_pass_through && domain_type_is_si(domain))
2518 ret = intel_pasid_setup_pass_through(iommu, domain,
2519 dev, PASID_RID2PASID);
2521 ret = intel_pasid_setup_second_level(iommu, domain,
2522 dev, PASID_RID2PASID);
2523 spin_unlock(&iommu->lock);
2525 dev_err(dev, "Setup RID2PASID failed\n");
2526 dmar_remove_one_dev_info(dev);
2531 if (dev && domain_context_mapping(domain, dev)) {
2532 dev_err(dev, "Domain context map failed\n");
2533 dmar_remove_one_dev_info(dev);
/*
 * pci_for_each_dma_alias() callback: stores @alias in the u16 that
 * @opaque points to. Because every call overwrites the previous value,
 * the u16 ends up holding the alias of the last callback invocation.
 */
2540 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2542 *(u16 *)opaque = alias;
/*
 * Find the domain @dev should use, or create one.
 *
 * For a PCI device the last DMA alias is determined and the global
 * device list is searched (under device_domain_lock) for an entry at
 * that alias; if one exists its IOMMU and domain are picked up so the
 * device shares the alias's domain. Otherwise a new domain is allocated
 * and initialised with guest address width @gaw; a domain that fails to
 * initialise is destroyed again with domain_exit().
 */
2546 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2548 struct device_domain_info *info;
2549 struct dmar_domain *domain = NULL;
2550 struct intel_iommu *iommu;
2552 unsigned long flags;
2555 iommu = device_to_iommu(dev, &bus, &devfn);
2559 if (dev_is_pci(dev)) {
2560 struct pci_dev *pdev = to_pci_dev(dev);
2562 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2564 spin_lock_irqsave(&device_domain_lock, flags);
2565 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2566 PCI_BUS_NUM(dma_alias),
2569 iommu = info->iommu;
2570 domain = info->domain;
2572 spin_unlock_irqrestore(&device_domain_lock, flags);
2574 /* DMA alias already has a domain, use it */
2579 /* Allocate and initialize new domain for the device */
2580 domain = alloc_domain(0);
2583 if (domain_init(domain, iommu, gaw)) {
2584 domain_exit(domain);
/*
 * Attach @dev to @domain. For a PCI device whose last DMA alias differs
 * from its own requester id, the alias is registered first (as an entry
 * without a struct device); then the device itself is registered. Each
 * registration goes through dmar_insert_one_dev_info(), and its result
 * is compared against @domain to detect that a different domain was
 * already in place.
 */
2592 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2593 struct dmar_domain *domain)
2595 struct intel_iommu *iommu;
2596 struct dmar_domain *tmp;
2597 u16 req_id, dma_alias;
2600 iommu = device_to_iommu(dev, &bus, &devfn);
2604 req_id = ((u16)bus << 8) | devfn;
2606 if (dev_is_pci(dev)) {
2607 struct pci_dev *pdev = to_pci_dev(dev);
2609 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2611 /* register PCI DMA alias device */
2612 if (req_id != dma_alias) {
2613 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2614 dma_alias & 0xff, NULL, domain);
2616 if (!tmp || tmp != domain)
2621 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2622 if (!tmp || tmp != domain)
/*
 * Identity-map the physical address range @start..@end in @domain
 * (IOVA == physical address). The range is first reserved in the
 * domain's IOVA allocator so it is never handed out for DMA; existing
 * PTEs in the range are cleared, and the pages from the page containing
 * @start up to and including the page containing @end are then mapped
 * read/write with __domain_mapping(), whose result is returned.
 */
2628 static int iommu_domain_identity_map(struct dmar_domain *domain,
2629 unsigned long long start,
2630 unsigned long long end)
2632 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2633 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2635 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2636 dma_to_mm_pfn(last_vpfn))) {
2637 pr_err("Reserving iova failed\n");
2641 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2643 * RMRR range might have overlap with physical memory range,
2646 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2648 return __domain_mapping(domain, first_vpfn, NULL,
2649 first_vpfn, last_vpfn - first_vpfn + 1,
2650 DMA_PTE_READ|DMA_PTE_WRITE);
/*
 * Validate an identity-map request for @dev and carry it out with
 * iommu_domain_identity_map().
 *
 * The request is ignored (with a warning) when @domain is the static
 * identity domain and hardware pass-through is in use. Two firmware
 * sanity checks emit a WARN that includes the BIOS vendor, version and
 * product version: a range that ends before it starts, and a range
 * whose end does not fit in the domain's address width.
 */
2653 static int domain_prepare_identity_map(struct device *dev,
2654 struct dmar_domain *domain,
2655 unsigned long long start,
2656 unsigned long long end)
2658 /* For _hardware_ passthrough, don't bother. But for software
2659 passthrough, we do it anyway -- it may indicate a memory
2660 range which is reserved in E820, so which didn't get set
2661 up to start with in si_domain */
2662 if (domain == si_domain && hw_pass_through) {
2663 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2668 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2671 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2672 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2673 dmi_get_system_info(DMI_BIOS_VENDOR),
2674 dmi_get_system_info(DMI_BIOS_VERSION),
2675 dmi_get_system_info(DMI_PRODUCT_VERSION));
2679 if (end >> agaw_to_width(domain->agaw)) {
2680 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2681 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2682 agaw_to_width(domain->agaw),
2683 dmi_get_system_info(DMI_BIOS_VENDOR),
2684 dmi_get_system_info(DMI_BIOS_VERSION),
2685 dmi_get_system_info(DMI_PRODUCT_VERSION));
2689 return iommu_domain_identity_map(domain, start, end);
2692 static int md_domain_init(struct dmar_domain *domain, int guest_width);
/*
 * Create and populate the global static identity domain, si_domain.
 *
 * Steps visible in this listing:
 *  1. Allocate the domain with DOMAIN_FLAG_STATIC_IDENTITY and initialise it
 *     via md_domain_init() at DEFAULT_DOMAIN_ADDRESS_WIDTH; domain_exit() is
 *     called if that initialisation fails.
 *  2. For every online NUMA node, identity-map each memory pfn range.
 *  3. For every device in every RMRR's active device scope, validate the
 *     RMRR bounds (WARN_ON if end < start or end exceeds the domain width)
 *     and identity-map the region. device_is_rmrr_locked() is tested first;
 *     presumably locked devices are skipped (the action is not visible).
 *
 * @hw: init_dmars() passes hw_pass_through here; how it is consumed is not
 *      visible in this listing.
 */
2694 static int __init si_domain_init(int hw)
2696 struct dmar_rmrr_unit *rmrr;
2700 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2704 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2705 domain_exit(si_domain);
2712 for_each_online_node(nid) {
2713 unsigned long start_pfn, end_pfn;
2716 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2717 ret = iommu_domain_identity_map(si_domain,
2718 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2725 * Normally we use DMA domains for devices which have RMRRs. But we
2726 * loose this requirement for graphic and usb devices. Identity map
2727 * the RMRRs for graphic and USB devices so that they could use the
2730 for_each_rmrr_units(rmrr) {
2731 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2733 unsigned long long start = rmrr->base_address;
2734 unsigned long long end = rmrr->end_address;
2736 if (device_is_rmrr_locked(dev))
2739 if (WARN_ON(end < start ||
2740 end >> agaw_to_width(si_domain->agaw)))
2743 ret = iommu_domain_identity_map(si_domain, start, end);
/*
 * Tell whether @dev is attached to the static identity domain.
 *
 * Only a real device_domain_info hanging off dev->archdata.iommu is
 * consulted (NULL and the DUMMY_DEVICE_DOMAIN_INFO sentinel are excluded);
 * for such a device the result is whether info->domain is si_domain. The
 * fall-through return for the other cases is not visible in this listing.
 */
2752 static int identity_mapping(struct device *dev)
2754 struct device_domain_info *info;
2756 info = dev->archdata.iommu;
2757 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2758 return (info->domain == si_domain);
/*
 * Attach @dev to @domain.
 *
 * Resolves the IOMMU, bus and devfn for the device with device_to_iommu()
 * and registers it through dmar_insert_one_dev_info(). The domain returned
 * by that call is compared with @domain; the return values for the match
 * and mismatch cases are not visible in this listing.
 */
2763 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2765 struct dmar_domain *ndomain;
2766 struct intel_iommu *iommu;
2769 iommu = device_to_iommu(dev, &bus, &devfn);
2773 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2774 if (ndomain != domain)
/*
 * Search every RMRR unit's active device scope for @dev.
 *
 * A scope entry matches, among possibly other conditions not visible in this
 * listing, when is_downstream_to_pci_bridge(dev, tmp) is true for it. The
 * return statements are not visible here either.
 */
2780 static bool device_has_rmrr(struct device *dev)
2782 struct dmar_rmrr_unit *rmrr;
2787 for_each_rmrr_units(rmrr) {
2789 * Return TRUE if this RMRR contains the device that
2792 for_each_active_dev_scope(rmrr->devices,
2793 rmrr->devices_cnt, i, tmp)
2795 is_downstream_to_pci_bridge(dev, tmp)) {
2805 * device_rmrr_is_relaxable - Test whether the RMRR of this device
2806 * is relaxable (ie. is allowed to be not enforced under some conditions)
2807 * @dev: device handle
2809 * We assume that PCI USB devices with RMRRs have them largely
2810 * for historical reasons and that the RMRR space is not actively used post
2811 * boot. This exclusion may change if vendors begin to abuse it.
2813 * The same exception is made for graphics devices, with the requirement that
2814 * any use of the RMRR regions will be torn down before assigning the device
2817 * Return: true if the RMRR is relaxable, false otherwise
/*
 * Implementation note: non-PCI devices are tested for first. For PCI
 * devices the decision rests on the class code alone: IS_USB_DEVICE()
 * compares (class >> 8) with PCI_CLASS_SERIAL_USB and IS_GFX_DEVICE()
 * compares (class >> 16) with PCI_BASE_CLASS_DISPLAY.
 */
2819 static bool device_rmrr_is_relaxable(struct device *dev)
2821 struct pci_dev *pdev;
2823 if (!dev_is_pci(dev))
2826 pdev = to_pci_dev(dev);
2827 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2834 * There are a couple cases where we need to restrict the functionality of
2835 * devices associated with RMRRs. The first is when evaluating a device for
2836 * identity mapping because problems exist when devices are moved in and out
2837 * of domains and their respective RMRR information is lost. This means that
2838 * a device with associated RMRRs will never be in a "passthrough" domain.
2839 * The second is use of the device through the IOMMU API. This interface
2840 * expects to have full control of the IOVA space for the device. We cannot
2841 * satisfy both the requirement that RMRR access is maintained and have an
2842 * unencumbered IOVA space. We also have no ability to quiesce the device's
2843 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2844 * We therefore prevent devices associated with an RMRR from participating in
2845 * the IOMMU API, which eliminates them from device assignment.
2847 * In both cases, devices which have relaxable RMRRs are not concerned by this
2848 * restriction. See device_rmrr_is_relaxable comment.
/*
 * Implementation note: two predicates are evaluated in order,
 * device_has_rmrr() and then device_rmrr_is_relaxable(). The values returned
 * after each test are not visible in this listing.
 */
2850 static bool device_is_rmrr_locked(struct device *dev)
2852 if (!device_has_rmrr(dev))
2855 if (device_rmrr_is_relaxable(dev))
2862 * Return the required default domain type for a specific device.
2864 * @dev: the device being queried
2868 * - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2869 * - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2870 * - 0: both identity and dynamic domains work for this device
/*
 * Implementation note: for PCI devices the tests run in a fixed order and
 * the first one that matches decides the result: RMRR-locked and untrusted
 * devices get IOMMU_DOMAIN_DMA, the Azalia and graphics quirks (when enabled
 * in iommu_identity_mapping) get IOMMU_DOMAIN_IDENTITY, and conventional-PCI
 * devices off the root bus as well as PCI bridges get IOMMU_DOMAIN_DMA.
 */
2872 static int device_def_domain_type(struct device *dev)
2874 if (dev_is_pci(dev)) {
2875 struct pci_dev *pdev = to_pci_dev(dev);
2877 if (device_is_rmrr_locked(dev))
2878 return IOMMU_DOMAIN_DMA;
2881 * Prevent any device marked as untrusted from getting
2882 * placed into the statically identity mapping domain.
2884 if (pdev->untrusted)
2885 return IOMMU_DOMAIN_DMA;
2887 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2888 return IOMMU_DOMAIN_IDENTITY;
2890 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2891 return IOMMU_DOMAIN_IDENTITY;
2894 * We want to start off with all devices in the 1:1 domain, and
2895 * take them out later if we find they can't access all of memory.
2897 * However, we can't do this for PCI devices behind bridges,
2898 * because all PCI devices behind the same bridge will end up
2899 * with the same source-id on their transactions.
2901 * Practically speaking, we can't change things around for these
2902 * devices at run-time, because we can't be sure there'll be no
2903 * DMA transactions in flight for any of their siblings.
2905 * So PCI devices (unless they're on the root bus) as well as
2906 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2907 * the 1:1 domain, just in _case_ one of their siblings turns out
2908 * not to be able to map all of memory.
2910 if (!pci_is_pcie(pdev)) {
2911 if (!pci_is_root_bus(pdev->bus))
2912 return IOMMU_DOMAIN_DMA;
2913 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2914 return IOMMU_DOMAIN_DMA;
2915 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2916 return IOMMU_DOMAIN_DMA;
2918 if (device_has_rmrr(dev))
2919 return IOMMU_DOMAIN_DMA;
/* No hard requirement: identity only if IDENTMAP_ALL is set, else 0. */
2922 return (iommu_identity_mapping & IDENTMAP_ALL) ?
2923 IOMMU_DOMAIN_IDENTITY : 0;
/*
 * Select the invalidation backend for @iommu.
 *
 * Pending faults are cleared with dmar_fault(-1, iommu) and queued
 * invalidation is disabled with dmar_disable_qi(); the condition guarding
 * these two calls is not visible in this listing. dmar_enable_qi() is then
 * attempted: a non-zero result installs the register-based flush callbacks
 * (__iommu_flush_context / __iommu_flush_iotlb), otherwise the queued
 * variants (qi_flush_context / qi_flush_iotlb) are installed. The chosen
 * mode is logged.
 */
2926 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2929 * Start from the sane iommu hardware state.
2930 * If the queued invalidation is already initialized by us
2931 * (for example, while enabling interrupt-remapping) then
2932 * we got the things already rolling from a sane state.
2936 * Clear any previous faults.
2938 dmar_fault(-1, iommu);
2940 * Disable queued invalidation if supported and already enabled
2941 * before OS handover.
2943 dmar_disable_qi(iommu);
2946 if (dmar_enable_qi(iommu)) {
2948 * Queued Invalidate not enabled, use Register Based Invalidate
2950 iommu->flush.flush_context = __iommu_flush_context;
2951 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2952 pr_info("%s: Using Register based invalidation\n",
2955 iommu->flush.flush_context = qi_flush_context;
2956 iommu->flush.flush_iotlb = qi_flush_iotlb;
2957 pr_info("%s: Using Queued invalidation\n", iommu->name);
/*
 * Copy the context entries reachable from one root entry of the previous
 * kernel (@old_re) into freshly allocated context tables, recording those
 * tables in @tbl.
 *
 * The remaining parameters are declared on a line not visible in this
 * listing; the body uses them as `bus` and `ext`. With `ext` set, the bus
 * owns two consecutive slots in @tbl (tbl_idx = bus * 2) and the entry index
 * is (devfn * 2) % 256; otherwise one slot and idx = devfn.
 *
 * For each of the 256 devfn values the old table is mapped with memremap(),
 * a new page is obtained from alloc_pgtable_page(), and the old entry is
 * copied. Entries that are not present are tested for with
 * __context_present(). For the others the domain id is flagged in
 * iommu->domain_ids, PASID enable is cleared and the "copied" marker is set.
 * Completed tables are stored in @tbl and flushed with
 * __iommu_flush_cache().
 */
2961 static int copy_context_table(struct intel_iommu *iommu,
2962 struct root_entry *old_re,
2963 struct context_entry **tbl,
2966 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2967 struct context_entry *new_ce = NULL, ce;
2968 struct context_entry *old_ce = NULL;
2969 struct root_entry re;
2970 phys_addr_t old_ce_phys;
2972 tbl_idx = ext ? bus * 2 : bus;
/* Work on a local snapshot of the old root entry. */
2973 memcpy(&re, old_re, sizeof(re));
2975 for (devfn = 0; devfn < 256; devfn++) {
2976 /* First calculate the correct index */
2977 idx = (ext ? devfn * 2 : devfn) % 256;
2980 /* First save what we may have and clean up */
2982 tbl[tbl_idx] = new_ce;
2983 __iommu_flush_cache(iommu, new_ce,
2993 old_ce_phys = root_entry_lctp(&re);
2995 old_ce_phys = root_entry_uctp(&re);
2998 if (ext && devfn == 0) {
2999 /* No LCTP, try UCTP */
3008 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3013 new_ce = alloc_pgtable_page(iommu->node);
3020 /* Now copy the context entry */
3021 memcpy(&ce, old_ce + idx, sizeof(ce));
3023 if (!__context_present(&ce))
3026 did = context_domain_id(&ce);
/* Only ids within the hardware's domain count are recorded as in use. */
3027 if (did >= 0 && did < cap_ndoms(iommu->cap))
3028 set_bit(did, iommu->domain_ids);
3031 * We need a marker for copied context entries. This
3032 * marker needs to work for the old format as well as
3033 * for extended context entries.
3035 * Bit 67 of the context entry is used. In the old
3036 * format this bit is available to software, in the
3037 * extended format it is the PGE bit, but PGE is ignored
3038 * by HW if PASIDs are disabled (and thus still
3041 * So disable PASIDs first and then mark the entry
3042 * copied. This means that we don't copy PASID
3043 * translations from the old kernel, but this is fine as
3044 * faults there are not fatal.
3046 context_clear_pasid_enable(&ce);
3047 context_set_copied(&ce);
3052 tbl[tbl_idx + pos] = new_ce;
3054 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
/*
 * Take over the translation structures a previous kernel left programmed in
 * @iommu.
 *
 * The current root table address and the RTT (extended) bit are read from
 * DMAR_RTADDR_REG and compared with what this kernel would use
 * (ecap_ecs()); per the comment below, a mismatch means nothing is copied.
 * The old root table is mapped with memremap(), a temporary array of 256
 * (512 when extended) context-table pointers is allocated, and
 * copy_context_table() is run for each of the 256 buses. Under iommu->lock
 * the new tables' physical addresses, with bit 0 set, are then written into
 * iommu->root_entry[bus].lo (and .hi for the second table in extended
 * mode), and the root table is flushed with __iommu_flush_cache().
 *
 * Error returns and the release of the temporary mappings are not visible
 * in this listing.
 */
3063 static int copy_translation_tables(struct intel_iommu *iommu)
3065 struct context_entry **ctxt_tbls;
3066 struct root_entry *old_rt;
3067 phys_addr_t old_rt_phys;
3068 int ctxt_table_entries;
3069 unsigned long flags;
3074 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3075 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3076 new_ext = !!ecap_ecs(iommu->ecap);
3079 * The RTT bit can only be changed when translation is disabled,
3080 * but disabling translation means to open a window for data
3081 * corruption. So bail out and don't copy anything if we would
3082 * have to change the bit.
3087 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3091 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3095 /* This is too big for the stack - allocate it from slab */
3096 ctxt_table_entries = ext ? 512 : 256;
3098 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3102 for (bus = 0; bus < 256; bus++) {
3103 ret = copy_context_table(iommu, &old_rt[bus],
3104 ctxt_tbls, bus, ext);
3106 pr_err("%s: Failed to copy context table for bus %d\n",
3112 spin_lock_irqsave(&iommu->lock, flags);
3114 /* Context tables are copied, now write them to the root_entry table */
3115 for (bus = 0; bus < 256; bus++) {
3116 int idx = ext ? bus * 2 : bus;
3119 if (ctxt_tbls[idx]) {
3120 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3121 iommu->root_entry[bus].lo = val;
3124 if (!ext || !ctxt_tbls[idx + 1])
3127 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3128 iommu->root_entry[bus].hi = val;
3131 spin_unlock_irqrestore(&iommu->lock, flags);
3135 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
/*
 * Boot-time bring-up of all DMAR units. Phases visible in this listing:
 *
 *  1. Count the DRHD units (bounded by DMAR_UNITS_SUPPORTED) and allocate
 *     the g_iommus[] lookup array, sized up to DMAR_UNITS_SUPPORTED so that
 *     hot-added units fit.
 *  2. Per IOMMU: disable translation on ignored units; lower
 *     intel_pasid_max_id to the smallest PASID space any unit supports;
 *     record the unit in g_iommus[]; set up invalidation and domain ids;
 *     allocate the root entry table. If translation was left enabled and
 *     this is not a kdump kernel, it is disabled. If it is still marked
 *     pre-enabled, copy_translation_tables() is attempted, and translation
 *     is disabled when that copy fails. hw_pass_through is cleared as soon
 *     as one unit lacks pass-through support.
 *  3. On every active unit: flush the write buffer, program the root entry
 *     and globally invalidate the context cache and the IOTLB.
 *  4. Derive the iommu_identity_mapping flags and build si_domain.
 *  5. Per IOMMU: disable protected memory regions on ignored units, flush
 *     the write buffer, enable the page request queue when PASID and PRS
 *     are both supported (CONFIG_INTEL_IOMMU_SVM only), and call
 *     dmar_set_interrupt().
 *
 * The last visible loop disables and frees every active IOMMU; presumably
 * it is the error-unwind path (its label is not visible in this listing).
 */
3145 static int __init init_dmars(void)
3147 struct dmar_drhd_unit *drhd;
3148 struct intel_iommu *iommu;
3154 * initialize and program root entry to not present
3157 for_each_drhd_unit(drhd) {
3159 * lock not needed as this is only incremented in the single
3160 * threaded kernel __init code path all other access are read
3163 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3167 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3170 /* Preallocate enough resources for IOMMU hot-addition */
3171 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3172 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3174 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3177 pr_err("Allocating global iommu array failed\n");
3182 for_each_iommu(iommu, drhd) {
3183 if (drhd->ignored) {
3184 iommu_disable_translation(iommu);
3189 * Find the max pasid size of all IOMMU's in the system.
3190 * We need to ensure the system pasid table is no bigger
3191 * than the smallest supported.
3193 if (pasid_supported(iommu)) {
3194 u32 temp = 2 << ecap_pss(iommu->ecap);
3196 intel_pasid_max_id = min_t(u32, temp,
3197 intel_pasid_max_id);
3200 g_iommus[iommu->seq_id] = iommu;
3202 intel_iommu_init_qi(iommu);
3204 ret = iommu_init_domains(iommu);
3208 init_translation_status(iommu);
3210 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3211 iommu_disable_translation(iommu);
3212 clear_translation_pre_enabled(iommu);
3213 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3219 * we could share the same root & context tables
3220 * among all IOMMU's. Need to Split it later.
3222 ret = iommu_alloc_root_entry(iommu);
3226 if (translation_pre_enabled(iommu)) {
3227 pr_info("Translation already enabled - trying to copy translation structures\n");
3229 ret = copy_translation_tables(iommu);
3232 * We found the IOMMU with translation
3233 * enabled - but failed to copy over the
3234 * old root-entry table. Try to proceed
3235 * by disabling translation now and
3236 * allocating a clean root-entry table.
3237 * This might cause DMAR faults, but
3238 * probably the dump will still succeed.
3240 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3242 iommu_disable_translation(iommu);
3243 clear_translation_pre_enabled(iommu);
3245 pr_info("Copied translation tables from previous kernel for %s\n",
3250 if (!ecap_pass_through(iommu->ecap))
3251 hw_pass_through = 0;
3252 #ifdef CONFIG_INTEL_IOMMU_SVM
3253 if (pasid_supported(iommu))
3254 intel_svm_init(iommu);
3259 * Now that qi is enabled on all iommus, set the root entry and flush
3260 * caches. This is required on some Intel X58 chipsets, otherwise the
3261 * flush_context function will loop forever and the boot hangs.
3263 for_each_active_iommu(iommu, drhd) {
3264 iommu_flush_write_buffer(iommu);
3265 iommu_set_root_entry(iommu);
3266 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3267 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3270 if (iommu_pass_through)
3271 iommu_identity_mapping |= IDENTMAP_ALL;
3273 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3278 iommu_identity_mapping |= IDENTMAP_GFX;
3280 check_tylersburg_isoch();
3282 ret = si_domain_init(hw_pass_through);
3289 * global invalidate context cache
3290 * global invalidate iotlb
3291 * enable translation
3293 for_each_iommu(iommu, drhd) {
3294 if (drhd->ignored) {
3296 * we always have to disable PMRs or DMA may fail on
3300 iommu_disable_protect_mem_regions(iommu);
3304 iommu_flush_write_buffer(iommu);
3306 #ifdef CONFIG_INTEL_IOMMU_SVM
3307 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3309 * Call dmar_alloc_hwirq() with dmar_global_lock held,
3310 * could cause possible lock race condition.
3312 up_write(&dmar_global_lock);
3313 ret = intel_svm_enable_prq(iommu);
3314 down_write(&dmar_global_lock);
3319 ret = dmar_set_interrupt(iommu);
3327 for_each_active_iommu(iommu, drhd) {
3328 disable_dmar_iommu(iommu);
3329 free_dmar_iommu(iommu);
/*
 * Allocate an IOVA range for @dev from @domain's allocator.
 *
 * @nrpages is rounded up to a power of two so that the whole size-aligned
 * region is reserved, and @dma_mask is clamped to what the domain's guest
 * address width can express. Unless dmar_forcedac is set, devices whose mask
 * reaches above 32 bits first get an attempt below DMA_BIT_MASK(32); the
 * final attempt uses the full (clamped) mask.
 *
 * A failed allocation is logged as a complete, newline-terminated message,
 * and the page count is printed with %lu because nrpages is an unsigned long.
 */
3338 /* This takes a number of _MM_ pages, not VTD pages */
3339 static unsigned long intel_alloc_iova(struct device *dev,
3340 struct dmar_domain *domain,
3341 unsigned long nrpages, uint64_t dma_mask)
3343 unsigned long iova_pfn;
3345 /* Restrict dma_mask to the width that the iommu can handle */
3346 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3347 /* Ensure we reserve the whole size-aligned region */
3348 nrpages = __roundup_pow_of_two(nrpages);
3350 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3352 * First try to allocate an io virtual address in
3353 * DMA_BIT_MASK(32) and if that fails then try allocating
3356 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3357 IOVA_PFN(DMA_BIT_MASK(32)), false);
3361 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3362 IOVA_PFN(dma_mask), true);
3363 if (unlikely(!iova_pfn)) {
3364 dev_err(dev, "Allocating %lu-page iova failed\n", nrpages);
/*
 * Give @dev a domain of its own.
 *
 * The device is first looked up with find_domain(); per the comment below it
 * is expected not to be attached yet. A domain is then obtained with
 * find_or_alloc_domain() at DEFAULT_DOMAIN_ADDRESS_WIDTH, the RMRR units'
 * active device scopes are walked and reserved regions are identity-mapped
 * through domain_prepare_identity_map() (the match condition is not visible
 * in this listing), and the device is bound with set_domain_for_dev(). If
 * binding yields a different domain, or none, the new domain is torn down
 * with domain_exit(). The domain type is finally set to IOMMU_DOMAIN_DMA.
 */
3371 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3373 struct dmar_domain *domain, *tmp;
3374 struct dmar_rmrr_unit *rmrr;
3375 struct device *i_dev;
3378 /* Device shouldn't be attached by any domains. */
3379 domain = find_domain(dev);
3383 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3387 /* We have a new domain - setup possible RMRRs for the device */
3389 for_each_rmrr_units(rmrr) {
3390 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3395 ret = domain_prepare_identity_map(dev, domain,
3399 dev_err(dev, "Mapping reserved region failed\n");
3404 tmp = set_domain_for_dev(dev, domain);
3405 if (!tmp || domain != tmp) {
3406 domain_exit(domain);
3412 dev_err(dev, "Allocating domain failed\n");
3414 domain->domain.type = IOMMU_DOMAIN_DMA;
/*
 * Note that this predicate can have side effects. For a device currently in
 * the identity domain, the effective DMA mask (the smaller of *dev->dma_mask
 * and a non-zero dev->coherent_dma_mask) is compared with
 * dma_get_required_mask(). A device whose mask is too small is removed from
 * si_domain and a DMA domain is requested for it via
 * iommu_request_dma_domain_for_dev(). The block that follows marks the
 * current domain DOMAIN_FLAG_LOSE_CHILDREN and falls back to
 * get_private_domain_for_dev(); presumably it runs only when that request
 * fails (the guarding condition is not visible in this listing).
 */
3419 /* Check if the dev needs to go through non-identity map and unmap process.*/
3420 static bool iommu_need_mapping(struct device *dev)
3424 if (iommu_dummy(dev))
3427 ret = identity_mapping(dev);
3429 u64 dma_mask = *dev->dma_mask;
3431 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3432 dma_mask = dev->coherent_dma_mask;
3434 if (dma_mask >= dma_get_required_mask(dev))
3438 * 32 bit DMA is removed from si_domain and fall back to
3439 * non-identity mapping.
3441 dmar_remove_one_dev_info(dev);
3442 ret = iommu_request_dma_domain_for_dev(dev);
3444 struct iommu_domain *domain;
3445 struct dmar_domain *dmar_domain;
3447 domain = iommu_get_domain_for_dev(dev);
3449 dmar_domain = to_dmar_domain(domain);
3450 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3452 dmar_remove_one_dev_info(dev);
3453 get_private_domain_for_dev(dev);
3456 dev_info(dev, "32bit DMA uses non-identity mapping\n");
/*
 * Map the physical range [paddr, paddr + size) for @dev and build its bus
 * address.
 *
 * The device's domain is looked up with find_domain() and the range is
 * converted to a page count with aligned_nrpages(). An IOVA range limited by
 * @dma_mask is allocated, and the pages are mapped with
 * domain_pfn_mapping() using a protection derived from @dir: read access
 * for DMA_TO_DEVICE and DMA_BIDIRECTIONAL, and also whenever the IOMMU
 * lacks zero-length-read support (cap_zlr()); write access for
 * DMA_FROM_DEVICE and DMA_BIDIRECTIONAL. The address computed is the IOVA
 * page address plus the offset of @paddr within its page.
 *
 * DMA_MAPPING_ERROR is returned on the visible failure paths; the last one
 * first releases the IOVA range and logs the request.
 */
3462 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3463 size_t size, int dir, u64 dma_mask)
3465 struct dmar_domain *domain;
3466 phys_addr_t start_paddr;
3467 unsigned long iova_pfn;
3470 struct intel_iommu *iommu;
3471 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3473 BUG_ON(dir == DMA_NONE);
3475 domain = find_domain(dev);
3477 return DMA_MAPPING_ERROR;
3479 iommu = domain_get_iommu(domain);
/* From here on `size` holds the page count from aligned_nrpages(). */
3480 size = aligned_nrpages(paddr, size);
3482 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3487 * Check if DMAR supports zero-length reads on write only
3490 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3491 !cap_zlr(iommu->cap))
3492 prot |= DMA_PTE_READ;
3493 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3494 prot |= DMA_PTE_WRITE;
3496 * paddr - (paddr + size) might be partial page, we should map the whole
3497 * page. Note: if two part of one page are separately mapped, we
3498 * might have two guest_addr mapping to the same host paddr, but this
3499 * is not a big problem
3501 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3502 mm_to_dma_pfn(paddr_pfn), size, prot);
3506 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3507 start_paddr += paddr & ~PAGE_MASK;
3512 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3513 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3514 size, (unsigned long long)paddr, dir);
3515 return DMA_MAPPING_ERROR;
/*
 * .map_page callback. When iommu_need_mapping() says the device is
 * translated, the page's physical address plus @offset is mapped through
 * __intel_map_single() using the device's streaming mask (*dev->dma_mask);
 * otherwise the request is handed to dma_direct_map_page().
 */
3518 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3519 unsigned long offset, size_t size,
3520 enum dma_data_direction dir,
3521 unsigned long attrs)
3523 if (iommu_need_mapping(dev))
3524 return __intel_map_single(dev, page_to_phys(page) + offset,
3525 size, dir, *dev->dma_mask);
3526 return dma_direct_map_page(dev, page, offset, size, dir, attrs);
/*
 * .map_resource callback. A translated device has @phys_addr mapped through
 * __intel_map_single() (the mask argument is on a line not visible in this
 * listing); otherwise the request is handed to dma_direct_map_resource().
 */
3529 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3530 size_t size, enum dma_data_direction dir,
3531 unsigned long attrs)
3533 if (iommu_need_mapping(dev))
3534 return __intel_map_single(dev, phys_addr, size, dir,
3536 return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
/*
 * Tear down the IOMMU mapping that covers [dev_addr, dev_addr + size).
 *
 * The range is converted to VT-d page frames and removed from the page
 * tables with domain_unmap(), which hands back a list of pages to free.
 * Two completion strategies are visible:
 *  - immediate: when intel_iommu_strict is set, the device is an untrusted
 *    PCI device, or the domain has no IOVA flush queue, the IOTLB is flushed
 *    for the range, the IOVA is released and the page list is freed.
 *  - deferred: otherwise the IOVA and page list are queued with
 *    queue_iova().
 */
3539 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3541 struct dmar_domain *domain;
3542 unsigned long start_pfn, last_pfn;
3543 unsigned long nrpages;
3544 unsigned long iova_pfn;
3545 struct intel_iommu *iommu;
3546 struct page *freelist;
3547 struct pci_dev *pdev = NULL;
3549 domain = find_domain(dev);
3552 iommu = domain_get_iommu(domain);
3554 iova_pfn = IOVA_PFN(dev_addr);
3556 nrpages = aligned_nrpages(dev_addr, size);
3557 start_pfn = mm_to_dma_pfn(iova_pfn);
3558 last_pfn = start_pfn + nrpages - 1;
/* pdev stays NULL for non-PCI devices, so the untrusted test is skipped. */
3560 if (dev_is_pci(dev))
3561 pdev = to_pci_dev(dev);
3563 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3565 freelist = domain_unmap(domain, start_pfn, last_pfn);
3567 if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3568 !has_iova_flush_queue(&domain->iovad)) {
3569 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3570 nrpages, !freelist, 0);
3572 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3573 dma_free_pagelist(freelist);
3575 queue_iova(&domain->iovad, iova_pfn, nrpages,
3576 (unsigned long)freelist);
3578 * queue up the release of the unmap to save the 1/6th of the
3579 * cpu used up by the iotlb flush operation...
/*
 * .unmap_page callback: intel_unmap() for translated devices,
 * dma_direct_unmap_page() for the others (the `else` joining the two calls
 * is on a line not visible in this listing).
 */
3584 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3585 size_t size, enum dma_data_direction dir,
3586 unsigned long attrs)
3588 if (iommu_need_mapping(dev))
3589 intel_unmap(dev, dev_addr, size);
3591 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
/*
 * .unmap_resource callback: translated devices go through intel_unmap().
 * No direct-mapping counterpart is called in the lines visible here.
 */
3594 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3595 size_t size, enum dma_data_direction dir, unsigned long attrs)
3597 if (iommu_need_mapping(dev))
3598 intel_unmap(dev, dev_addr, size);
/*
 * .alloc callback: allocate and map a coherent buffer.
 *
 * Untranslated devices are served by dma_direct_alloc(). Otherwise @size is
 * rounded up to whole pages; when @flags allow blocking,
 * dma_alloc_from_contiguous() is tried, and alloc_pages() is also available
 * (the condition selecting it is not visible in this listing). The buffer
 * is zeroed and mapped through __intel_map_single() against
 * dev->coherent_dma_mask. On success the kernel virtual address is returned
 * and *dma_handle holds the bus address; on mapping failure the pages are
 * given back to the contiguous allocator or, failing that, to the page
 * allocator.
 */
3601 static void *intel_alloc_coherent(struct device *dev, size_t size,
3602 dma_addr_t *dma_handle, gfp_t flags,
3603 unsigned long attrs)
3605 struct page *page = NULL;
3608 if (!iommu_need_mapping(dev))
3609 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3611 size = PAGE_ALIGN(size);
3612 order = get_order(size);
3614 if (gfpflags_allow_blocking(flags)) {
3615 unsigned int count = size >> PAGE_SHIFT;
3617 page = dma_alloc_from_contiguous(dev, count, order,
3618 flags & __GFP_NOWARN);
3622 page = alloc_pages(flags, order);
3625 memset(page_address(page), 0, size);
3627 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3629 dev->coherent_dma_mask);
3630 if (*dma_handle != DMA_MAPPING_ERROR)
3631 return page_address(page);
3632 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3633 __free_pages(page, order);
/*
 * .free callback, the counterpart of intel_alloc_coherent().
 *
 * Untranslated devices are handed to dma_direct_free(). Otherwise the
 * page-aligned range is unmapped with intel_unmap() and the pages are
 * returned to the contiguous allocator or, if they did not come from
 * there, to the page allocator.
 */
3638 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3639 dma_addr_t dma_handle, unsigned long attrs)
3642 struct page *page = virt_to_page(vaddr);
3644 if (!iommu_need_mapping(dev))
3645 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3647 size = PAGE_ALIGN(size);
3648 order = get_order(size);
3650 intel_unmap(dev, dma_handle, size);
3651 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3652 __free_pages(page, order);
/*
 * .unmap_sg callback.
 *
 * Untranslated devices are handed to dma_direct_unmap_sg(). Otherwise the
 * page counts of all @nelems entries are summed and the whole list is
 * unmapped with a single intel_unmap() call that starts at the page-aligned
 * DMA address of the first entry.
 */
3655 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3656 int nelems, enum dma_data_direction dir,
3657 unsigned long attrs)
3659 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3660 unsigned long nrpages = 0;
3661 struct scatterlist *sg;
3664 if (!iommu_need_mapping(dev))
3665 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3667 for_each_sg(sglist, sg, nelems, i) {
3668 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3671 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
/*
 * .map_sg callback.
 *
 * Untranslated devices are handed to dma_direct_map_sg(). Otherwise the page
 * counts of all entries are summed, one IOVA range large enough for the
 * whole list is allocated, and the list is mapped into it with
 * domain_sg_mapping(). The protection bits are derived from @dir exactly as
 * in __intel_map_single(). If the mapping fails, the page tables built for
 * the range are freed and the IOVA range is released.
 *
 * The return values are not visible in this listing.
 */
3674 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3675 enum dma_data_direction dir, unsigned long attrs)
3678 struct dmar_domain *domain;
3681 unsigned long iova_pfn;
3683 struct scatterlist *sg;
3684 unsigned long start_vpfn;
3685 struct intel_iommu *iommu;
3687 BUG_ON(dir == DMA_NONE);
3688 if (!iommu_need_mapping(dev))
3689 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3691 domain = find_domain(dev);
3695 iommu = domain_get_iommu(domain);
3697 for_each_sg(sglist, sg, nelems, i)
3698 size += aligned_nrpages(sg->offset, sg->length);
3700 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3703 sglist->dma_length = 0;
3708 * Check if DMAR supports zero-length reads on write only
3711 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3712 !cap_zlr(iommu->cap))
3713 prot |= DMA_PTE_READ;
3714 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3715 prot |= DMA_PTE_WRITE;
3717 start_vpfn = mm_to_dma_pfn(iova_pfn);
3719 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3720 if (unlikely(ret)) {
3721 dma_pte_free_pagetable(domain, start_vpfn,
3722 start_vpfn + size - 1,
3723 agaw_to_level(domain->agaw) + 1);
3724 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
/*
 * DMA mapping operations table wiring up the intel_* callbacks defined in
 * this file. .dma_supported is not overridden by a driver-specific routine;
 * it uses dma_direct_supported.
 */
3731 static const struct dma_map_ops intel_dma_ops = {
3732 .alloc = intel_alloc_coherent,
3733 .free = intel_free_coherent,
3734 .map_sg = intel_map_sg,
3735 .unmap_sg = intel_unmap_sg,
3736 .map_page = intel_map_page,
3737 .unmap_page = intel_unmap_page,
3738 .map_resource = intel_map_resource,
3739 .unmap_resource = intel_unmap_resource,
3740 .dma_supported = dma_direct_supported,
/*
 * Create the "iommu_domain" slab cache, whose objects are
 * sizeof(struct dmar_domain) bytes, and store it in iommu_domain_cache. A
 * failed creation is logged; the return values are not visible in this
 * listing.
 */
3743 static inline int iommu_domain_cache_init(void)
3747 iommu_domain_cache = kmem_cache_create("iommu_domain",
3748 sizeof(struct dmar_domain),
3753 if (!iommu_domain_cache) {
3754 pr_err("Couldn't create iommu_domain cache\n");
/*
 * Create the "iommu_devinfo" slab cache, whose objects are
 * sizeof(struct device_domain_info) bytes, and store it in
 * iommu_devinfo_cache. A failed creation is logged; the return values are
 * not visible in this listing.
 */
3761 static inline int iommu_devinfo_cache_init(void)
3765 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3766 sizeof(struct device_domain_info),
3770 if (!iommu_devinfo_cache) {
3771 pr_err("Couldn't create devinfo cache\n");
/*
 * Set up the allocators this driver depends on, in order: the IOVA cache
 * (iova_cache_get()), the domain cache and the device-info cache. The one
 * unwind step visible in this listing destroys the domain cache; the
 * branching between the steps is not visible here.
 */
3778 static int __init iommu_init_mempool(void)
3781 ret = iova_cache_get();
3785 ret = iommu_domain_cache_init();
3789 ret = iommu_devinfo_cache_init();
3793 kmem_cache_destroy(iommu_domain_cache);
/*
 * Destroy the device-info and domain slab caches, in the reverse of the
 * order in which iommu_init_mempool() creates them.
 */
3800 static void __init iommu_exit_mempool(void)
3802 kmem_cache_destroy(iommu_devinfo_cache);
3803 kmem_cache_destroy(iommu_domain_cache);
/*
 * PCI enable-time fixup for PCI_DEVICE_ID_INTEL_IOAT_SNB devices.
 *
 * The dword at config offset 0xb0 of device 0, function 0 on the device's
 * own bus is read and masked to its upper 16 bits to obtain vtbar. The DRHD
 * unit matched to the device is then expected to have its registers at
 * vtbar + 0xa000. If no unit matches, or the address differs, a one-time
 * firmware-workaround taint warning is raised and the device is marked with
 * DUMMY_DEVICE_DOMAIN_INFO.
 */
3807 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3809 struct dmar_drhd_unit *drhd;
3813 /* We know that this device on this chipset has its own IOMMU.
3814 * If we find it under a different IOMMU, then the BIOS is lying
3815 * to us. Hope that the IOMMU for this device is actually
3816 * disabled, and it needs no translation...
3818 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3820 /* "can't" happen */
3821 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3824 vtbar &= 0xffff0000;
3826 /* we know that this iommu should be at offset 0xa000 from vtbar */
3827 drhd = dmar_find_matched_drhd_unit(pdev);
3828 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3829 TAINT_FIRMWARE_WORKAROUND,
3830 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3831 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3833 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
/*
 * Decide which DRHD units, and which devices, are left untranslated.
 *
 * First pass: for every unit that is not include_all, its active device
 * scope is walked and the loop index is compared with devices_cnt to detect
 * a unit without devices (the loop body and the resulting action are not
 * visible in this listing).
 *
 * Second pass: among active units that are not include_all, those whose
 * scope contains a non-PCI or non-graphics device are passed over. For
 * units serving only graphics devices, when dmar_map_gfx is clear every
 * device in scope is marked with DUMMY_DEVICE_DOMAIN_INFO.
 */
3835 static void __init init_no_remapping_devices(void)
3837 struct dmar_drhd_unit *drhd;
3841 for_each_drhd_unit(drhd) {
3842 if (!drhd->include_all) {
3843 for_each_active_dev_scope(drhd->devices,
3844 drhd->devices_cnt, i, dev)
3846 /* ignore DMAR unit if no devices exist */
3847 if (i == drhd->devices_cnt)
3852 for_each_active_drhd_unit(drhd) {
3853 if (drhd->include_all)
3856 for_each_active_dev_scope(drhd->devices,
3857 drhd->devices_cnt, i, dev)
3858 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3860 if (i < drhd->devices_cnt)
3863 /* This IOMMU has *only* gfx devices. Either bypass it or
3864 set the gfx_mapped flag, as appropriate */
3865 if (!dmar_map_gfx) {
3867 for_each_active_dev_scope(drhd->devices,
3868 drhd->devices_cnt, i, dev)
3869 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3874 #ifdef CONFIG_SUSPEND
/*
 * Reprogram the IOMMU hardware; iommu_resume() calls this first.
 *
 * Queued invalidation is re-enabled on active units with
 * dmar_reenable_qi() (the condition guarding that call is not visible in
 * this listing). Then, for each unit: ignored units only have their
 * protected memory regions disabled; the others get a write-buffer flush,
 * the root entry programmed, global context-cache and IOTLB invalidations,
 * translation enabled and protected memory regions disabled.
 */
3875 static int init_iommu_hw(void)
3877 struct dmar_drhd_unit *drhd;
3878 struct intel_iommu *iommu = NULL;
3880 for_each_active_iommu(iommu, drhd)
3882 dmar_reenable_qi(iommu);
3884 for_each_iommu(iommu, drhd) {
3885 if (drhd->ignored) {
3887 * we always have to disable PMRs or DMA may fail on
3891 iommu_disable_protect_mem_regions(iommu);
3895 iommu_flush_write_buffer(iommu);
3897 iommu_set_root_entry(iommu);
3899 iommu->flush.flush_context(iommu, 0, 0, 0,
3900 DMA_CCMD_GLOBAL_INVL);
3901 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3902 iommu_enable_translation(iommu);
3903 iommu_disable_protect_mem_regions(iommu);
/*
 * Issue a global context-cache invalidation followed by a global IOTLB
 * flush on every active IOMMU, through each unit's installed flush
 * callbacks.
 */
3909 static void iommu_flush_all(void)
3911 struct dmar_drhd_unit *drhd;
3912 struct intel_iommu *iommu;
3914 for_each_active_iommu(iommu, drhd) {
3915 iommu->flush.flush_context(iommu, 0, 0, 0,
3916 DMA_CCMD_GLOBAL_INVL);
3917 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3918 DMA_TLB_GLOBAL_FLUSH);
/*
 * Syscore suspend hook.
 *
 * A save area of MAX_SR_DMAR_REGS 32-bit slots is allocated for every active
 * IOMMU. Translation is then disabled on each unit and, under
 * register_lock, the four fault-event registers (FECTL, FEDATA, FEADDR,
 * FEUADDR) are read into the save area. The final loop frees every save
 * area; presumably it is the allocation-failure path (its label and the
 * return values are not visible in this listing).
 */
3922 static int iommu_suspend(void)
3924 struct dmar_drhd_unit *drhd;
3925 struct intel_iommu *iommu = NULL;
3928 for_each_active_iommu(iommu, drhd) {
3929 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3931 if (!iommu->iommu_state)
3937 for_each_active_iommu(iommu, drhd) {
3938 iommu_disable_translation(iommu);
3940 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3942 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3943 readl(iommu->reg + DMAR_FECTL_REG);
3944 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3945 readl(iommu->reg + DMAR_FEDATA_REG);
3946 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3947 readl(iommu->reg + DMAR_FEADDR_REG);
3948 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3949 readl(iommu->reg + DMAR_FEUADDR_REG);
3951 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3956 for_each_active_iommu(iommu, drhd)
3957 kfree(iommu->iommu_state);
/*
 * Syscore resume hook.
 *
 * The hardware is reprogrammed with init_iommu_hw(). If that fails, the
 * code either panics with a tboot message or raises a WARN (the condition
 * choosing between the two is not visible in this listing). Otherwise the
 * four fault-event registers saved by iommu_suspend() are written back
 * under register_lock for every active IOMMU, and the save areas are freed.
 */
3962 static void iommu_resume(void)
3964 struct dmar_drhd_unit *drhd;
3965 struct intel_iommu *iommu = NULL;
3968 if (init_iommu_hw()) {
3970 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3972 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3976 for_each_active_iommu(iommu, drhd) {
3978 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3980 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3981 iommu->reg + DMAR_FECTL_REG);
3982 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3983 iommu->reg + DMAR_FEDATA_REG);
3984 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3985 iommu->reg + DMAR_FEADDR_REG);
3986 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3987 iommu->reg + DMAR_FEUADDR_REG);
3989 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3992 for_each_active_iommu(iommu, drhd)
3993 kfree(iommu->iommu_state);
/* Syscore callbacks pairing iommu_suspend() with iommu_resume(). */
3996 static struct syscore_ops iommu_syscore_ops = {
3997 .resume = iommu_resume,
3998 .suspend = iommu_suspend,
/* Register iommu_syscore_ops so that the suspend/resume hooks get called. */
4001 static void __init init_iommu_pm_ops(void)
4003 register_syscore_ops(&iommu_syscore_ops);
/* Empty stand-in used when the suspend/resume code is compiled out. */
4007 static inline void init_iommu_pm_ops(void) {}
4008 #endif /* CONFIG_SUSPEND */
/*
 * Parse one RMRR structure from the DMAR table.
 *
 * A dmar_rmrr_unit is allocated, the header pointer and the base and end
 * addresses are recorded, and the device scope that follows the fixed part
 * of the structure (from rmrr + 1 up to header.length bytes from its start)
 * is parsed with dmar_alloc_dev_scope(). A non-zero device count with a NULL
 * device array is tested for as a failure. The unit is then added to
 * dmar_rmrr_units.
 *
 * @arg is not used in the lines visible here; the return values and the
 * cleanup on failure are not visible in this listing.
 */
4010 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4012 struct acpi_dmar_reserved_memory *rmrr;
4013 struct dmar_rmrr_unit *rmrru;
4015 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4019 rmrru->hdr = header;
4020 rmrr = (struct acpi_dmar_reserved_memory *)header;
4021 rmrru->base_address = rmrr->base_address;
4022 rmrru->end_address = rmrr->end_address;
4024 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4025 ((void *)rmrr) + rmrr->header.length,
4026 &rmrru->devices_cnt);
4027 if (rmrru->devices_cnt && rmrru->devices == NULL)
4030 list_add(&rmrru->list, &dmar_rmrr_units);
/*
 * Look up the registered ATSR unit that corresponds to @atsr.
 *
 * dmar_atsr_units is walked with the RCU list iterator. The segment number
 * and the header length are compared first, and only then the full
 * structure contents with memcmp(). The statements executed after each
 * comparison, and the return values, are not visible in this listing.
 */
4039 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4041 struct dmar_atsr_unit *atsru;
4042 struct acpi_dmar_atsr *tmp;
4044 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4045 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4046 if (atsr->segment != tmp->segment)
4048 if (atsr->header.length != tmp->header.length)
4050 if (memcmp(atsr, tmp, atsr->header.length) == 0)
/*
 * Register one ATSR structure.
 *
 * A test is made for the system being past SYSTEM_RUNNING with the Intel
 * IOMMU not enabled, and the structure is looked up among the already
 * registered units; the outcome of both tests is not visible in this
 * listing. A new unit is allocated with room for a private copy of the
 * ACPI structure directly behind it (atsru + 1), and the structure is
 * copied there. Bit 0 of the flags selects include_all; when it is clear
 * the device scope is parsed with dmar_alloc_dev_scope(). The unit is then
 * published on dmar_atsr_units with list_add_rcu().
 */
4057 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4059 struct acpi_dmar_atsr *atsr;
4060 struct dmar_atsr_unit *atsru;
4062 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4065 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4066 atsru = dmar_find_atsr(atsr);
4070 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4075 * If memory is allocated from slab by ACPI _DSM method, we need to
4076 * copy the memory content because the memory buffer will be freed
4079 atsru->hdr = (void *)(atsru + 1);
4080 memcpy(atsru->hdr, hdr, hdr->length);
4081 atsru->include_all = atsr->flags & 0x1;
4082 if (!atsru->include_all) {
4083 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4084 (void *)atsr + atsr->header.length,
4085 &atsru->devices_cnt);
4086 if (atsru->devices_cnt && atsru->devices == NULL) {
4092 list_add_rcu(&atsru->list, &dmar_atsr_units);
/*
 * Release the resources of an ATSR unit; the visible step frees its device
 * scope array and resets the count via dmar_free_dev_scope().
 */
4097 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4099 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
/*
 * Undo dmar_parse_one_atsr() for the ATSR structure behind @hdr: the matching
 * unit is looked up with dmar_find_atsr(), unlinked with list_del_rcu() and
 * handed to intel_iommu_free_atsr(). @arg is not referenced in the lines
 * shown here.
 */
4103 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4105 struct acpi_dmar_atsr *atsr;
4106 struct dmar_atsr_unit *atsru;
4108 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4109 atsru = dmar_find_atsr(atsr);
4111 list_del_rcu(&atsru->list);
4113 intel_iommu_free_atsr(atsru);
/*
 * Inspect the registered unit for the ATSR structure behind @hdr. For a unit
 * that is not include_all and has a non-empty device array, the active
 * device-scope entries are iterated.
 * NOTE(review): the loop body and return values are not visible here;
 * presumably this reports whether the unit still has devices attached.
 */
4119 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4123 struct acpi_dmar_atsr *atsr;
4124 struct dmar_atsr_unit *atsru;
4126 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4127 atsru = dmar_find_atsr(atsr);
4131 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4132 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
/*
 * Bring up the IOMMU of a DRHD unit (@dmaru).
 *
 * Visible sequence:
 *  - test whether g_iommus[] already has an entry for this seq_id;
 *  - capability checks against the global configuration: hardware
 *    pass-through, snooping and super-page support, each with a warning;
 *  - disable translation if the TE bit is set in iommu->gcmd;
 *  - publish the unit in g_iommus[], initialise its domains and allocate
 *    the root entry;
 *  - with CONFIG_INTEL_IOMMU_SVM and PASID support, call intel_svm_init();
 *  - for an ignored unit, disable the protected memory regions;
 *  - otherwise set up queued invalidation, flush the write buffer,
 *    optionally enable the page request queue, install the fault interrupt,
 *    program the root entry, issue global context/IOTLB invalidations,
 *    enable translation and disable the protected memory regions;
 *  - the trailing disable_dmar_iommu()/free_dmar_iommu() calls tear the
 *    unit down (presumably the error path - labels are not visible here).
 */
4140 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4143 struct intel_iommu *iommu = dmaru->iommu;
4145 if (g_iommus[iommu->seq_id])
4148 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4149 pr_warn("%s: Doesn't support hardware pass through.\n",
4153 if (!ecap_sc_support(iommu->ecap) &&
4154 domain_update_iommu_snooping(iommu)) {
4155 pr_warn("%s: Doesn't support snooping.\n",
/* sp is the zero-based super-page level required; negative means none. */
4159 sp = domain_update_iommu_superpage(iommu) - 1;
4160 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4161 pr_warn("%s: Doesn't support large page.\n",
4167 * Disable translation if already enabled prior to OS handover.
4169 if (iommu->gcmd & DMA_GCMD_TE)
4170 iommu_disable_translation(iommu);
4172 g_iommus[iommu->seq_id] = iommu;
4173 ret = iommu_init_domains(iommu);
4175 ret = iommu_alloc_root_entry(iommu);
4179 #ifdef CONFIG_INTEL_IOMMU_SVM
4180 if (pasid_supported(iommu))
4181 intel_svm_init(iommu);
4184 if (dmaru->ignored) {
4186 * we always have to disable PMRs or DMA may fail on this device
4189 iommu_disable_protect_mem_regions(iommu);
4193 intel_iommu_init_qi(iommu);
4194 iommu_flush_write_buffer(iommu);
4196 #ifdef CONFIG_INTEL_IOMMU_SVM
4197 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4198 ret = intel_svm_enable_prq(iommu);
4203 ret = dmar_set_interrupt(iommu);
/* Root table first, then global invalidations, then turn translation on. */
4207 iommu_set_root_entry(iommu);
4208 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4209 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4210 iommu_enable_translation(iommu);
4212 iommu_disable_protect_mem_regions(iommu);
4216 disable_dmar_iommu(iommu);
4218 free_dmar_iommu(iommu);
/*
 * Hot-add or hot-remove handling for a DRHD unit.
 *
 * @dmaru:  the DRHD unit whose IOMMU is affected.
 * @insert: direction of the hotplug event (how it selects the branches below
 *          is not visible in the lines shown here).
 *
 * intel_iommu_enabled is tested first. The visible branches either add the
 * unit through intel_iommu_add() or disable and free its IOMMU.
 */
4222 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4225 struct intel_iommu *iommu = dmaru->iommu;
4227 if (!intel_iommu_enabled)
4233 ret = intel_iommu_add(dmaru);
4235 disable_dmar_iommu(iommu);
4236 free_dmar_iommu(iommu);
/*
 * Drop every registered RMRR and ATSR unit. Both lists are walked with the
 * _safe iterator because entries are unlinked during the walk; RMRR units
 * have their device scope freed, ATSR units go through
 * intel_iommu_free_atsr().
 */
4242 static void intel_iommu_free_dmars(void)
4244 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4245 struct dmar_atsr_unit *atsru, *atsr_n;
4247 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4248 list_del(&rmrru->list);
4249 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4253 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4254 list_del(&atsru->list);
4255 intel_iommu_free_atsr(atsru);
/*
 * Decide whether an ATSR unit covers @dev.
 *
 * The device is first replaced by its physical function. The bus hierarchy
 * is then climbed from the device's bus towards the root; per the in-body
 * comments an integrated device is allowed ATS, a path through a non-PCIe
 * bridge (or a PCIe-to-PCI bridge) is not, and reaching a root port ends the
 * climb. Afterwards the ATSR units of the device's PCI segment are searched
 * for one whose device scope contains that bridge or that is include_all.
 * NOTE(review): the returned values are not visible in the lines shown here.
 */
4259 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4262 struct pci_bus *bus;
4263 struct pci_dev *bridge = NULL;
4265 struct acpi_dmar_atsr *atsr;
4266 struct dmar_atsr_unit *atsru;
4268 dev = pci_physfn(dev);
4269 for (bus = dev->bus; bus; bus = bus->parent) {
4271 /* If it's an integrated device, allow ATS */
4274 /* Connected via non-PCIe: no ATS */
4275 if (!pci_is_pcie(bridge) ||
4276 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4278 /* If we found the root port, look it up in the ATSR */
4279 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4284 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4285 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4286 if (atsr->segment != pci_domain_nr(dev->bus))
4289 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4290 if (tmp == &bridge->dev)
4293 if (atsru->include_all)
/*
 * Keep RMRR and ATSR device scopes in sync with PCI bus notifications.
 *
 * @info: notification describing the device and the bus event.
 *
 * After an initial test for "intel_iommu not enabled and system already
 * running", every RMRR unit is visited: on BUS_NOTIFY_ADD_DEVICE the device
 * is inserted into the unit's scope where it matches, on
 * BUS_NOTIFY_REMOVED_DEVICE it is removed. The same is done for ATSR units;
 * include_all units are tested separately before the scope is touched
 * (presumably skipped, since they carry no explicit scope - confirm).
 */
4303 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4306 struct dmar_rmrr_unit *rmrru;
4307 struct dmar_atsr_unit *atsru;
4308 struct acpi_dmar_atsr *atsr;
4309 struct acpi_dmar_reserved_memory *rmrr;
4311 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4314 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4315 rmrr = container_of(rmrru->hdr,
4316 struct acpi_dmar_reserved_memory, header);
4317 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4318 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4319 ((void *)rmrr) + rmrr->header.length,
4320 rmrr->segment, rmrru->devices,
4321 rmrru->devices_cnt);
4324 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4325 dmar_remove_dev_scope(info, rmrr->segment,
4326 rmrru->devices, rmrru->devices_cnt);
4330 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4331 if (atsru->include_all)
4334 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4335 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4336 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4337 (void *)atsr + atsr->header.length,
4338 atsr->segment, atsru->devices,
4339 atsru->devices_cnt);
4344 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4345 if (dmar_remove_dev_scope(info, atsr->segment,
4346 atsru->devices, atsru->devices_cnt))
/*
 * Memory hotplug notifier that keeps the static identity domain (si_domain)
 * in step with the set of online memory.
 *
 * @nb:  notifier block (not referenced in the lines shown here).
 * @val: memory event; MEM_GOING_ONLINE and MEM_CANCEL_ONLINE cases are
 *       visible.
 * @v:   struct memory_notify describing the affected PFN range.
 *
 * MEM_GOING_ONLINE: the byte range of the new memory is identity-mapped in
 * si_domain; failure is reported with a warning.
 * MEM_CANCEL_ONLINE: the range is walked IOVA by IOVA. Each IOVA is located,
 * split off and removed from the domain's IOVA tree, unmapped, flushed from
 * the IOTLB of every active IOMMU, and its page-table pages and IOVA
 * descriptor are freed.
 */
4354 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4355 unsigned long val, void *v)
4357 struct memory_notify *mhp = v;
4358 unsigned long long start, end;
4359 unsigned long start_vpfn, last_vpfn;
4362 case MEM_GOING_ONLINE:
/* Inclusive byte range [start, end] of the memory going online. */
4363 start = mhp->start_pfn << PAGE_SHIFT;
4364 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4365 if (iommu_domain_identity_map(si_domain, start, end)) {
4366 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4373 case MEM_CANCEL_ONLINE:
4374 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4375 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4376 while (start_vpfn <= last_vpfn) {
4378 struct dmar_drhd_unit *drhd;
4379 struct intel_iommu *iommu;
4380 struct page *freelist;
4382 iova = find_iova(&si_domain->iovad, start_vpfn);
4384 pr_debug("Failed get IOVA for PFN %lx\n",
4389 iova = split_and_remove_iova(&si_domain->iovad, iova,
4390 start_vpfn, last_vpfn);
4392 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4393 start_vpfn, last_vpfn);
4397 freelist = domain_unmap(si_domain, iova->pfn_lo,
4401 for_each_active_iommu(iommu, drhd)
4402 iommu_flush_iotlb_psi(iommu, si_domain,
4403 iova->pfn_lo, iova_size(iova),
4406 dma_free_pagelist(freelist);
/* Continue behind the IOVA just removed; read pfn_hi before freeing it. */
4408 start_vpfn = iova->pfn_hi + 1;
4409 free_iova_mem(iova);
/* Notifier block whose callback is intel_iommu_memory_notifier(). */
4417 static struct notifier_block intel_iommu_memory_nb = {
4418 .notifier_call = intel_iommu_memory_notifier,
/*
 * Release the IOVAs cached for @cpu in every domain of every IOMMU: each
 * slot of g_iommus[] is visited and, for each domain id below
 * cap_ndoms(), the domain is looked up and free_cpu_cached_iovas() is called
 * on its IOVA allocator.
 * NOTE(review): guards between the visible lines (e.g. for empty slots or
 * missing domains) are not shown here.
 */
4422 static void free_all_cpu_cached_iovas(unsigned int cpu)
4426 for (i = 0; i < g_num_of_iommus; i++) {
4427 struct intel_iommu *iommu = g_iommus[i];
4428 struct dmar_domain *domain;
4434 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4435 domain = get_iommu_domain(iommu, (u16)did);
4439 free_cpu_cached_iovas(cpu, &domain->iovad);
/*
 * CPU hotplug callback (installed for CPUHP_IOMMU_INTEL_DEAD by
 * intel_iommu_init()): frees the IOVAs cached for @cpu.
 */
4444 static int intel_iommu_cpu_dead(unsigned int cpu)
4446 free_all_cpu_cached_iovas(cpu);
/* Disable DMA translation on every IOMMU yielded by for_each_iommu(). */
4450 static void intel_disable_iommus(void)
4452 struct intel_iommu *iommu = NULL;
4453 struct dmar_drhd_unit *drhd;
4455 for_each_iommu(iommu, drhd)
4456 iommu_disable_translation(iommu);
/*
 * Map a sysfs struct device to its intel_iommu: the iommu_device obtained
 * from dev_to_iommu_device() is the `iommu` member embedded in
 * struct intel_iommu, so container_of() recovers the enclosing object.
 */
4459 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4461 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4463 return container_of(iommu_dev, struct intel_iommu, iommu);
/*
 * sysfs "version" attribute (read-only): reads DMAR_VER_REG and prints it as
 * "major:minor" followed by a newline. Returns the sprintf() result.
 */
4466 static ssize_t intel_iommu_show_version(struct device *dev,
4467 struct device_attribute *attr,
4470 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4471 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4472 return sprintf(buf, "%d:%d\n",
4473 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4475 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
/*
 * sysfs "address" attribute (read-only): prints iommu->reg_phys in hex.
 * Returns the sprintf() result.
 */
4477 static ssize_t intel_iommu_show_address(struct device *dev,
4478 struct device_attribute *attr,
4481 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4482 return sprintf(buf, "%llx\n", iommu->reg_phys);
4484 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
/*
 * sysfs "cap" attribute (read-only): prints the cached capability value
 * iommu->cap in hex. Returns the sprintf() result.
 */
4486 static ssize_t intel_iommu_show_cap(struct device *dev,
4487 struct device_attribute *attr,
4490 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4491 return sprintf(buf, "%llx\n", iommu->cap);
4493 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
/*
 * sysfs "ecap" attribute (read-only): prints the cached extended capability
 * value iommu->ecap in hex. Returns the sprintf() result.
 */
4495 static ssize_t intel_iommu_show_ecap(struct device *dev,
4496 struct device_attribute *attr,
4499 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4500 return sprintf(buf, "%llx\n", iommu->ecap);
4502 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
/*
 * sysfs "domains_supported" attribute (read-only): prints
 * cap_ndoms(iommu->cap) in decimal. Returns the sprintf() result.
 */
4504 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4505 struct device_attribute *attr,
4508 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4509 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4511 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
/*
 * sysfs "domains_used" attribute (read-only): prints the number of bits set
 * in iommu->domain_ids, counted over the first cap_ndoms() bits. Returns the
 * sprintf() result.
 */
4513 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4514 struct device_attribute *attr,
4517 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4518 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4519 cap_ndoms(iommu->cap)));
4521 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
/* Attribute list backing the "intel-iommu" sysfs group. */
4523 static struct attribute *intel_iommu_attrs[] = {
4524 &dev_attr_version.attr,
4525 &dev_attr_address.attr,
4527 &dev_attr_ecap.attr,
4528 &dev_attr_domains_supported.attr,
4529 &dev_attr_domains_used.attr,
/* sysfs attribute group named "intel-iommu" exposing intel_iommu_attrs. */
4533 static struct attribute_group intel_iommu_group = {
4534 .name = "intel-iommu",
4535 .attrs = intel_iommu_attrs,
/*
 * Non-static array of attribute groups for the IOMMU sysfs device.
 * NOTE(review): the initializer entries are not visible here; presumably
 * &intel_iommu_group followed by a NULL terminator.
 */
4538 const struct attribute_group *intel_iommu_groups[] = {
/*
 * Decide at init time whether the platform opt-in requires the IOMMU to be
 * enabled.
 *
 * The function tests dmar_platform_optin() and the no_platform_optin
 * override, then scans all PCI devices for one marked untrusted. When such a
 * device exists and the IOMMU was disabled (no_iommu or dmar_disabled), a
 * message announces the forced enable; IDENTMAP_ALL is OR-ed into
 * iommu_identity_mapping as described by the in-body comment.
 * NOTE(review): return values and the contents of the CONFIG_X86/SWIOTLB
 * section are not visible in the lines shown here.
 */
4543 static int __init platform_optin_force_iommu(void)
4545 struct pci_dev *pdev = NULL;
4546 bool has_untrusted_dev = false;
4548 if (!dmar_platform_optin() || no_platform_optin)
4551 for_each_pci_dev(pdev) {
4552 if (pdev->untrusted) {
4553 has_untrusted_dev = true;
4558 if (!has_untrusted_dev)
4561 if (no_iommu || dmar_disabled)
4562 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4565 * If Intel-IOMMU is disabled by default, we will apply identity
4566 * map for all devices except those marked as being untrusted.
4569 iommu_identity_mapping |= IDENTMAP_ALL;
4572 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
/*
 * Probe the physical devices behind ACPI namespace devices listed in DRHD
 * device scopes.
 *
 * For every active IOMMU, the active scope entries that sit on the ACPI bus
 * are converted to acpi_device. With the device's physical_node_lock held,
 * each physical node is examined: iommu_group_get() is called (a reference
 * obtained there is dropped again), the node's bus gets intel_iommu_ops
 * installed and iommu_probe_device() is run on the node's device.
 * NOTE(review): the conditions between these calls (e.g. skipping nodes that
 * already have a group) and the error handling are not visible here.
 */
4580 static int __init probe_acpi_namespace_devices(void)
4582 struct dmar_drhd_unit *drhd;
4583 /* To avoid a -Wunused-but-set-variable warning. */
4584 struct intel_iommu *iommu __maybe_unused;
4588 for_each_active_iommu(iommu, drhd) {
4589 for_each_active_dev_scope(drhd->devices,
4590 drhd->devices_cnt, i, dev) {
4591 struct acpi_device_physical_node *pn;
4592 struct iommu_group *group;
4593 struct acpi_device *adev;
4595 if (dev->bus != &acpi_bus_type)
4598 adev = to_acpi_device(dev);
4599 mutex_lock(&adev->physical_node_lock);
4600 list_for_each_entry(pn,
4601 &adev->physical_node_list, node) {
4602 group = iommu_group_get(pn->dev);
4604 iommu_group_put(group);
4608 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4609 ret = iommu_probe_device(pn->dev);
4613 mutex_unlock(&adev->physical_node_lock);
/*
 * Driver entry point: discover and initialise all Intel IOMMUs.
 *
 * Visible sequence:
 *  1. force_on is set when tboot or the platform opt-in demands an IOMMU;
 *     several failures below panic with a "tboot:" message (the guarding
 *     conditions are not visible in the lines shown here).
 *  2. The memory pools are created, then - under dmar_global_lock held for
 *     writing - the DMAR table and the device scopes are initialised.
 *  3. The lock is dropped around dmar_register_bus_notifier() because the
 *     notifier itself takes dmar_global_lock (see the lockdep comment).
 *  4. With the IOMMU disabled by configuration, protected memory regions
 *     are disabled when intel_iommu_tboot_noforce is set and all IOMMUs are
 *     switched off via intel_disable_iommus().
 *  5. Otherwise reserved IOVA ranges are set up, devices that need no
 *     remapping are identified, and the DMAR units are initialised.
 *  6. After releasing the lock: intel_dma_ops is installed as dma_ops, PM
 *     hooks are registered, every active IOMMU is added to sysfs and
 *     registered with the IOMMU core, intel_iommu_ops is set for the PCI
 *     bus, the memory notifier is registered when si_domain exists without
 *     hardware pass-through, and the CPU-dead hotplug callback is installed.
 *  7. ACPI namespace devices are probed under the read lock.
 *  8. Translation is enabled on every unit that is neither ignored nor
 *     pre-enabled, and protected memory regions are disabled.
 *  9. intel_iommu_enabled is set and debugfs is initialised.
 *
 * The trailing lines starting at out_free_reserved_range form the failure
 * path: they release the reserved IOVA domain, the RMRR/ATSR units, the lock
 * and the memory pools.
 */
4623 int __init intel_iommu_init(void)
4626 struct dmar_drhd_unit *drhd;
4627 struct intel_iommu *iommu;
4630 * Intel IOMMU is required for a TXT/tboot launch or platform
4631 * opt in, so enforce that.
4633 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4635 if (iommu_init_mempool()) {
4637 panic("tboot: Failed to initialize iommu memory\n");
4641 down_write(&dmar_global_lock);
4642 if (dmar_table_init()) {
4644 panic("tboot: Failed to initialize DMAR table\n");
4648 if (dmar_dev_scope_init() < 0) {
4650 panic("tboot: Failed to initialize DMAR device scope\n");
4654 up_write(&dmar_global_lock);
4657 * The bus notifier takes the dmar_global_lock, so lockdep will
4658 * complain later when we register it under the lock.
4660 dmar_register_bus_notifier();
4662 down_write(&dmar_global_lock);
4664 if (no_iommu || dmar_disabled) {
4666 * We exit the function here to ensure IOMMU's remapping and
4667 * mempool aren't setup, which means that the IOMMU's PMRs
4668 * won't be disabled via the call to init_dmars(). So disable
4669 * it explicitly here. The PMRs were setup by tboot prior to
4670 * calling SENTER, but the kernel is expected to reset/tear
4673 if (intel_iommu_tboot_noforce) {
4674 for_each_iommu(iommu, drhd)
4675 iommu_disable_protect_mem_regions(iommu);
4679 * Make sure the IOMMUs are switched off, even when we
4680 * boot into a kexec kernel and the previous kernel left
4683 intel_disable_iommus();
4687 if (list_empty(&dmar_rmrr_units))
4688 pr_info("No RMRR found\n");
4690 if (list_empty(&dmar_atsr_units))
4691 pr_info("No ATSR found\n");
4693 if (dmar_init_reserved_ranges()) {
4695 panic("tboot: Failed to reserve iommu ranges\n");
4696 goto out_free_reserved_range;
4700 intel_iommu_gfx_mapped = 1;
4702 init_no_remapping_devices();
4707 panic("tboot: Failed to initialize DMARs\n");
4708 pr_err("Initialization failed\n");
4709 goto out_free_reserved_range;
4711 up_write(&dmar_global_lock);
4713 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4716 dma_ops = &intel_dma_ops;
4718 init_iommu_pm_ops();
/* Expose each active unit in sysfs and register it with the IOMMU core. */
4720 for_each_active_iommu(iommu, drhd) {
4721 iommu_device_sysfs_add(&iommu->iommu, NULL,
4724 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4725 iommu_device_register(&iommu->iommu);
4728 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4729 if (si_domain && !hw_pass_through)
4730 register_memory_notifier(&intel_iommu_memory_nb);
4731 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4732 intel_iommu_cpu_dead);
4734 down_read(&dmar_global_lock);
4735 if (probe_acpi_namespace_devices())
4736 pr_warn("ACPI name space devices didn't probe correctly\n");
4737 up_read(&dmar_global_lock);
4739 /* Finally, we enable the DMA remapping hardware. */
4740 for_each_iommu(iommu, drhd) {
4741 if (!drhd->ignored && !translation_pre_enabled(iommu))
4742 iommu_enable_translation(iommu);
4744 iommu_disable_protect_mem_regions(iommu);
4746 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4748 intel_iommu_enabled = 1;
4749 intel_iommu_debugfs_init();
4753 out_free_reserved_range:
4754 put_iova_domain(&reserved_iova_list);
4756 intel_iommu_free_dmars();
4757 up_write(&dmar_global_lock);
4758 iommu_exit_mempool();
/*
 * Detach the device described by @info from its domain and free @info.
 *
 * Must be called with device_domain_lock held (asserted below).
 *
 * Visible steps: for a PCI device on a scalable-mode IOMMU the PASID entry
 * is torn down; the device IOTLB is disabled, the context entry for
 * bus/devfn is cleared and the PASID table is freed; @info is unlinked from
 * the domain; the domain's reference on the IOMMU is dropped under
 * iommu->lock. A domain flagged DOMAIN_FLAG_LOSE_CHILDREN that is not the
 * static identity domain and has no devices left is destroyed. Finally the
 * device_domain_info itself is freed.
 */
4762 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4764 struct dmar_domain *domain;
4765 struct intel_iommu *iommu;
4766 unsigned long flags;
4768 assert_spin_locked(&device_domain_lock);
4773 iommu = info->iommu;
4774 domain = info->domain;
4777 if (dev_is_pci(info->dev) && sm_supported(iommu))
4778 intel_pasid_tear_down_entry(iommu, info->dev,
4781 iommu_disable_dev_iotlb(info);
4782 domain_context_clear_one(iommu, info->bus, info->devfn);
4783 intel_pasid_free_table(info->dev);
4786 unlink_domain_info(info);
4788 spin_lock_irqsave(&iommu->lock, flags);
4789 domain_detach_iommu(domain, iommu);
4790 spin_unlock_irqrestore(&iommu->lock, flags);
4792 /* free the private domain */
4793 if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4794 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4795 list_empty(&domain->devices))
4796 domain_exit(info->domain);
4798 free_devinfo_mem(info);
/*
 * Locked wrapper around __dmar_remove_one_dev_info(): takes
 * device_domain_lock (IRQ-safe), fetches the device's domain info from
 * dev->archdata.iommu and removes it.
 */
4801 static void dmar_remove_one_dev_info(struct device *dev)
4803 struct device_domain_info *info;
4804 unsigned long flags;
4806 spin_lock_irqsave(&device_domain_lock, flags);
4807 info = dev->archdata.iommu;
4809 __dmar_remove_one_dev_info(info);
4810 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Initialise a freshly allocated dmar_domain.
 *
 * @domain:      domain to set up.
 * @guest_width: guest address width in bits; stored as domain->gaw.
 *
 * Sets up the IOVA allocator with VT-d page granularity, reserves the
 * special ranges, derives the adjusted guest address width (agaw) from
 * @guest_width, clears the coherency/snooping/superpage capabilities and
 * max_addr, allocates the top-level page directory on the domain's NUMA node
 * and flushes it from the CPU cache.
 * NOTE(review): the allocation-failure check and return values are not
 * visible in the lines shown here.
 */
4813 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4817 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4818 domain_reserve_special_ranges(domain);
4820 /* calculate AGAW */
4821 domain->gaw = guest_width;
4822 adjust_width = guestwidth_to_adjustwidth(guest_width);
4823 domain->agaw = width_to_agaw(adjust_width);
4825 domain->iommu_coherency = 0;
4826 domain->iommu_snooping = 0;
4827 domain->iommu_superpage = 0;
4828 domain->max_addr = 0;
4830 /* always allocate the top pgd */
4831 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4834 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
/*
 * iommu_ops->domain_alloc: create a domain of the requested @type.
 *
 * IOMMU_DOMAIN_DMA and IOMMU_DOMAIN_UNMANAGED share one path: a dmar_domain
 * is allocated and initialised with DEFAULT_DOMAIN_ADDRESS_WIDTH (it is
 * destroyed again if initialisation fails). For DMA domains an IOVA flush
 * queue is set up; if that fails a warning is printed and
 * intel_iommu_strict is set to 1. Capabilities are refreshed and the
 * aperture is set to [0, __DOMAIN_MAX_ADDR(gaw)] with force_aperture.
 *
 * IOMMU_DOMAIN_IDENTITY returns the shared si_domain's embedded
 * iommu_domain.
 */
4838 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4840 struct dmar_domain *dmar_domain;
4841 struct iommu_domain *domain;
4844 case IOMMU_DOMAIN_DMA:
4846 case IOMMU_DOMAIN_UNMANAGED:
4847 dmar_domain = alloc_domain(0);
4849 pr_err("Can't allocate dmar_domain\n");
4852 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4853 pr_err("Domain initialization failed\n");
4854 domain_exit(dmar_domain);
4858 if (type == IOMMU_DOMAIN_DMA &&
4859 init_iova_flush_queue(&dmar_domain->iovad,
4860 iommu_flush_iova, iova_entry_free)) {
4861 pr_warn("iova flush queue initialization failed\n");
4862 intel_iommu_strict = 1;
4865 domain_update_iommu_cap(dmar_domain);
4867 domain = &dmar_domain->domain;
4868 domain->geometry.aperture_start = 0;
4869 domain->geometry.aperture_end =
4870 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4871 domain->geometry.force_aperture = true;
4874 case IOMMU_DOMAIN_IDENTITY:
4875 return &si_domain->domain;
/*
 * iommu_ops->domain_free: destroy @domain unless it is the shared static
 * identity domain (si_domain), which is never freed through this path.
 */
4883 static void intel_iommu_domain_free(struct iommu_domain *domain)
4885 if (domain != &si_domain->domain)
4886 domain_exit(to_dmar_domain(domain));
4890 * Check whether a @domain could be attached to the @dev through the
4891 * aux-domain attach/detach APIs.
/*
 * True when @dev has domain info with auxd_enabled set and @domain is of
 * type IOMMU_DOMAIN_UNMANAGED.
 */
4894 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4896 struct device_domain_info *info = dev->archdata.iommu;
4898 return info && info->auxd_enabled &&
4899 domain->type == IOMMU_DOMAIN_UNMANAGED;
/*
 * Record that @domain is attached to a device as an auxiliary domain: bumps
 * the domain's auxd_refcnt and links it into the device's
 * auxiliary_domains list. Requires device_domain_lock (asserted below).
 */
4902 static void auxiliary_link_device(struct dmar_domain *domain,
4905 struct device_domain_info *info = dev->archdata.iommu;
4907 assert_spin_locked(&device_domain_lock);
4911 domain->auxd_refcnt++;
4912 list_add(&domain->auxd, &info->auxiliary_domains);
/*
 * Reverse of auxiliary_link_device(): unlinks @domain from the auxiliary
 * domain list and drops auxd_refcnt. When the count reaches zero and the
 * domain holds a default PASID (> 0), that PASID id is released. Requires
 * device_domain_lock (asserted below).
 */
4915 static void auxiliary_unlink_device(struct dmar_domain *domain,
4918 struct device_domain_info *info = dev->archdata.iommu;
4920 assert_spin_locked(&device_domain_lock);
4924 list_del(&domain->auxd);
4925 domain->auxd_refcnt--;
4927 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4928 intel_pasid_free_id(domain->default_pasid);
/*
 * Attach @domain to a device as an auxiliary domain.
 *
 * Visible steps: the IOMMU serving the device is looked up; if the domain
 * has no default PASID yet (<= 0) one is allocated in the range starting at
 * PASID_MIN and bounded by the device's pci_max_pasids(). Under
 * device_domain_lock and iommu->lock the domain is attached to the IOMMU and
 * a second-level PASID entry is set up for the default PASID; afterwards
 * the domain is linked to the device with auxiliary_link_device().
 *
 * The trailing lines are the failure path: the domain is detached from the
 * IOMMU, both locks are released, and the default PASID is freed when no
 * auxiliary attachment references the domain.
 */
4931 static int aux_domain_add_dev(struct dmar_domain *domain,
4936 unsigned long flags;
4937 struct intel_iommu *iommu;
4939 iommu = device_to_iommu(dev, &bus, &devfn);
4943 if (domain->default_pasid <= 0) {
4946 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4947 pci_max_pasids(to_pci_dev(dev)),
4950 pr_err("Can't allocate default pasid\n");
4953 domain->default_pasid = pasid;
4956 spin_lock_irqsave(&device_domain_lock, flags);
4958 * iommu->lock must be held to attach domain to iommu and setup the
4959 * pasid entry for second level translation.
4961 spin_lock(&iommu->lock);
4962 ret = domain_attach_iommu(domain, iommu);
4966 /* Setup the PASID entry for mediated devices: */
4967 ret = intel_pasid_setup_second_level(iommu, domain, dev,
4968 domain->default_pasid);
4971 spin_unlock(&iommu->lock);
4973 auxiliary_link_device(domain, dev);
4975 spin_unlock_irqrestore(&device_domain_lock, flags);
4980 domain_detach_iommu(domain, iommu);
4982 spin_unlock(&iommu->lock);
4983 spin_unlock_irqrestore(&device_domain_lock, flags);
4984 if (!domain->auxd_refcnt && domain->default_pasid > 0)
4985 intel_pasid_free_id(domain->default_pasid);
/*
 * Detach an auxiliary @domain from a device.
 *
 * The is_aux_domain() test comes first. Under device_domain_lock the domain
 * is unlinked from the device's auxiliary list; then, with iommu->lock also
 * held, the PASID entry for the domain's default PASID is torn down and the
 * domain is detached from the IOMMU.
 */
4990 static void aux_domain_remove_dev(struct dmar_domain *domain,
4993 struct device_domain_info *info;
4994 struct intel_iommu *iommu;
4995 unsigned long flags;
4997 if (!is_aux_domain(dev, &domain->domain))
5000 spin_lock_irqsave(&device_domain_lock, flags);
5001 info = dev->archdata.iommu;
5002 iommu = info->iommu;
5004 auxiliary_unlink_device(domain, dev);
5006 spin_lock(&iommu->lock);
5007 intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5008 domain_detach_iommu(domain, iommu);
5009 spin_unlock(&iommu->lock);
5011 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Make @domain compatible with the IOMMU that serves the device before it
 * is attached.
 *
 * The usable address width is the IOMMU's agaw width, capped at
 * cap_mgaw(). If the domain already maps an address beyond 1 << width, an
 * error is logged. Otherwise the width becomes the domain's gaw, and while
 * the domain's page table has more levels than the IOMMU supports
 * (iommu->agaw < domain agaw) the top level is removed: if its first entry
 * is present the table it points to becomes the new pgd and the old top
 * page is freed.
 */
5014 static int prepare_domain_attach_device(struct iommu_domain *domain,
5017 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5018 struct intel_iommu *iommu;
5022 iommu = device_to_iommu(dev, &bus, &devfn);
5026 /* check if this iommu agaw is sufficient for max mapped address */
5027 addr_width = agaw_to_width(iommu->agaw);
5028 if (addr_width > cap_mgaw(iommu->cap))
5029 addr_width = cap_mgaw(iommu->cap);
5031 if (dmar_domain->max_addr > (1LL << addr_width)) {
5032 dev_err(dev, "%s: iommu width (%d) is not "
5033 "sufficient for the mapped address (%llx)\n",
5034 __func__, addr_width, dmar_domain->max_addr);
5037 dmar_domain->gaw = addr_width;
5040 * Knock out extra levels of page tables if necessary
5042 while (iommu->agaw < dmar_domain->agaw) {
5043 struct dma_pte *pte;
5045 pte = dmar_domain->pgd;
5046 if (dma_pte_present(pte)) {
5047 dmar_domain->pgd = (struct dma_pte *)
5048 phys_to_virt(dma_pte_addr(pte));
5049 free_pgtable_page(pte);
5051 dmar_domain->agaw--;
/*
 * iommu_ops->attach_dev: attach a device to @domain as its primary domain.
 *
 * An unmanaged domain is checked against RMRR-locked devices (with the
 * warning below), and is_aux_domain() is tested. If the device already has a
 * context mapping, its current domain is looked up and the old attachment is
 * removed. The domain is then adjusted to the IOMMU with
 * prepare_domain_attach_device() and the device is added with
 * domain_add_dev_info(), whose result is returned.
 */
5057 static int intel_iommu_attach_device(struct iommu_domain *domain,
5062 if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5063 device_is_rmrr_locked(dev)) {
5064 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
5068 if (is_aux_domain(dev, domain))
5071 /* normally dev is not mapped */
5072 if (unlikely(domain_context_mapped(dev))) {
5073 struct dmar_domain *old_domain;
5075 old_domain = find_domain(dev);
5077 dmar_remove_one_dev_info(dev);
5080 ret = prepare_domain_attach_device(domain, dev);
5084 return domain_add_dev_info(to_dmar_domain(domain), dev);
/*
 * iommu_ops->aux_attach_dev: attach @domain to a device as an auxiliary
 * domain. The domain is checked with is_aux_domain(), adjusted with
 * prepare_domain_attach_device(), and attached via aux_domain_add_dev(),
 * whose result is returned.
 */
5087 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5092 if (!is_aux_domain(dev, domain))
5095 ret = prepare_domain_attach_device(domain, dev);
5099 return aux_domain_add_dev(to_dmar_domain(domain), dev);
/*
 * iommu_ops->detach_dev: remove the device's domain info via
 * dmar_remove_one_dev_info(); @domain itself is not consulted.
 */
5102 static void intel_iommu_detach_device(struct iommu_domain *domain,
5105 dmar_remove_one_dev_info(dev);
/* iommu_ops->aux_detach_dev: thin wrapper around aux_domain_remove_dev(). */
5108 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5111 aux_domain_remove_dev(to_dmar_domain(domain), dev);
/*
 * iommu_ops->map: map @size bytes at @iova to physical address @hpa.
 *
 * @iommu_prot: IOMMU_READ / IOMMU_WRITE / IOMMU_CACHE flags, translated to
 *              DMA_PTE_READ / DMA_PTE_WRITE and, only when the domain
 *              supports snooping, DMA_PTE_SNP.
 *
 * Domains flagged DOMAIN_FLAG_LOSE_CHILDREN are tested first. If the mapping
 * extends past the domain's current max_addr, the new end is checked against
 * the domain's address width (error logged when it does not fit) and
 * max_addr is raised. The size is converted to a page count that accounts
 * for the sub-page offset of @hpa, and the PFN range is mapped with
 * domain_pfn_mapping().
 */
5114 static int intel_iommu_map(struct iommu_domain *domain,
5115 unsigned long iova, phys_addr_t hpa,
5116 size_t size, int iommu_prot)
5118 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5123 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5126 if (iommu_prot & IOMMU_READ)
5127 prot |= DMA_PTE_READ;
5128 if (iommu_prot & IOMMU_WRITE)
5129 prot |= DMA_PTE_WRITE;
5130 if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5131 prot |= DMA_PTE_SNP;
5133 max_addr = iova + size;
5134 if (dmar_domain->max_addr < max_addr) {
5137 /* check if minimum agaw is sufficient for mapped address */
5138 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5139 if (end < max_addr) {
5140 pr_err("%s: iommu width (%d) is not "
5141 "sufficient for the mapped address (%llx)\n",
5142 __func__, dmar_domain->gaw, max_addr);
5145 dmar_domain->max_addr = max_addr;
5147 /* Round up size to next multiple of PAGE_SIZE, if it and
5148 the low bits of hpa would take us onto the next page */
5149 size = aligned_nrpages(hpa, size);
5150 ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5151 hpa >> VTD_PAGE_SHIFT, size, prot);
/*
 * iommu_ops->unmap: remove the mapping covering @iova.
 *
 * The PTE for @iova is looked up first (BUG if there is none); the lookup
 * also yields the page-table level of the mapping. If @size is smaller than
 * the page size of that level it is enlarged to it, so a large-page mapping
 * is always removed as a whole. The PFN range is unmapped, the IOTLB of
 * every IOMMU the domain is attached to is flushed for that range, and the
 * freed page-table pages are released. When the unmapped range ended
 * exactly at max_addr, max_addr is lowered to @iova.
 */
5155 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5156 unsigned long iova, size_t size)
5158 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5159 struct page *freelist = NULL;
5160 unsigned long start_pfn, last_pfn;
5161 unsigned int npages;
5162 int iommu_id, level = 0;
5164 /* Cope with horrid API which requires us to unmap more than the
5165 size argument if it happens to be a large-page mapping. */
5166 BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5167 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5170 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5171 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5173 start_pfn = iova >> VTD_PAGE_SHIFT;
5174 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5176 freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5178 npages = last_pfn - start_pfn + 1;
5180 for_each_domain_iommu(iommu_id, dmar_domain)
5181 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5182 start_pfn, npages, !freelist, 0);
5184 dma_free_pagelist(freelist);
5186 if (dmar_domain->max_addr == iova + size)
5187 dmar_domain->max_addr = iova;
/*
 * iommu_ops->iova_to_phys: translate an IOVA through the domain's page
 * table. Domains flagged DOMAIN_FLAG_LOSE_CHILDREN are tested first; the
 * PTE for the IOVA's page frame is then looked up and its address taken as
 * the result.
 * NOTE(review): the handling of a missing PTE is not visible here.
 */
5192 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5195 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5196 struct dma_pte *pte;
5200 if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5203 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5205 phys = dma_pte_addr(pte);
/*
 * Check scalable-mode support across all active IOMMUs by testing
 * sm_supported() on each.
 * NOTE(review): presumably returns false as soon as one unit lacks support;
 * the return statements are not visible here.
 */
5210 static inline bool scalable_mode_support(void)
5212 struct dmar_drhd_unit *drhd;
5213 struct intel_iommu *iommu;
5217 for_each_active_iommu(iommu, drhd) {
5218 if (!sm_supported(iommu)) {
/*
 * Check PASID support across all active IOMMUs by testing
 * pasid_supported() on each.
 * NOTE(review): presumably returns false as soon as one unit lacks support;
 * the return statements are not visible here.
 */
5228 static inline bool iommu_pasid_support(void)
5230 struct dmar_drhd_unit *drhd;
5231 struct intel_iommu *iommu;
5235 for_each_active_iommu(iommu, drhd) {
5236 if (!pasid_supported(iommu)) {
/*
 * iommu_ops->capable: report driver-wide capabilities.
 *  - IOMMU_CAP_CACHE_COHERENCY: true when domain_update_iommu_snooping(NULL)
 *    returns 1.
 *  - IOMMU_CAP_INTR_REMAP: true when irq_remapping_enabled is 1.
 */
5246 static bool intel_iommu_capable(enum iommu_cap cap)
5248 if (cap == IOMMU_CAP_CACHE_COHERENCY)
5249 return domain_update_iommu_snooping(NULL) == 1;
5250 if (cap == IOMMU_CAP_INTR_REMAP)
5251 return irq_remapping_enabled == 1;
/*
 * iommu_ops->add_device: set up IOMMU state for a newly discovered device.
 *
 * The device is linked to its IOMMU in sysfs. If translation was already
 * enabled before this kernel took over, the device is marked with
 * DEFER_DEVICE_DOMAIN_INFO. An IOMMU group is obtained (error pointer
 * returned on failure) and the reference dropped again.
 *
 * The group's default domain type is then reconciled with what the device
 * needs (device_def_domain_type()):
 *  - DMA default domain but identity required: an identity ("dm") domain is
 *    requested for the device; the visible fallback removes the device
 *    info, flags the old domain DOMAIN_FLAG_LOSE_CHILDREN and attaches the
 *    device to si_domain.
 *  - the reverse case requests a DMA domain; the visible fallback removes
 *    the device info, flags the old domain and obtains a private domain.
 * NOTE(review): the conditions selecting the fallbacks are not visible in
 * the lines shown here.
 */
5256 static int intel_iommu_add_device(struct device *dev)
5258 struct dmar_domain *dmar_domain;
5259 struct iommu_domain *domain;
5260 struct intel_iommu *iommu;
5261 struct iommu_group *group;
5265 iommu = device_to_iommu(dev, &bus, &devfn);
5269 iommu_device_link(&iommu->iommu, dev);
5271 if (translation_pre_enabled(iommu))
5272 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5274 group = iommu_group_get_for_dev(dev);
5277 return PTR_ERR(group);
5279 iommu_group_put(group);
5281 domain = iommu_get_domain_for_dev(dev);
5282 dmar_domain = to_dmar_domain(domain);
5283 if (domain->type == IOMMU_DOMAIN_DMA) {
5284 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5285 ret = iommu_request_dm_for_dev(dev);
5287 dmar_remove_one_dev_info(dev);
5288 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5289 domain_add_dev_info(si_domain, dev);
5291 "Device uses a private identity domain.\n");
5295 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5296 ret = iommu_request_dma_domain_for_dev(dev);
5298 dmar_remove_one_dev_info(dev);
5299 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5300 if (!get_private_domain_for_dev(dev)) {
5302 "Failed to get a private domain.\n");
5307 "Device uses a private dma domain.\n");
/*
 * iommu_ops->remove_device: undo intel_iommu_add_device(). Looks up the
 * device's IOMMU, removes the device's domain info, takes the device out of
 * its IOMMU group and removes the sysfs link to the IOMMU.
 */
5315 static void intel_iommu_remove_device(struct device *dev)
5317 struct intel_iommu *iommu;
5320 iommu = device_to_iommu(dev, &bus, &devfn);
5324 dmar_remove_one_dev_info(dev);
5326 iommu_group_remove_device(dev);
5328 iommu_device_unlink(&iommu->iommu, dev);
/*
 * iommu_ops->get_resv_regions: append the reserved IOVA regions that apply
 * to @device to the list @head.
 *
 * Three sources are visible:
 *  - RMRRs: under dmar_global_lock (read), every RMRR unit whose active
 *    device scope contains @device, or a PCI bridge @device sits behind,
 *    contributes a region [base_address, end_address] with read/write
 *    protection. The type is IOMMU_RESV_DIRECT_RELAXABLE when
 *    device_rmrr_is_relaxable() says so, IOMMU_RESV_DIRECT otherwise.
 *  - With CONFIG_INTEL_IOMMU_FLOPPY_WA, a PCI ISA bridge additionally gets a
 *    region covering the first 16 MiB (1UL << 24).
 *  - The IOAPIC range [IOAPIC_RANGE_START, IOAPIC_RANGE_END].
 *
 * The two list_add_tail() calls on `reg` take the address of the list node
 * embedded in the region (&reg->list); an earlier revision of this text had
 * that expression corrupted into a registered-trademark sign.
 */
5331 static void intel_iommu_get_resv_regions(struct device *device,
5332 struct list_head *head)
5334 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5335 struct iommu_resv_region *reg;
5336 struct dmar_rmrr_unit *rmrr;
5337 struct device *i_dev;
5340 down_read(&dmar_global_lock);
5341 for_each_rmrr_units(rmrr) {
5342 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5344 struct iommu_resv_region *resv;
5345 enum iommu_resv_type type;
5348 if (i_dev != device &&
5349 !is_downstream_to_pci_bridge(device, i_dev))
/* Both addresses are inclusive, hence the +1. */
5352 length = rmrr->end_address - rmrr->base_address + 1;
5354 type = device_rmrr_is_relaxable(device) ?
5355 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5357 resv = iommu_alloc_resv_region(rmrr->base_address,
5358 length, prot, type);
5362 list_add_tail(&resv->list, head);
5365 up_read(&dmar_global_lock);
5367 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5368 if (dev_is_pci(device)) {
5369 struct pci_dev *pdev = to_pci_dev(device);
5371 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5372 reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5375 list_add_tail(&reg->list, head);
5378 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5380 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5381 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5385 list_add_tail(&reg->list, head);
/*
 * iommu_ops->put_resv_regions: walk the region list @head with the _safe
 * iterator, which tolerates entries being released during the walk.
 * NOTE(review): the per-entry action is not visible here; presumably each
 * entry is freed.
 */
5388 static void intel_iommu_put_resv_regions(struct device *dev,
5389 struct list_head *head)
5391 struct iommu_resv_region *entry, *next;
5393 list_for_each_entry_safe(entry, next, head, list)
/*
 * Enable PASID handling for @dev on @iommu.
 *
 * The device's domain is looked up first. Under device_domain_lock and
 * iommu->lock the device's domain info is checked for PASID support and its
 * context entry is fetched. If the CONTEXT_PASIDE bit is not yet set in the
 * low word of the context entry it is set, and a device-selective context
 * cache invalidation is issued for the device's domain id and source id.
 * Finally, if PASID is not yet enabled in the device itself,
 * iommu_enable_dev_iotlb() is called.
 * NOTE(review): return values are not visible in the lines shown here.
 */
5397 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5399 struct device_domain_info *info;
5400 struct context_entry *context;
5401 struct dmar_domain *domain;
5402 unsigned long flags;
5406 domain = find_domain(dev);
5410 spin_lock_irqsave(&device_domain_lock, flags);
5411 spin_lock(&iommu->lock);
5414 info = dev->archdata.iommu;
5415 if (!info || !info->pasid_supported)
5418 context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5419 if (WARN_ON(!context))
5422 ctx_lo = context[0].lo;
5424 if (!(ctx_lo & CONTEXT_PASIDE)) {
5425 ctx_lo |= CONTEXT_PASIDE;
5426 context[0].lo = ctx_lo;
5428 iommu->flush.flush_context(iommu,
5429 domain->iommu_did[iommu->seq_id],
5430 PCI_DEVID(info->bus, info->devfn),
5431 DMA_CCMD_MASK_NOBIT,
5432 DMA_CCMD_DEVICE_INVL);
5435 /* Enable PASID support in the device, if it wasn't already */
5436 if (!info->pasid_enabled)
5437 iommu_enable_dev_iotlb(info);
5442 spin_unlock(&iommu->lock);
5443 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * iommu_ops->apply_resv_region: reserve the IOVA page-frame range covering
 * @region (start through start + length - 1, inclusive) in the domain's
 * IOVA allocator. A failed reservation triggers a one-time warning.
 */
5448 static void intel_iommu_apply_resv_region(struct device *dev,
5449 struct iommu_domain *domain,
5450 struct iommu_resv_region *region)
5452 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5453 unsigned long start, end;
5455 start = IOVA_PFN(region->start);
5456 end = IOVA_PFN(region->start + region->length - 1);
5458 WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5461 #ifdef CONFIG_INTEL_IOMMU_SVM
/*
 * Find the IOMMU that can provide SVM for @dev.
 *
 * A device for which iommu_dummy() is true gets no translation and is
 * rejected with a warning; otherwise the IOMMU is looked up with
 * device_to_iommu() and an error is logged when there is none.
 * NOTE(review): return values are not visible here; presumably the IOMMU
 * on success and NULL otherwise.
 */
5462 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5464 struct intel_iommu *iommu;
5467 if (iommu_dummy(dev)) {
5469 "No IOMMU translation for device; cannot enable SVM\n");
5473 iommu = device_to_iommu(dev, &bus, &devfn);
5475 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5481 #endif /* CONFIG_INTEL_IOMMU_SVM */
/*
 * Enable auxiliary-domain support for @dev.
 *
 * Requires an IOMMU for the device with DMAR not disabled, and both
 * scalable mode and PASID support on that IOMMU. PASID is enabled via
 * intel_iommu_enable_pasid(); then, under device_domain_lock, auxd_enabled
 * is set in the device's domain info.
 * NOTE(review): unlike intel_iommu_disable_auxd(), no NULL check on the
 * domain info is visible before it is written - confirm one is not needed.
 */
5483 static int intel_iommu_enable_auxd(struct device *dev)
5485 struct device_domain_info *info;
5486 struct intel_iommu *iommu;
5487 unsigned long flags;
5491 iommu = device_to_iommu(dev, &bus, &devfn);
5492 if (!iommu || dmar_disabled)
5495 if (!sm_supported(iommu) || !pasid_supported(iommu))
5498 ret = intel_iommu_enable_pasid(iommu, dev);
5502 spin_lock_irqsave(&device_domain_lock, flags);
5503 info = dev->archdata.iommu;
5504 info->auxd_enabled = 1;
5505 spin_unlock_irqrestore(&device_domain_lock, flags);
/*
 * Disable auxiliary-domain support for @dev: under device_domain_lock,
 * clears auxd_enabled in the device's domain info. A missing domain info
 * triggers a warning and is otherwise ignored.
 */
5510 static int intel_iommu_disable_auxd(struct device *dev)
5512 struct device_domain_info *info;
5513 unsigned long flags;
5515 spin_lock_irqsave(&device_domain_lock, flags);
5516 info = dev->archdata.iommu;
5517 if (!WARN_ON(!info))
5518 info->auxd_enabled = 0;
5519 spin_unlock_irqrestore(&device_domain_lock, flags);
5525 * A PCI express designated vendor specific extended capability is defined
5526 * in the section 3.7 of Intel scalable I/O virtualization technical spec
5527 * for system software and tools to detect endpoint devices supporting the
5528 * Intel scalable IO virtualization without host driver dependency.
5530 * Returns the address of the matching extended capability structure within
5531 * the device's PCI configuration space or 0 if the device does not support
/*
 * Scan @pdev's extended capabilities with ID 0x23 (the designated vendor
 * specific capability this lookup is about). For each one, 16-bit words are
 * read at offsets 4 and 8 from the capability and compared against
 * PCI_VENDOR_ID_INTEL and the id value 5; the search continues with the
 * next capability of the same ID otherwise.
 */
5534 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5539 pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5541 pci_read_config_word(pdev, pos + 4, &vendor);
5542 pci_read_config_word(pdev, pos + 8, &id);
5543 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5546 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
/*
 * intel_iommu_dev_has_feat() - report whether @dev can support @feat.
 *
 * Only the IOMMU_DEV_FEAT_AUX case is visible. For it the code checks that
 * @dev is a PCI device, that dmar_disabled is not set, and that
 * scalable_mode_support() and iommu_pasid_support() both hold; it then
 * queries pci_pasid_features() and finally reports whether
 * siov_find_pci_dvsec() located the capability.
 *
 * NOTE(review): the return type line, the handling of ret, the early
 * returns and the result for other features are not visible here.
 */
5553 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5555 if (feat == IOMMU_DEV_FEAT_AUX) {
5558 if (!dev_is_pci(dev) || dmar_disabled ||
5559 !scalable_mode_support() || !iommu_pasid_support())
5562 ret = pci_pasid_features(to_pci_dev(dev));
/* The double negation collapses the capability offset to 0 or 1. */
5566 return !!siov_find_pci_dvsec(to_pci_dev(dev));
/*
 * intel_iommu_dev_enable_feat() - enable @feat on @dev.
 *
 * IOMMU_DEV_FEAT_AUX is forwarded to intel_iommu_enable_auxd() and its
 * result is returned unchanged.
 *
 * NOTE(review): the return type line and the result for any other feature
 * are not visible in this listing.
 */
5573 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5575 if (feat == IOMMU_DEV_FEAT_AUX)
5576 return intel_iommu_enable_auxd(dev);
/*
 * intel_iommu_dev_disable_feat() - disable @feat on @dev.
 *
 * IOMMU_DEV_FEAT_AUX is forwarded to intel_iommu_disable_auxd() and its
 * result is returned unchanged.
 *
 * NOTE(review): the return type line and the result for any other feature
 * are not visible in this listing.
 */
5582 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5584 if (feat == IOMMU_DEV_FEAT_AUX)
5585 return intel_iommu_disable_auxd(dev);
/*
 * intel_iommu_dev_feat_enabled() - report whether @feat is enabled on @dev.
 *
 * For IOMMU_DEV_FEAT_AUX the answer is true only when
 * scalable_mode_support() holds, the device has per-device info in
 * dev->archdata.iommu, and that info has auxd_enabled set.
 *
 * NOTE(review): info is read here without taking device_domain_lock, and
 * the result for any other feature is not visible in this listing.
 */
5591 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5593 struct device_domain_info *info = dev->archdata.iommu;
5595 if (feat == IOMMU_DEV_FEAT_AUX)
5596 return scalable_mode_support() && info && info->auxd_enabled;
/*
 * intel_iommu_aux_get_pasid() - return the default PASID of @domain.
 *
 * Returns dmar_domain->default_pasid when it is greater than zero and
 * -EINVAL otherwise. @dev is not used by the visible lines.
 */
5602 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5604 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
/* A default_pasid of zero or below is reported as -EINVAL. */
5606 return dmar_domain->default_pasid > 0 ?
5607 dmar_domain->default_pasid : -EINVAL;
/*
 * intel_iommu_is_attach_deferred() - report whether attaching is deferred.
 *
 * True exactly when dev->archdata.iommu holds the DEFER_DEVICE_DOMAIN_INFO
 * marker value. @domain is not used by the visible lines.
 *
 * NOTE(review): the continuation of the parameter list (declaring dev) is
 * not visible in this listing.
 */
5610 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5613 return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
/*
 * intel_iommu_ops - callback table of type struct iommu_ops for this
 * driver. Each member below is bound to the driver function named on the
 * same line; pgsize_bitmap is set to INTEL_IOMMU_PGSIZES.
 */
5616 const struct iommu_ops intel_iommu_ops = {
5617 .capable = intel_iommu_capable,
/* Domain allocation and teardown, device attach and detach. */
5618 .domain_alloc = intel_iommu_domain_alloc,
5619 .domain_free = intel_iommu_domain_free,
5620 .attach_dev = intel_iommu_attach_device,
5621 .detach_dev = intel_iommu_detach_device,
/* Auxiliary-domain variants and the PASID query that goes with them. */
5622 .aux_attach_dev = intel_iommu_aux_attach_device,
5623 .aux_detach_dev = intel_iommu_aux_detach_device,
5624 .aux_get_pasid = intel_iommu_aux_get_pasid,
/* Mapping, unmapping and address lookup. */
5625 .map = intel_iommu_map,
5626 .unmap = intel_iommu_unmap,
5627 .iova_to_phys = intel_iommu_iova_to_phys,
5628 .add_device = intel_iommu_add_device,
5629 .remove_device = intel_iommu_remove_device,
/* Reserved-region handling. */
5630 .get_resv_regions = intel_iommu_get_resv_regions,
5631 .put_resv_regions = intel_iommu_put_resv_regions,
5632 .apply_resv_region = intel_iommu_apply_resv_region,
5633 .device_group = pci_device_group,
/* Per-device feature query, enable and disable hooks. */
5634 .dev_has_feat = intel_iommu_dev_has_feat,
5635 .dev_feat_enabled = intel_iommu_dev_feat_enabled,
5636 .dev_enable_feat = intel_iommu_dev_enable_feat,
5637 .dev_disable_feat = intel_iommu_dev_disable_feat,
5638 .is_attach_deferred = intel_iommu_is_attach_deferred,
5639 .pgsize_bitmap = INTEL_IOMMU_PGSIZES,
/*
 * quirk_iommu_g4x_gfx() - PCI quirk for G4x/GM45 integrated graphics.
 *
 * Logs that the IOMMU is being disabled for graphics on this chipset.
 *
 * NOTE(review): the statement that actually changes driver state is not
 * visible in this listing; only the log message is.
 */
5642 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5644 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5645 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
/*
 * Register quirk_iommu_g4x_gfx() as a PCI header fixup for each of the
 * Intel device IDs listed below.
 */
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
/*
 * quirk_iommu_rwbf() - PCI quirk forcing the write-buffer flush capability.
 *
 * Logs that the capability is being forced, for the reason given in the
 * comment text inside the function.
 *
 * NOTE(review): the statement that records the forced capability is not
 * visible in this listing; only the log message is.
 */
5657 static void quirk_iommu_rwbf(struct pci_dev *dev)
5660 * Mobile 4 Series Chipset neglects to set RWBF capability,
5661 * but needs it. Same seems to hold for the desktop versions.
5663 pci_info(dev, "Forcing write-buffer flush capability\n");
/*
 * Register quirk_iommu_rwbf() as a PCI header fixup for each of the Intel
 * device IDs listed below (the same IDs as for quirk_iommu_g4x_gfx).
 */
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5672 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5673 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
/*
 * Values of a 4-bit field held in bits 11:8 of the GGC config word (the
 * word read by quirk_calpella_no_shadow_gtt()). GGC_MEMORY_VT_ENABLED is
 * the top bit of that field; every *_VT size encoding has it set.
 */
5676 #define GGC_MEMORY_SIZE_MASK (0xf << 8)
5677 #define GGC_MEMORY_SIZE_NONE (0x0 << 8)
5678 #define GGC_MEMORY_SIZE_1M (0x1 << 8)
5679 #define GGC_MEMORY_SIZE_2M (0x3 << 8)
/* Flag bit tested by the Calpella quirk. */
5680 #define GGC_MEMORY_VT_ENABLED (0x8 << 8)
5681 #define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
5682 #define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
5683 #define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
/*
 * quirk_calpella_no_shadow_gtt() - PCI quirk keyed on the GGC config word.
 *
 * Reads the GGC word from @dev and then:
 *   - if GGC_MEMORY_VT_ENABLED is clear, logs that the BIOS allocated no
 *     shadow GTT and that the IOMMU is being disabled for graphics;
 *   - otherwise, if dmar_map_gfx is set, logs that batched IOTLB flushing
 *     is being disabled and sets intel_iommu_strict to 1.
 *
 * NOTE(review): the declaration of ggc, the statement taken when the
 * config read fails, and the statement that disables graphics mapping are
 * not visible in this listing.
 */
5685 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
/* A non-zero result from the config read is treated as failure. */
5689 if (pci_read_config_word(dev, GGC, &ggc))
5692 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5693 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5695 } else if (dmar_map_gfx) {
5696 /* we have to ensure the gfx device is idle before we flush */
5697 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5698 intel_iommu_strict = 1;
/*
 * Register quirk_calpella_no_shadow_gtt() as a PCI header fixup for each
 * of the Intel device IDs listed below.
 */
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5706 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5707 ISOCH DMAR unit for the Azalia sound device, but not give it any
5708 TLB entries, which causes it to deadlock. Check for that. We do
5709 this in a function called from init_dmars(), instead of in a PCI
5710 quirk, because we don't want to print the obnoxious "BIOS broken"
5711 message if VT-d is actually disabled.
5713 static void __init check_tylersburg_isoch(void)
5715 struct pci_dev *pdev;
5716 uint32_t vtisochctrl;
5718 /* If there's no Azalia in the system anyway, forget it. */
5719 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5724 /* System Management Registers. Might be hidden, in which case
5725 we can't do the sanity check. But that's OK, because the
5726 known-broken BIOSes _don't_ actually hide it, so far. */
5727 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5731 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5738 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5739 if (vtisochctrl & 1)
5742 /* Drop all bits other than the number of TLB entries */
5743 vtisochctrl &= 0x1c;
5745 /* If we have the recommended number of TLB entries (16), fine. */
5746 if (vtisochctrl == 0x10)
5749 /* Zero TLB entries? You get to ride the short bus to school. */
5751 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5752 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5753 dmi_get_system_info(DMI_BIOS_VENDOR),
5754 dmi_get_system_info(DMI_BIOS_VERSION),
5755 dmi_get_system_info(DMI_PRODUCT_VERSION));
5756 iommu_identity_mapping |= IDENTMAP_AZALIA;
5760 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",