2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
22 #define dev_fmt(fmt) pr_fmt(fmt)
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/cacheflush.h>
53 #include <asm/iommu.h>
55 #include "irq_remapping.h"
56 #include "intel-pasid.h"
58 #define ROOT_SIZE VTD_PAGE_SIZE
59 #define CONTEXT_SIZE VTD_PAGE_SIZE
61 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
62 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
63 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
64 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66 #define IOAPIC_RANGE_START (0xfee00000)
67 #define IOAPIC_RANGE_END (0xfeefffff)
68 #define IOVA_START_ADDR (0x1000)
70 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72 #define MAX_AGAW_WIDTH 64
73 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
76 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
79 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
80 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
81 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
82 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
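/*
 * Editor's worked example of the limits above (illustrative, not part
 * of the driver). With VTD_PAGE_SHIFT == 12 and gaw == 48:
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1	(~64G 4KiB frames)
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1	(256TiB - 1)
 *
 * On a 32-bit kernel DOMAIN_MAX_PFN(48) clamps to ULONG_MAX, so PFNs
 * always fit in an unsigned long, as the comment above explains.
 */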
84 /* IO virtual address start page frame number */
85 #define IOVA_START_PFN (1)
87 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
89 /* page table handling */
90 #define LEVEL_STRIDE (9)
91 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
94 * This bitmap is used to advertise the page sizes our hardware supports
95 * to the IOMMU core, which will then use this information to split
96 * physically contiguous memory regions it is mapping into page sizes
97 * that we support.
99 * Traditionally the IOMMU core just handed us the mappings directly,
100 * after making sure the size is a power-of-two multiple of 4KiB and that
101 * the mapping has natural alignment.
103 * To retain this behavior, we currently advertise that we support
104 * all page sizes that are a power-of-two multiple of 4KiB.
106 * If at some point we'd like to utilize the IOMMU core's new behavior,
107 * we could change this to advertise the real page sizes we support.
109 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
111 static inline int agaw_to_level(int agaw)
116 static inline int agaw_to_width(int agaw)
118 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
121 static inline int width_to_agaw(int width)
123 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
126 static inline unsigned int level_to_offset_bits(int level)
128 return (level - 1) * LEVEL_STRIDE;
131 static inline int pfn_level_offset(unsigned long pfn, int level)
133 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
136 static inline unsigned long level_mask(int level)
138 return -1UL << level_to_offset_bits(level);
141 static inline unsigned long level_size(int level)
143 return 1UL << level_to_offset_bits(level);
146 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 return (pfn + level_size(level) - 1) & level_mask(level);
151 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
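/*
 * Editor's sketch of the level arithmetic above (illustrative only).
 * With LEVEL_STRIDE == 9, each page-table level indexes nine bits of
 * the DMA PFN:
 *
 *	level_to_offset_bits(1) == 0, level_to_offset_bits(2) == 9;
 *	pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1ff == 0x91;
 *	level_size(2) == 512 pages, i.e. 2MiB of IOVA per level-2 entry.
 */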
156 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
157 are never going to work. */
158 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 return mm_to_dma_pfn(page_to_pfn(pg));
171 static inline unsigned long virt_to_dma_pfn(void *p)
173 return page_to_dma_pfn(virt_to_page(p));
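/*
 * Editor's note: on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the two
 * conversions above are identity (a shift by zero). They only matter
 * when MM pages are larger than 4KiB VT-d pages; e.g. a hypothetical
 * 64KiB PAGE_SIZE would give:
 *
 *	mm_to_dma_pfn(1) == 16 and dma_to_mm_pfn(16) == 1
 */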
176 /* global iommu list, set NULL for ignored DMAR units */
177 static struct intel_iommu **g_iommus;
179 static void __init check_tylersburg_isoch(void);
180 static int rwbf_quirk;
183 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
184 * (used when the kernel is launched with TXT)
186 static int force_on = 0;
187 int intel_iommu_tboot_noforce;
188 static int no_platform_optin;
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
193 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
196 static phys_addr_t root_entry_lctp(struct root_entry *re)
201 return re->lo & VTD_PAGE_MASK;
205 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
208 static phys_addr_t root_entry_uctp(struct root_entry *re)
213 return re->hi & VTD_PAGE_MASK;
216 static inline void context_clear_pasid_enable(struct context_entry *context)
218 context->lo &= ~(1ULL << 11);
221 static inline bool context_pasid_enabled(struct context_entry *context)
223 return !!(context->lo & (1ULL << 11));
226 static inline void context_set_copied(struct context_entry *context)
228 context->hi |= (1ull << 3);
231 static inline bool context_copied(struct context_entry *context)
233 return !!(context->hi & (1ULL << 3));
236 static inline bool __context_present(struct context_entry *context)
238 return (context->lo & 1);
241 bool context_present(struct context_entry *context)
243 return context_pasid_enabled(context) ?
244 __context_present(context) :
245 __context_present(context) && !context_copied(context);
248 static inline void context_set_present(struct context_entry *context)
253 static inline void context_set_fault_enable(struct context_entry *context)
255 context->lo &= (((u64)-1) << 2) | 1;
258 static inline void context_set_translation_type(struct context_entry *context,
261 context->lo &= (((u64)-1) << 4) | 3;
262 context->lo |= (value & 3) << 2;
265 static inline void context_set_address_root(struct context_entry *context,
268 context->lo &= ~VTD_PAGE_MASK;
269 context->lo |= value & VTD_PAGE_MASK;
272 static inline void context_set_address_width(struct context_entry *context,
275 context->hi |= value & 7;
278 static inline void context_set_domain_id(struct context_entry *context,
281 context->hi |= (value & ((1 << 16) - 1)) << 8;
284 static inline int context_domain_id(struct context_entry *c)
286 return((c->hi >> 8) & 0xffff);
289 static inline void context_clear_entry(struct context_entry *context)
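/*
 * Editor's summary of the legacy context-entry layout implied by the
 * accessors above (bit positions read from the code, not restated from
 * the VT-d spec):
 *
 *	lo[0]		present bit	(context_set_present)
 *	lo[1]		fault processing disable (cleared by
 *			 context_set_fault_enable)
 *	lo[3:2]		translation type (context_set_translation_type)
 *	lo[63:12]	address root	(context_set_address_root)
 *	hi[2:0]		address width	(context_set_address_width)
 *	hi[23:8]	domain id	(context_set_domain_id)
 */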
296 * This domain is a static identity-mapping domain.
297 * 1. This domain creates a static 1:1 mapping of all usable memory.
298 * 2. It maps to each iommu if successful.
299 * 3. Each iommu maps to this domain if successful.
301 static struct dmar_domain *si_domain;
302 static int hw_pass_through = 1;
305 * Domain represents a virtual machine; more than one device
306 * across iommus may be owned by one domain, e.g. a kvm guest.
308 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
310 /* si_domain contains multiple devices */
311 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
313 #define for_each_domain_iommu(idx, domain) \
314 for (idx = 0; idx < g_num_of_iommus; idx++) \
315 if (domain->iommu_refcnt[idx])
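/*
 * Editor's usage sketch for the iterator above (illustrative):
 *
 *	int idx;
 *
 *	for_each_domain_iommu(idx, domain) {
 *		struct intel_iommu *iommu = g_iommus[idx];
 *		// per-IOMMU work, e.g. an IOTLB flush
 *	}
 *
 * Only indices with a non-zero iommu_refcnt are visited, i.e. the
 * IOMMUs the domain is currently attached to.
 */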
317 struct dmar_rmrr_unit {
318 struct list_head list; /* list of rmrr units */
319 struct acpi_dmar_header *hdr; /* ACPI header */
320 u64 base_address; /* reserved base address*/
321 u64 end_address; /* reserved end address */
322 struct dmar_dev_scope *devices; /* target devices */
323 int devices_cnt; /* target device count */
324 struct iommu_resv_region *resv; /* reserved region handle */
327 struct dmar_atsr_unit {
328 struct list_head list; /* list of ATSR units */
329 struct acpi_dmar_header *hdr; /* ACPI header */
330 struct dmar_dev_scope *devices; /* target devices */
331 int devices_cnt; /* target device count */
332 u8 include_all:1; /* include all ports */
335 static LIST_HEAD(dmar_atsr_units);
336 static LIST_HEAD(dmar_rmrr_units);
338 #define for_each_rmrr_units(rmrr) \
339 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
341 /* number of IOMMUs, used to size and index g_iommus */
342 static int g_num_of_iommus;
344 static void domain_exit(struct dmar_domain *domain);
345 static void domain_remove_dev_info(struct dmar_domain *domain);
346 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
351 static int domain_detach_iommu(struct dmar_domain *domain,
352 struct intel_iommu *iommu);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363 static int dmar_map_gfx = 1;
364 static int dmar_forcedac;
365 static int intel_iommu_strict;
366 static int intel_iommu_superpage = 1;
367 static int intel_iommu_sm = 1;
368 static int iommu_identity_mapping;
370 #define IDENTMAP_ALL 1
371 #define IDENTMAP_GFX 2
372 #define IDENTMAP_AZALIA 4
374 #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
375 #define pasid_supported(iommu) (sm_supported(iommu) && \
376 ecap_pasid((iommu)->ecap))
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 static DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
386 * Iterate over elements in device_domain_list and call the specified
387 * callback @fn against each element.
389 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
390 void *data), void *data)
394 struct device_domain_info *info;
396 spin_lock_irqsave(&device_domain_lock, flags);
397 list_for_each_entry(info, &device_domain_list, global) {
398 ret = fn(info, data);
400 spin_unlock_irqrestore(&device_domain_lock, flags);
404 spin_unlock_irqrestore(&device_domain_lock, flags);
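/*
 * Editor's sketch of a for_each_device_domain() callback (illustrative;
 * match_bus() is a made-up example, not a driver function). A non-zero
 * return from the callback stops the walk, as the (elided) early-return
 * path above shows:
 *
 *	static int match_bus(struct device_domain_info *info, void *data)
 *	{
 *		u8 bus = *(u8 *)data;
 *
 *		return info->bus == bus;
 *	}
 *
 *	u8 bus = 0x20;
 *	for_each_device_domain(match_bus, &bus);
 */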
409 const struct iommu_ops intel_iommu_ops;
411 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
416 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
421 static void init_translation_status(struct intel_iommu *iommu)
425 gsts = readl(iommu->reg + DMAR_GSTS_REG);
426 if (gsts & DMA_GSTS_TES)
427 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
430 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
431 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 return container_of(dom, struct dmar_domain, domain);
436 static int __init intel_iommu_setup(char *str)
441 if (!strncmp(str, "on", 2)) {
443 pr_info("IOMMU enabled\n");
444 } else if (!strncmp(str, "off", 3)) {
446 no_platform_optin = 1;
447 pr_info("IOMMU disabled\n");
448 } else if (!strncmp(str, "igfx_off", 8)) {
450 pr_info("Disable GFX device mapping\n");
451 } else if (!strncmp(str, "forcedac", 8)) {
452 pr_info("Forcing DAC for PCI devices\n");
454 } else if (!strncmp(str, "strict", 6)) {
455 pr_info("Disable batched IOTLB flush\n");
456 intel_iommu_strict = 1;
457 } else if (!strncmp(str, "sp_off", 6)) {
458 pr_info("Disable supported super page\n");
459 intel_iommu_superpage = 0;
460 } else if (!strncmp(str, "sm_off", 6)) {
461 pr_info("Intel-IOMMU: disable scalable mode support\n");
463 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466 intel_iommu_tboot_noforce = 1;
469 str += strcspn(str, ",");
475 __setup("intel_iommu=", intel_iommu_setup);
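/*
 * Editor's note: example kernel command lines accepted by the parser
 * above; options are comma-separated, which is what the strcspn() walk
 * at the end of the loop implements:
 *
 *	intel_iommu=on
 *	intel_iommu=on,strict,sp_off
 *	intel_iommu=off
 */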
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 struct dmar_domain **domains;
485 domains = iommu->domains[idx];
489 return domains[did & 0xff];
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 struct dmar_domain *domain)
495 struct dmar_domain **domains;
498 if (!iommu->domains[idx]) {
499 size_t size = 256 * sizeof(struct dmar_domain *);
500 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503 domains = iommu->domains[idx];
504 if (WARN_ON(!domains))
507 domains[did & 0xff] = domain;
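/*
 * Editor's note: iommu->domains is a two-level table indexed by domain
 * id; the top level holds pointers to 256-entry pages allocated on
 * demand. Assuming idx == did >> 8 in the elided index computation, a
 * lookup is effectively:
 *
 *	domain = iommu->domains[did >> 8][did & 0xff];
 *
 * which keeps memory usage small when only a few of the up to 65536
 * domain ids are in use.
 */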
510 void *alloc_pgtable_page(int node)
515 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 vaddr = page_address(page);
521 void free_pgtable_page(void *vaddr)
523 free_page((unsigned long)vaddr);
526 static inline void *alloc_domain_mem(void)
528 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 static void free_domain_mem(void *vaddr)
533 kmem_cache_free(iommu_domain_cache, vaddr);
536 static inline void * alloc_devinfo_mem(void)
538 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 static inline void free_devinfo_mem(void *vaddr)
543 kmem_cache_free(iommu_devinfo_cache, vaddr);
546 static inline int domain_type_is_vm(struct dmar_domain *domain)
548 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
551 static inline int domain_type_is_si(struct dmar_domain *domain)
553 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
556 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
558 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
559 DOMAIN_FLAG_STATIC_IDENTITY);
562 static inline int domain_pfn_supported(struct dmar_domain *domain,
565 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
567 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
570 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
575 sagaw = cap_sagaw(iommu->cap);
576 for (agaw = width_to_agaw(max_gaw);
578 if (test_bit(agaw, &sagaw))
586 * Calculate max SAGAW for each iommu.
588 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
590 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
594 * Calculate the agaw for each iommu.
595 * "SAGAW" may differ across iommus, so use a default agaw and fall
596 * back to a smaller supported agaw for iommus that don't support the default.
598 int iommu_calculate_agaw(struct intel_iommu *iommu)
600 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
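/*
 * Editor's worked example (illustrative): with LEVEL_STRIDE == 9,
 * width_to_agaw(57) == DIV_ROUND_UP(57 - 30, 9) == 3, a 5-level table
 * (in the full source agaw_to_level() returns agaw + 2), while a 48-bit
 * width gives agaw 2 (4-level). __iommu_calculate_agaw() then steps the
 * candidate agaw down until cap_sagaw() reports hardware support.
 */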
603 /* This function only returns a single iommu for a domain */
604 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
608 /* si_domain and vm domain should not get here. */
609 BUG_ON(domain_type_is_vm_or_si(domain));
610 for_each_domain_iommu(iommu_id, domain)
613 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
616 return g_iommus[iommu_id];
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 struct dmar_drhd_unit *drhd;
622 struct intel_iommu *iommu;
626 domain->iommu_coherency = 1;
628 for_each_domain_iommu(i, domain) {
630 if (!ecap_coherent(g_iommus[i]->ecap)) {
631 domain->iommu_coherency = 0;
638 /* No hardware attached; use lowest common denominator */
640 for_each_active_iommu(iommu, drhd) {
641 if (!ecap_coherent(iommu->ecap)) {
642 domain->iommu_coherency = 0;
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 struct dmar_drhd_unit *drhd;
652 struct intel_iommu *iommu;
656 for_each_active_iommu(iommu, drhd) {
658 if (!ecap_sc_support(iommu->ecap)) {
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
671 struct dmar_drhd_unit *drhd;
672 struct intel_iommu *iommu;
675 if (!intel_iommu_superpage) {
679 /* set iommu_superpage to the smallest common denominator */
681 for_each_active_iommu(iommu, drhd) {
683 mask &= cap_super_page_val(iommu->cap);
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
696 domain_update_iommu_coherency(domain);
697 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
704 struct root_entry *root = &iommu->root_entry[bus];
705 struct context_entry *context;
709 if (sm_supported(iommu)) {
717 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 unsigned long phy_addr;
723 context = alloc_pgtable_page(iommu->node);
727 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728 phy_addr = virt_to_phys((void *)context);
729 *entry = phy_addr | 1;
730 __iommu_flush_cache(iommu, entry, sizeof(*entry));
732 return &context[devfn];
735 static int iommu_dummy(struct device *dev)
737 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
740 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
742 struct dmar_drhd_unit *drhd = NULL;
743 struct intel_iommu *iommu;
745 struct pci_dev *ptmp, *pdev = NULL;
749 if (iommu_dummy(dev))
752 if (dev_is_pci(dev)) {
753 struct pci_dev *pf_pdev;
755 pdev = to_pci_dev(dev);
758 /* VMD child devices currently cannot be handled individually */
759 if (is_vmd(pdev->bus))
763 /* VFs aren't listed in scope tables; we need to look up
764 * the PF instead to find the IOMMU. */
765 pf_pdev = pci_physfn(pdev);
767 segment = pci_domain_nr(pdev->bus);
768 } else if (has_acpi_companion(dev))
769 dev = &ACPI_COMPANION(dev)->dev;
772 for_each_active_iommu(iommu, drhd) {
773 if (pdev && segment != drhd->segment)
776 for_each_active_dev_scope(drhd->devices,
777 drhd->devices_cnt, i, tmp) {
779 /* For a VF use its original BDF# not that of the PF
780 * which we used for the IOMMU lookup. Strictly speaking
781 * we could do this for all PCI devices; we only need to
782 * get the BDF# from the scope table for ACPI matches. */
783 if (pdev && pdev->is_virtfn)
786 *bus = drhd->devices[i].bus;
787 *devfn = drhd->devices[i].devfn;
791 if (!pdev || !dev_is_pci(tmp))
794 ptmp = to_pci_dev(tmp);
795 if (ptmp->subordinate &&
796 ptmp->subordinate->number <= pdev->bus->number &&
797 ptmp->subordinate->busn_res.end >= pdev->bus->number)
801 if (pdev && drhd->include_all) {
803 *bus = pdev->bus->number;
804 *devfn = pdev->devfn;
815 static void domain_flush_cache(struct dmar_domain *domain,
816 void *addr, int size)
818 if (!domain->iommu_coherency)
819 clflush_cache_range(addr, size);
822 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
824 struct context_entry *context;
828 spin_lock_irqsave(&iommu->lock, flags);
829 context = iommu_context_addr(iommu, bus, devfn, 0);
831 ret = context_present(context);
832 spin_unlock_irqrestore(&iommu->lock, flags);
836 static void free_context_table(struct intel_iommu *iommu)
840 struct context_entry *context;
842 spin_lock_irqsave(&iommu->lock, flags);
843 if (!iommu->root_entry) {
846 for (i = 0; i < ROOT_ENTRY_NR; i++) {
847 context = iommu_context_addr(iommu, i, 0, 0);
849 free_pgtable_page(context);
851 if (!sm_supported(iommu))
854 context = iommu_context_addr(iommu, i, 0x80, 0);
856 free_pgtable_page(context);
859 free_pgtable_page(iommu->root_entry);
860 iommu->root_entry = NULL;
862 spin_unlock_irqrestore(&iommu->lock, flags);
865 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
866 unsigned long pfn, int *target_level)
868 struct dma_pte *parent, *pte;
869 int level = agaw_to_level(domain->agaw);
872 BUG_ON(!domain->pgd);
874 if (!domain_pfn_supported(domain, pfn))
875 /* Address beyond IOMMU's addressing capabilities. */
878 parent = domain->pgd;
883 offset = pfn_level_offset(pfn, level);
884 pte = &parent[offset];
885 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
887 if (level == *target_level)
890 if (!dma_pte_present(pte)) {
893 tmp_page = alloc_pgtable_page(domain->nid);
898 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
899 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
900 if (cmpxchg64(&pte->val, 0ULL, pteval))
901 /* Someone else set it while we were thinking; use theirs. */
902 free_pgtable_page(tmp_page);
904 domain_flush_cache(domain, pte, sizeof(*pte));
909 parent = phys_to_virt(dma_pte_addr(pte));
914 *target_level = level;
920 /* return the address's pte at a specific level */
921 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
923 int level, int *large_page)
925 struct dma_pte *parent, *pte;
926 int total = agaw_to_level(domain->agaw);
929 parent = domain->pgd;
930 while (level <= total) {
931 offset = pfn_level_offset(pfn, total);
932 pte = &parent[offset];
936 if (!dma_pte_present(pte)) {
941 if (dma_pte_superpage(pte)) {
946 parent = phys_to_virt(dma_pte_addr(pte));
952 /* clear last-level ptes; a TLB flush should follow */
953 static void dma_pte_clear_range(struct dmar_domain *domain,
954 unsigned long start_pfn,
955 unsigned long last_pfn)
957 unsigned int large_page;
958 struct dma_pte *first_pte, *pte;
960 BUG_ON(!domain_pfn_supported(domain, start_pfn));
961 BUG_ON(!domain_pfn_supported(domain, last_pfn));
962 BUG_ON(start_pfn > last_pfn);
964 /* we don't need lock here; nobody else touches the iova range */
967 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
969 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974 start_pfn += lvl_to_nr_pages(large_page);
976 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
978 domain_flush_cache(domain, first_pte,
979 (void *)pte - (void *)first_pte);
981 } while (start_pfn && start_pfn <= last_pfn);
984 static void dma_pte_free_level(struct dmar_domain *domain, int level,
985 int retain_level, struct dma_pte *pte,
986 unsigned long pfn, unsigned long start_pfn,
987 unsigned long last_pfn)
989 pfn = max(start_pfn, pfn);
990 pte = &pte[pfn_level_offset(pfn, level)];
993 unsigned long level_pfn;
994 struct dma_pte *level_pte;
996 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
999 level_pfn = pfn & level_mask(level);
1000 level_pte = phys_to_virt(dma_pte_addr(pte));
1003 dma_pte_free_level(domain, level - 1, retain_level,
1004 level_pte, level_pfn, start_pfn,
1009 * Free the page table if we're below the level we want to
1010 * retain and the range covers the entire table.
1012 if (level < retain_level && !(start_pfn > level_pfn ||
1013 last_pfn < level_pfn + level_size(level) - 1)) {
1015 domain_flush_cache(domain, pte, sizeof(*pte));
1016 free_pgtable_page(level_pte);
1019 pfn += level_size(level);
1020 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1024 * clear last level (leaf) ptes and free page table pages below the
1025 * level we wish to keep intact.
1027 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1028 unsigned long start_pfn,
1029 unsigned long last_pfn,
1032 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1033 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1034 BUG_ON(start_pfn > last_pfn);
1036 dma_pte_clear_range(domain, start_pfn, last_pfn);
1038 /* We don't need lock here; nobody else touches the iova range */
1039 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1040 domain->pgd, 0, start_pfn, last_pfn);
1043 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1044 free_pgtable_page(domain->pgd);
1049 /* When a page at a given level is being unlinked from its parent, we don't
1050 need to *modify* it at all. All we need to do is make a list of all the
1051 pages which can be freed just as soon as we've flushed the IOTLB and we
1052 know the hardware page-walk will no longer touch them.
1053 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1054 be freed. */
1055 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1056 int level, struct dma_pte *pte,
1057 struct page *freelist)
1061 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1062 pg->freelist = freelist;
1068 pte = page_address(pg);
1070 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1071 freelist = dma_pte_list_pagetables(domain, level - 1,
1074 } while (!first_pte_in_page(pte));
1079 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1080 struct dma_pte *pte, unsigned long pfn,
1081 unsigned long start_pfn,
1082 unsigned long last_pfn,
1083 struct page *freelist)
1085 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1087 pfn = max(start_pfn, pfn);
1088 pte = &pte[pfn_level_offset(pfn, level)];
1091 unsigned long level_pfn;
1093 if (!dma_pte_present(pte))
1096 level_pfn = pfn & level_mask(level);
1098 /* If range covers entire pagetable, free it */
1099 if (start_pfn <= level_pfn &&
1100 last_pfn >= level_pfn + level_size(level) - 1) {
1101 /* These subordinate page tables are going away entirely. Don't
1102 bother to clear them; we're just going to *free* them. */
1103 if (level > 1 && !dma_pte_superpage(pte))
1104 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1110 } else if (level > 1) {
1111 /* Recurse down into a level that isn't *entirely* obsolete */
1112 freelist = dma_pte_clear_level(domain, level - 1,
1113 phys_to_virt(dma_pte_addr(pte)),
1114 level_pfn, start_pfn, last_pfn,
1118 pfn += level_size(level);
1119 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1122 domain_flush_cache(domain, first_pte,
1123 (void *)++last_pte - (void *)first_pte);
1128 /* We can't just free the pages because the IOMMU may still be walking
1129 the page tables, and may have cached the intermediate levels. The
1130 pages can only be freed after the IOTLB flush has been done. */
1131 static struct page *domain_unmap(struct dmar_domain *domain,
1132 unsigned long start_pfn,
1133 unsigned long last_pfn)
1135 struct page *freelist;
1137 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1138 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1139 BUG_ON(start_pfn > last_pfn);
1141 /* we don't need lock here; nobody else touches the iova range */
1142 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1143 domain->pgd, 0, start_pfn, last_pfn, NULL);
1146 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1147 struct page *pgd_page = virt_to_page(domain->pgd);
1148 pgd_page->freelist = freelist;
1149 freelist = pgd_page;
1157 static void dma_free_pagelist(struct page *freelist)
1161 while ((pg = freelist)) {
1162 freelist = pg->freelist;
1163 free_pgtable_page(page_address(pg));
1167 static void iova_entry_free(unsigned long data)
1169 struct page *freelist = (struct page *)data;
1171 dma_free_pagelist(freelist);
1174 /* iommu handling */
1175 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1177 struct root_entry *root;
1178 unsigned long flags;
1180 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1182 pr_err("Allocating root entry for %s failed\n",
1187 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1189 spin_lock_irqsave(&iommu->lock, flags);
1190 iommu->root_entry = root;
1191 spin_unlock_irqrestore(&iommu->lock, flags);
1196 static void iommu_set_root_entry(struct intel_iommu *iommu)
1202 addr = virt_to_phys(iommu->root_entry);
1203 if (sm_supported(iommu))
1204 addr |= DMA_RTADDR_SMT;
1206 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1209 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1211 /* Make sure the hardware completes it */
1212 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213 readl, (sts & DMA_GSTS_RTPS), sts);
1215 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1218 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1226 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1229 /* Make sure the hardware completes it */
1230 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1231 readl, (!(val & DMA_GSTS_WBFS)), val);
1233 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236 /* return value determines if we need a write buffer flush */
1237 static void __iommu_flush_context(struct intel_iommu *iommu,
1238 u16 did, u16 source_id, u8 function_mask,
1245 case DMA_CCMD_GLOBAL_INVL:
1246 val = DMA_CCMD_GLOBAL_INVL;
1248 case DMA_CCMD_DOMAIN_INVL:
1249 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1251 case DMA_CCMD_DEVICE_INVL:
1252 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1253 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258 val |= DMA_CCMD_ICC;
1260 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1263 /* Make sure the hardware completes it */
1264 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1265 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270 /* return value determines if we need a write buffer flush */
1271 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1272 u64 addr, unsigned int size_order, u64 type)
1274 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1275 u64 val = 0, val_iva = 0;
1279 case DMA_TLB_GLOBAL_FLUSH:
1280 /* a global flush doesn't need to set IVA_REG */
1281 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1283 case DMA_TLB_DSI_FLUSH:
1284 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1286 case DMA_TLB_PSI_FLUSH:
1287 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 /* IH bit is passed in as part of address */
1289 val_iva = size_order | addr;
1294 /* Note: set drain read/write */
1297 * This is probably meant to be extra-safe; it looks like we can
1298 * ignore it without any impact.
1300 if (cap_read_drain(iommu->cap))
1301 val |= DMA_TLB_READ_DRAIN;
1303 if (cap_write_drain(iommu->cap))
1304 val |= DMA_TLB_WRITE_DRAIN;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 /* Note: Only uses first TLB reg currently */
1309 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1310 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1312 /* Make sure the hardware completes it */
1313 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1314 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1316 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1318 /* check IOTLB invalidation granularity */
1319 if (DMA_TLB_IAIG(val) == 0)
1320 pr_err("Flush IOTLB failed\n");
1321 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1322 pr_debug("TLB flush request %Lx, actual %Lx\n",
1323 (unsigned long long)DMA_TLB_IIRG(type),
1324 (unsigned long long)DMA_TLB_IAIG(val));
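/*
 * Editor's note on the granularity check above: DMA_TLB_IIRG() decodes
 * the granularity we requested (from 'type') and DMA_TLB_IAIG() what
 * the hardware actually performed. Hardware may legally widen a flush,
 * e.g. perform a domain-selective flush where a page-selective one was
 * requested, so a mismatch is only a pr_debug(), while IAIG == 0 means
 * the invalidation itself failed and is reported with pr_err().
 */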
1327 static struct device_domain_info *
1328 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1331 struct device_domain_info *info;
1333 assert_spin_locked(&device_domain_lock);
1338 list_for_each_entry(info, &domain->devices, link)
1339 if (info->iommu == iommu && info->bus == bus &&
1340 info->devfn == devfn) {
1341 if (info->ats_supported && info->dev)
1349 static void domain_update_iotlb(struct dmar_domain *domain)
1351 struct device_domain_info *info;
1352 bool has_iotlb_device = false;
1354 assert_spin_locked(&device_domain_lock);
1356 list_for_each_entry(info, &domain->devices, link) {
1357 struct pci_dev *pdev;
1359 if (!info->dev || !dev_is_pci(info->dev))
1362 pdev = to_pci_dev(info->dev);
1363 if (pdev->ats_enabled) {
1364 has_iotlb_device = true;
1369 domain->has_iotlb_device = has_iotlb_device;
1372 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1374 struct pci_dev *pdev;
1376 assert_spin_locked(&device_domain_lock);
1378 if (!info || !dev_is_pci(info->dev))
1381 pdev = to_pci_dev(info->dev);
1382 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1383 * a PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1384 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1385 * treated as reserved and should be set to 0.
1387 if (!ecap_dit(info->iommu->ecap))
1390 struct pci_dev *pf_pdev;
1392 /* pci_physfn() returns pdev itself if the device is not a VF */
1393 pf_pdev = pci_physfn(pdev);
1394 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1397 #ifdef CONFIG_INTEL_IOMMU_SVM
1398 /* The PCIe spec, in its wisdom, declares that the behaviour of
1399 the device if you enable PASID support after ATS support is
1400 undefined. So always enable PASID support on devices which
1401 have it, even if we can't yet know if we're ever going to
1402 use it. */
1403 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1404 info->pasid_enabled = 1;
1406 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1407 info->pri_enabled = 1;
1409 if (!pdev->untrusted && info->ats_supported &&
1410 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1411 info->ats_enabled = 1;
1412 domain_update_iotlb(info->domain);
1413 info->ats_qdep = pci_ats_queue_depth(pdev);
1417 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1419 struct pci_dev *pdev;
1421 assert_spin_locked(&device_domain_lock);
1423 if (!dev_is_pci(info->dev))
1426 pdev = to_pci_dev(info->dev);
1428 if (info->ats_enabled) {
1429 pci_disable_ats(pdev);
1430 info->ats_enabled = 0;
1431 domain_update_iotlb(info->domain);
1433 #ifdef CONFIG_INTEL_IOMMU_SVM
1434 if (info->pri_enabled) {
1435 pci_disable_pri(pdev);
1436 info->pri_enabled = 0;
1438 if (info->pasid_enabled) {
1439 pci_disable_pasid(pdev);
1440 info->pasid_enabled = 0;
1445 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1446 u64 addr, unsigned mask)
1449 unsigned long flags;
1450 struct device_domain_info *info;
1452 if (!domain->has_iotlb_device)
1455 spin_lock_irqsave(&device_domain_lock, flags);
1456 list_for_each_entry(info, &domain->devices, link) {
1457 if (!info->ats_enabled)
1460 sid = info->bus << 8 | info->devfn;
1461 qdep = info->ats_qdep;
1462 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 spin_unlock_irqrestore(&device_domain_lock, flags);
1468 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1469 struct dmar_domain *domain,
1470 unsigned long pfn, unsigned int pages,
1473 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1474 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1475 u16 did = domain->iommu_did[iommu->seq_id];
1482 * Fall back to a domain-selective flush if there is no PSI support or
1483 * the size is too big.
1484 * PSI requires the page size to be 2^x and the base address to be
1485 * naturally aligned to the size.
1487 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1488 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1491 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1495 * In caching mode, changes of pages from non-present to present require
1496 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1498 if (!cap_caching_mode(iommu->cap) || !map)
1499 iommu_flush_dev_iotlb(domain, addr, mask);
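/*
 * Editor's worked example for the PSI path above (illustrative): a
 * request for pages == 9 yields mask == ilog2(__roundup_pow_of_two(9))
 * == ilog2(16) == 4, so the flush covers 16 pages and 'addr' must be
 * naturally aligned to that size; a mask larger than
 * cap_max_amask_val() falls back to a domain-selective flush instead.
 */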
1502 /* Notification for newly created mappings */
1503 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1504 struct dmar_domain *domain,
1505 unsigned long pfn, unsigned int pages)
1507 /* It's a non-present to present mapping. Only flush if caching mode */
1508 if (cap_caching_mode(iommu->cap))
1509 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1511 iommu_flush_write_buffer(iommu);
1514 static void iommu_flush_iova(struct iova_domain *iovad)
1516 struct dmar_domain *domain;
1519 domain = container_of(iovad, struct dmar_domain, iovad);
1521 for_each_domain_iommu(idx, domain) {
1522 struct intel_iommu *iommu = g_iommus[idx];
1523 u16 did = domain->iommu_did[iommu->seq_id];
1525 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1527 if (!cap_caching_mode(iommu->cap))
1528 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1529 0, MAX_AGAW_PFN_WIDTH);
1533 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1536 unsigned long flags;
1538 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1539 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1540 pmen &= ~DMA_PMEN_EPM;
1541 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1543 /* wait for the protected region status bit to clear */
1544 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1545 readl, !(pmen & DMA_PMEN_PRS), pmen);
1547 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1550 static void iommu_enable_translation(struct intel_iommu *iommu)
1553 unsigned long flags;
1555 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1556 iommu->gcmd |= DMA_GCMD_TE;
1557 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1559 /* Make sure the hardware completes it */
1560 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1561 readl, (sts & DMA_GSTS_TES), sts);
1563 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1566 static void iommu_disable_translation(struct intel_iommu *iommu)
1571 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1572 iommu->gcmd &= ~DMA_GCMD_TE;
1573 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1575 /* Make sure the hardware completes it */
1576 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1577 readl, (!(sts & DMA_GSTS_TES)), sts);
1579 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1583 static int iommu_init_domains(struct intel_iommu *iommu)
1585 u32 ndomains, nlongs;
1588 ndomains = cap_ndoms(iommu->cap);
1589 pr_debug("%s: Number of Domains supported <%d>\n",
1590 iommu->name, ndomains);
1591 nlongs = BITS_TO_LONGS(ndomains);
1593 spin_lock_init(&iommu->lock);
1595 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1596 if (!iommu->domain_ids) {
1597 pr_err("%s: Allocating domain id array failed\n",
1602 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1603 iommu->domains = kzalloc(size, GFP_KERNEL);
1605 if (iommu->domains) {
1606 size = 256 * sizeof(struct dmar_domain *);
1607 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1610 if (!iommu->domains || !iommu->domains[0]) {
1611 pr_err("%s: Allocating domain array failed\n",
1613 kfree(iommu->domain_ids);
1614 kfree(iommu->domains);
1615 iommu->domain_ids = NULL;
1616 iommu->domains = NULL;
1623 * If Caching mode is set, then invalid translations are tagged
1624 * with domain-id 0, hence we need to pre-allocate it. We also
1625 * use domain-id 0 as a marker for non-allocated domain-id, so
1626 * make sure it is not used for a real domain.
1628 set_bit(0, iommu->domain_ids);
1631 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1632 * entry for first-level or pass-through translation modes should
1633 * be programmed with a domain id different from those used for
1634 * second-level or nested translation. We reserve a domain id for
1635 * this purpose.
1637 if (sm_supported(iommu))
1638 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1643 static void disable_dmar_iommu(struct intel_iommu *iommu)
1645 struct device_domain_info *info, *tmp;
1646 unsigned long flags;
1648 if (!iommu->domains || !iommu->domain_ids)
1652 spin_lock_irqsave(&device_domain_lock, flags);
1653 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1654 struct dmar_domain *domain;
1656 if (info->iommu != iommu)
1659 if (!info->dev || !info->domain)
1662 domain = info->domain;
1664 __dmar_remove_one_dev_info(info);
1666 if (!domain_type_is_vm_or_si(domain)) {
1668 * The domain_exit() function can't be called under
1669 * device_domain_lock, as it takes this lock itself.
1670 * So release the lock here and re-run the loop
1673 spin_unlock_irqrestore(&device_domain_lock, flags);
1674 domain_exit(domain);
1678 spin_unlock_irqrestore(&device_domain_lock, flags);
1680 if (iommu->gcmd & DMA_GCMD_TE)
1681 iommu_disable_translation(iommu);
1684 static void free_dmar_iommu(struct intel_iommu *iommu)
1686 if ((iommu->domains) && (iommu->domain_ids)) {
1687 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1690 for (i = 0; i < elems; i++)
1691 kfree(iommu->domains[i]);
1692 kfree(iommu->domains);
1693 kfree(iommu->domain_ids);
1694 iommu->domains = NULL;
1695 iommu->domain_ids = NULL;
1698 g_iommus[iommu->seq_id] = NULL;
1700 /* free context mapping */
1701 free_context_table(iommu);
1703 #ifdef CONFIG_INTEL_IOMMU_SVM
1704 if (pasid_supported(iommu)) {
1705 if (ecap_prs(iommu->ecap))
1706 intel_svm_finish_prq(iommu);
1711 static struct dmar_domain *alloc_domain(int flags)
1713 struct dmar_domain *domain;
1715 domain = alloc_domain_mem();
1719 memset(domain, 0, sizeof(*domain));
1721 domain->flags = flags;
1722 domain->has_iotlb_device = false;
1723 INIT_LIST_HEAD(&domain->devices);
1728 /* Must be called with iommu->lock */
1729 static int domain_attach_iommu(struct dmar_domain *domain,
1730 struct intel_iommu *iommu)
1732 unsigned long ndomains;
1735 assert_spin_locked(&device_domain_lock);
1736 assert_spin_locked(&iommu->lock);
1738 domain->iommu_refcnt[iommu->seq_id] += 1;
1739 domain->iommu_count += 1;
1740 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1741 ndomains = cap_ndoms(iommu->cap);
1742 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1744 if (num >= ndomains) {
1745 pr_err("%s: No free domain ids\n", iommu->name);
1746 domain->iommu_refcnt[iommu->seq_id] -= 1;
1747 domain->iommu_count -= 1;
1751 set_bit(num, iommu->domain_ids);
1752 set_iommu_domain(iommu, num, domain);
1754 domain->iommu_did[iommu->seq_id] = num;
1755 domain->nid = iommu->node;
1757 domain_update_iommu_cap(domain);
1763 static int domain_detach_iommu(struct dmar_domain *domain,
1764 struct intel_iommu *iommu)
1768 assert_spin_locked(&device_domain_lock);
1769 assert_spin_locked(&iommu->lock);
1771 domain->iommu_refcnt[iommu->seq_id] -= 1;
1772 count = --domain->iommu_count;
1773 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1774 num = domain->iommu_did[iommu->seq_id];
1775 clear_bit(num, iommu->domain_ids);
1776 set_iommu_domain(iommu, num, NULL);
1778 domain_update_iommu_cap(domain);
1779 domain->iommu_did[iommu->seq_id] = 0;
1785 static struct iova_domain reserved_iova_list;
1786 static struct lock_class_key reserved_rbtree_key;
1788 static int dmar_init_reserved_ranges(void)
1790 struct pci_dev *pdev = NULL;
1794 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1797 &reserved_rbtree_key);
1799 /* IOAPIC ranges shouldn't be accessed by DMA */
1800 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1801 IOVA_PFN(IOAPIC_RANGE_END));
1803 pr_err("Reserve IOAPIC range failed\n");
1807 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1808 for_each_pci_dev(pdev) {
1811 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1812 r = &pdev->resource[i];
1813 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815 iova = reserve_iova(&reserved_iova_list,
1819 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1827 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1832 static inline int guestwidth_to_adjustwidth(int gaw)
1835 int r = (gaw - 12) % 9;
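/*
 * Editor's note: the (elided) remainder of this helper rounds the guest
 * address width up to a width the page-table walk can express, i.e.
 * 12 bits of page offset plus a multiple of the 9-bit stride. For
 * example: gaw == 48 gives r == 0 and the width stays 48, while
 * gaw == 50 gives r == 2 and rounds up to 50 + 9 - 2 == 57.
 */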
1846 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1849 int adjust_width, agaw;
1850 unsigned long sagaw;
1853 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855 err = init_iova_flush_queue(&domain->iovad,
1856 iommu_flush_iova, iova_entry_free);
1860 domain_reserve_special_ranges(domain);
1862 /* calculate AGAW */
1863 if (guest_width > cap_mgaw(iommu->cap))
1864 guest_width = cap_mgaw(iommu->cap);
1865 domain->gaw = guest_width;
1866 adjust_width = guestwidth_to_adjustwidth(guest_width);
1867 agaw = width_to_agaw(adjust_width);
1868 sagaw = cap_sagaw(iommu->cap);
1869 if (!test_bit(agaw, &sagaw)) {
1870 /* hardware doesn't support it, choose a bigger one */
1871 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1872 agaw = find_next_bit(&sagaw, 5, agaw);
1876 domain->agaw = agaw;
1878 if (ecap_coherent(iommu->ecap))
1879 domain->iommu_coherency = 1;
1881 domain->iommu_coherency = 0;
1883 if (ecap_sc_support(iommu->ecap))
1884 domain->iommu_snooping = 1;
1886 domain->iommu_snooping = 0;
1888 if (intel_iommu_superpage)
1889 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891 domain->iommu_superpage = 0;
1893 domain->nid = iommu->node;
1895 /* always allocate the top pgd */
1896 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1899 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1903 static void domain_exit(struct dmar_domain *domain)
1905 struct page *freelist;
1907 /* Domain 0 is reserved, so don't process it */
1911 /* Remove associated devices and clear attached or cached domains */
1913 domain_remove_dev_info(domain);
1917 put_iova_domain(&domain->iovad);
1919 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1921 dma_free_pagelist(freelist);
1923 free_domain_mem(domain);
1927 * Get the PASID directory size for a scalable mode context entry.
1928 * A value of X in the PDTS field of a scalable mode context entry
1929 * indicates a PASID directory with 2^(X + 7) entries.
1931 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1935 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1936 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
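/*
 * Editor's worked example (illustrative): a PDTS field value of X
 * encodes a PASID directory of 2^(X + 7) entries, so X == 0 covers 128
 * directory entries and X == 7 covers 16384. The context_pdts() macro
 * below packs this value into bits 11:9 of the context entry's low
 * qword.
 */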
1944 * Set the RID_PASID field of a scalable mode context entry. The
1945 * IOMMU hardware will use the PASID value set in this field for
1946 * DMA translations of DMA requests without PASID.
1949 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1951 context->hi |= pasid & ((1 << 20) - 1);
1952 context->hi |= (1 << 20);
1956 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1959 static inline void context_set_sm_dte(struct context_entry *context)
1961 context->lo |= (1 << 2);
1965 * Set the PRE(Page Request Enable) field of a scalable mode context
1968 static inline void context_set_sm_pre(struct context_entry *context)
1970 context->lo |= (1 << 4);
1973 /* Convert value to context PASID directory size field coding. */
1974 #define context_pdts(pds) (((pds) & 0x7) << 9)
1976 static int domain_context_mapping_one(struct dmar_domain *domain,
1977 struct intel_iommu *iommu,
1978 struct pasid_table *table,
1981 u16 did = domain->iommu_did[iommu->seq_id];
1982 int translation = CONTEXT_TT_MULTI_LEVEL;
1983 struct device_domain_info *info = NULL;
1984 struct context_entry *context;
1985 unsigned long flags;
1990 if (hw_pass_through && domain_type_is_si(domain))
1991 translation = CONTEXT_TT_PASS_THROUGH;
1993 pr_debug("Set context mapping for %02x:%02x.%d\n",
1994 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1996 BUG_ON(!domain->pgd);
1998 spin_lock_irqsave(&device_domain_lock, flags);
1999 spin_lock(&iommu->lock);
2002 context = iommu_context_addr(iommu, bus, devfn, 1);
2007 if (context_present(context))
2011 * For kdump cases, old valid entries may be cached due to the
2012 * in-flight DMA and copied pgtable, but there is no unmapping
2013 * behaviour for them, thus we need an explicit cache flush for
2014 * the newly-mapped device. For kdump, at this point, the device
2015 * is supposed to finish reset at its driver probe stage, so no
2016 * in-flight DMA will exist, and we don't need to worry anymore
2017 * hereafter.
2019 if (context_copied(context)) {
2020 u16 did_old = context_domain_id(context);
2022 if (did_old < cap_ndoms(iommu->cap)) {
2023 iommu->flush.flush_context(iommu, did_old,
2024 (((u16)bus) << 8) | devfn,
2025 DMA_CCMD_MASK_NOBIT,
2026 DMA_CCMD_DEVICE_INVL);
2027 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2032 context_clear_entry(context);
2034 if (sm_supported(iommu)) {
2039 /* Setup the PASID DIR pointer: */
2040 pds = context_get_sm_pds(table);
2041 context->lo = (u64)virt_to_phys(table->table) |
2044 /* Setup the RID_PASID field: */
2045 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2048 * Setup the Device-TLB enable bit and Page request
2049 * enable bit:
2051 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2052 if (info && info->ats_supported)
2053 context_set_sm_dte(context);
2054 if (info && info->pri_supported)
2055 context_set_sm_pre(context);
2057 struct dma_pte *pgd = domain->pgd;
2060 context_set_domain_id(context, did);
2061 context_set_translation_type(context, translation);
2063 if (translation != CONTEXT_TT_PASS_THROUGH) {
2065 * Skip top levels of the page tables for an iommu which has
2066 * a smaller agaw than the default. Unnecessary for PT mode.
2068 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2070 pgd = phys_to_virt(dma_pte_addr(pgd));
2071 if (!dma_pte_present(pgd))
2075 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2076 if (info && info->ats_supported)
2077 translation = CONTEXT_TT_DEV_IOTLB;
2079 translation = CONTEXT_TT_MULTI_LEVEL;
2081 context_set_address_root(context, virt_to_phys(pgd));
2082 context_set_address_width(context, agaw);
2085 * In pass through mode, AW must be programmed to
2086 * indicate the largest AGAW value supported by
2087 * hardware. And ASR is ignored by hardware.
2089 context_set_address_width(context, iommu->msagaw);
2093 context_set_fault_enable(context);
2094 context_set_present(context);
2095 domain_flush_cache(domain, context, sizeof(*context));
2098 * It's a non-present to present mapping. If the hardware doesn't cache
2099 * non-present entries we only need to flush the write-buffer. If it
2100 * _does_ cache non-present entries, then it does so in the special
2101 * domain #0, which we have to flush:
2103 if (cap_caching_mode(iommu->cap)) {
2104 iommu->flush.flush_context(iommu, 0,
2105 (((u16)bus) << 8) | devfn,
2106 DMA_CCMD_MASK_NOBIT,
2107 DMA_CCMD_DEVICE_INVL);
2108 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2110 iommu_flush_write_buffer(iommu);
2112 iommu_enable_dev_iotlb(info);
2117 spin_unlock(&iommu->lock);
2118 spin_unlock_irqrestore(&device_domain_lock, flags);
2123 struct domain_context_mapping_data {
2124 struct dmar_domain *domain;
2125 struct intel_iommu *iommu;
2126 struct pasid_table *table;
2129 static int domain_context_mapping_cb(struct pci_dev *pdev,
2130 u16 alias, void *opaque)
2132 struct domain_context_mapping_data *data = opaque;
2134 return domain_context_mapping_one(data->domain, data->iommu,
2135 data->table, PCI_BUS_NUM(alias),
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2142 struct domain_context_mapping_data data;
2143 struct pasid_table *table;
2144 struct intel_iommu *iommu;
2147 iommu = device_to_iommu(dev, &bus, &devfn);
2151 table = intel_pasid_get_table(dev);
2153 if (!dev_is_pci(dev))
2154 return domain_context_mapping_one(domain, iommu, table,
2157 data.domain = domain;
2161 return pci_for_each_dma_alias(to_pci_dev(dev),
2162 &domain_context_mapping_cb, &data);
2165 static int domain_context_mapped_cb(struct pci_dev *pdev,
2166 u16 alias, void *opaque)
2168 struct intel_iommu *iommu = opaque;
2170 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2173 static int domain_context_mapped(struct device *dev)
2175 struct intel_iommu *iommu;
2178 iommu = device_to_iommu(dev, &bus, &devfn);
2182 if (!dev_is_pci(dev))
2183 return device_context_mapped(iommu, bus, devfn);
2185 return !pci_for_each_dma_alias(to_pci_dev(dev),
2186 domain_context_mapped_cb, iommu);
2189 /* Returns the number of VT-d pages, but aligned to the MM page size */
2190 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2193 host_addr &= ~PAGE_MASK;
2194 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
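/*
 * Editor's worked example (illustrative, assuming 4KiB pages): for
 * host_addr == 0x1800 and size == 0x1000, only the in-page offset 0x800
 * is kept, so PAGE_ALIGN(0x800 + 0x1000) == 0x2000 and the mapping
 * needs two VT-d pages even though 'size' itself is one page.
 */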
2197 /* Return largest possible superpage level for a given mapping */
2198 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2199 unsigned long iov_pfn,
2200 unsigned long phy_pfn,
2201 unsigned long pages)
2203 int support, level = 1;
2204 unsigned long pfnmerge;
2206 support = domain->iommu_superpage;
2208 /* To use a large page, the virtual *and* physical addresses
2209 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2210 of them will mean we have to use smaller pages. So just
2211 merge them and check both at once. */
2212 pfnmerge = iov_pfn | phy_pfn;
2214 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2215 pages >>= VTD_STRIDE_SHIFT;
2218 pfnmerge >>= VTD_STRIDE_SHIFT;
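/*
 * Editor's worked example (illustrative): the loop tests the low nine
 * bits of iov_pfn | phy_pfn via (pfnmerge & ~VTD_STRIDE_MASK). For
 * iov_pfn == 0x200 and phy_pfn == 0x400, pfnmerge == 0x600 has its low
 * nine bits clear, so both addresses are 2MiB-aligned; if 'pages'
 * covers a full stride (512 pages) and the hardware supports it, the
 * (elided) tail of the loop bumps 'level' to 2.
 */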
2225 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2226 struct scatterlist *sg, unsigned long phys_pfn,
2227 unsigned long nr_pages, int prot)
2229 struct dma_pte *first_pte = NULL, *pte = NULL;
2230 phys_addr_t uninitialized_var(pteval);
2231 unsigned long sg_res = 0;
2232 unsigned int largepage_lvl = 0;
2233 unsigned long lvl_pages = 0;
2235 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2237 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2240 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2247 while (nr_pages > 0) {
2251 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2253 sg_res = aligned_nrpages(sg->offset, sg->length);
2254 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2255 sg->dma_length = sg->length;
2256 pteval = (sg_phys(sg) - pgoff) | prot;
2257 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2263 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2266 /* It is a large page */
2267 if (largepage_lvl > 1) {
2268 unsigned long nr_superpages, end_pfn;
2270 pteval |= DMA_PTE_LARGE_PAGE;
2271 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2273 nr_superpages = sg_res / lvl_pages;
2274 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2277 * Ensure that old small page tables are
2278 * removed to make room for superpage(s).
2279 * We're adding new large pages, so make sure
2280 * we don't remove their parent tables.
2282 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2285 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289 /* We don't need lock here, nobody else
2290 * touches the iova range
2292 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2294 static int dumps = 5;
2295 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2296 iov_pfn, tmp, (unsigned long long)pteval);
2299 debug_dma_dump_mappings(NULL);
2304 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2306 BUG_ON(nr_pages < lvl_pages);
2307 BUG_ON(sg_res < lvl_pages);
2309 nr_pages -= lvl_pages;
2310 iov_pfn += lvl_pages;
2311 phys_pfn += lvl_pages;
2312 pteval += lvl_pages * VTD_PAGE_SIZE;
2313 sg_res -= lvl_pages;
2315 /* If the next PTE would be the first in a new page, then we
2316 need to flush the cache on the entries we've just written.
2317 And then we'll need to recalculate 'pte', so clear it and
2318 let it get set again in the if (!pte) block above.
2320 If we're done (!nr_pages) we need to flush the cache too.
2322 Also if we've been setting superpages, we may need to
2323 recalculate 'pte' and switch back to smaller pages for the
2324 end of the mapping, if the trailing size is not enough to
2325 use another superpage (i.e. sg_res < lvl_pages). */
2327 if (!nr_pages || first_pte_in_page(pte) ||
2328 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2329 domain_flush_cache(domain, first_pte,
2330 (void *)pte - (void *)first_pte);
2334 if (!sg_res && nr_pages)
2340 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2341 struct scatterlist *sg, unsigned long phys_pfn,
2342 unsigned long nr_pages, int prot)
2345 struct intel_iommu *iommu;
2347 /* Do the real mapping first */
2348 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2352 /* Notify about the new mapping */
2353 if (domain_type_is_vm(domain)) {
2354 /* VM-typed domains can have more than one IOMMU */
2356 for_each_domain_iommu(iommu_id, domain) {
2357 iommu = g_iommus[iommu_id];
2358 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361 /* General domains only have one IOMMU */
2362 iommu = domain_get_iommu(domain);
2363 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2369 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2370 struct scatterlist *sg, unsigned long nr_pages,
2373 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2376 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2377 unsigned long phys_pfn, unsigned long nr_pages,
2380 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
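/*
 * Usage sketch (illustrative only): both wrappers above feed the same
 * __domain_mapping() path. A physically contiguous range goes through
 * domain_pfn_mapping() with sg == NULL, while scatter-gather DMA goes
 * through domain_sg_mapping() with phys_pfn == 0, e.g.:
 *
 *	domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
 *			   mm_to_dma_pfn(paddr_pfn), nr_pages,
 *			   DMA_PTE_READ | DMA_PTE_WRITE);
 *
 *	domain_sg_mapping(domain, start_vpfn, sglist, nr_pages, prot);
 */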
2383 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2385 unsigned long flags;
2386 struct context_entry *context;
2392 spin_lock_irqsave(&iommu->lock, flags);
2393 context = iommu_context_addr(iommu, bus, devfn, 0);
2395 spin_unlock_irqrestore(&iommu->lock, flags);
2398 did_old = context_domain_id(context);
2399 context_clear_entry(context);
2400 __iommu_flush_cache(iommu, context, sizeof(*context));
2401 spin_unlock_irqrestore(&iommu->lock, flags);
2402 iommu->flush.flush_context(iommu,
2404 (((u16)bus) << 8) | devfn,
2405 DMA_CCMD_MASK_NOBIT,
2406 DMA_CCMD_DEVICE_INVL);
2407 iommu->flush.flush_iotlb(iommu,
2414 static inline void unlink_domain_info(struct device_domain_info *info)
2416 assert_spin_locked(&device_domain_lock);
2417 list_del(&info->link);
2418 list_del(&info->global);
2420 info->dev->archdata.iommu = NULL;
2423 static void domain_remove_dev_info(struct dmar_domain *domain)
2425 struct device_domain_info *info, *tmp;
2426 unsigned long flags;
2428 spin_lock_irqsave(&device_domain_lock, flags);
2429 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2430 __dmar_remove_one_dev_info(info);
2431 spin_unlock_irqrestore(&device_domain_lock, flags);
2436 * Note: we use struct device->archdata.iommu to store the info
2438 static struct dmar_domain *find_domain(struct device *dev)
2440 struct device_domain_info *info;
2442 /* No lock here; we assume no domain exits in the normal case */
2443 info = dev->archdata.iommu;
2445 return info->domain;
2449 static inline struct device_domain_info *
2450 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2452 struct device_domain_info *info;
2454 list_for_each_entry(info, &device_domain_list, global)
2455 if (info->iommu->segment == segment && info->bus == bus &&
2456 info->devfn == devfn)
2462 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2465 struct dmar_domain *domain)
2467 struct dmar_domain *found = NULL;
2468 struct device_domain_info *info;
2469 unsigned long flags;
2472 info = alloc_devinfo_mem();
2477 info->devfn = devfn;
2478 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2479 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2482 info->domain = domain;
2483 info->iommu = iommu;
2484 info->pasid_table = NULL;
2486 if (dev && dev_is_pci(dev)) {
2487 struct pci_dev *pdev = to_pci_dev(info->dev);
2489 if (!pci_ats_disabled() &&
2490 ecap_dev_iotlb_support(iommu->ecap) &&
2491 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2492 dmar_find_matched_atsr_unit(pdev))
2493 info->ats_supported = 1;
2495 if (sm_supported(iommu)) {
2496 if (pasid_supported(iommu)) {
2497 int features = pci_pasid_features(pdev);
2499 info->pasid_supported = features | 1;
2502 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2503 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2504 info->pri_supported = 1;
2508 spin_lock_irqsave(&device_domain_lock, flags);
2510 found = find_domain(dev);
2513 struct device_domain_info *info2;
2514 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2516 found = info2->domain;
2522 spin_unlock_irqrestore(&device_domain_lock, flags);
2523 free_devinfo_mem(info);
2524 /* Caller must free the original domain */
2528 spin_lock(&iommu->lock);
2529 ret = domain_attach_iommu(domain, iommu);
2530 spin_unlock(&iommu->lock);
2533 spin_unlock_irqrestore(&device_domain_lock, flags);
2534 free_devinfo_mem(info);
2538 list_add(&info->link, &domain->devices);
2539 list_add(&info->global, &device_domain_list);
2541 dev->archdata.iommu = info;
2542 spin_unlock_irqrestore(&device_domain_lock, flags);
2544 /* PASID table is mandatory for a PCI device in scalable mode. */
2545 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2546 ret = intel_pasid_alloc_table(dev);
2548 dev_err(dev, "PASID table allocation failed\n");
2549 dmar_remove_one_dev_info(domain, dev);
2553 /* Set up the PASID entry for requests without PASID: */
2554 spin_lock(&iommu->lock);
2555 if (hw_pass_through && domain_type_is_si(domain))
2556 ret = intel_pasid_setup_pass_through(iommu, domain,
2557 dev, PASID_RID2PASID);
2559 ret = intel_pasid_setup_second_level(iommu, domain,
2560 dev, PASID_RID2PASID);
2561 spin_unlock(&iommu->lock);
2563 dev_err(dev, "Setup RID2PASID failed\n");
2564 dmar_remove_one_dev_info(domain, dev);
2569 if (dev && domain_context_mapping(domain, dev)) {
2570 dev_err(dev, "Domain context map failed\n");
2571 dmar_remove_one_dev_info(domain, dev);
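/*
 * Editorial note on the setup order above: for a PCI device behind a
 * scalable-mode IOMMU, dmar_insert_one_dev_info() first allocates the
 * PASID table, then installs the RID2PASID entry (PASID 0, used for
 * requests that carry no PASID), and only then writes the context
 * entry; any failure unwinds through dmar_remove_one_dev_info().
 */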
2578 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2580 *(u16 *)opaque = alias;
2584 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2586 struct device_domain_info *info;
2587 struct dmar_domain *domain = NULL;
2588 struct intel_iommu *iommu;
2590 unsigned long flags;
2593 iommu = device_to_iommu(dev, &bus, &devfn);
2597 if (dev_is_pci(dev)) {
2598 struct pci_dev *pdev = to_pci_dev(dev);
2600 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2602 spin_lock_irqsave(&device_domain_lock, flags);
2603 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2604 PCI_BUS_NUM(dma_alias),
2607 iommu = info->iommu;
2608 domain = info->domain;
2610 spin_unlock_irqrestore(&device_domain_lock, flags);
2612 /* DMA alias already has a domain, use it */
2617 /* Allocate and initialize new domain for the device */
2618 domain = alloc_domain(0);
2621 if (domain_init(domain, iommu, gaw)) {
2622 domain_exit(domain);
2631 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2632 struct dmar_domain *domain)
2634 struct intel_iommu *iommu;
2635 struct dmar_domain *tmp;
2636 u16 req_id, dma_alias;
2639 iommu = device_to_iommu(dev, &bus, &devfn);
2643 req_id = ((u16)bus << 8) | devfn;
2645 if (dev_is_pci(dev)) {
2646 struct pci_dev *pdev = to_pci_dev(dev);
2648 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650 /* register PCI DMA alias device */
2651 if (req_id != dma_alias) {
2652 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2653 dma_alias & 0xff, NULL, domain);
2655 if (!tmp || tmp != domain)
2660 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2661 if (!tmp || tmp != domain)
2667 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2669 struct dmar_domain *domain, *tmp;
2671 domain = find_domain(dev);
2675 domain = find_or_alloc_domain(dev, gaw);
2679 tmp = set_domain_for_dev(dev, domain);
2680 if (!tmp || domain != tmp) {
2681 domain_exit(domain);
2690 static int iommu_domain_identity_map(struct dmar_domain *domain,
2691 unsigned long long start,
2692 unsigned long long end)
2694 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2695 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2697 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2698 dma_to_mm_pfn(last_vpfn))) {
2699 pr_err("Reserving iova failed\n");
2703 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2705 * RMRR range might have overlap with physical memory range,
2708 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2710 return __domain_mapping(domain, first_vpfn, NULL,
2711 first_vpfn, last_vpfn - first_vpfn + 1,
2712 DMA_PTE_READ|DMA_PTE_WRITE);
2715 static int domain_prepare_identity_map(struct device *dev,
2716 struct dmar_domain *domain,
2717 unsigned long long start,
2718 unsigned long long end)
2720 /* For _hardware_ passthrough, don't bother. But for software
2721 passthrough, we do it anyway -- it may indicate a memory
2722 range which is reserved in E820 and so didn't get set
2723 up to start with in si_domain */
2724 if (domain == si_domain && hw_pass_through) {
2725 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2730 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2733 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2734 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2735 dmi_get_system_info(DMI_BIOS_VENDOR),
2736 dmi_get_system_info(DMI_BIOS_VERSION),
2737 dmi_get_system_info(DMI_PRODUCT_VERSION));
2741 if (end >> agaw_to_width(domain->agaw)) {
2742 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2743 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2744 agaw_to_width(domain->agaw),
2745 dmi_get_system_info(DMI_BIOS_VENDOR),
2746 dmi_get_system_info(DMI_BIOS_VERSION),
2747 dmi_get_system_info(DMI_PRODUCT_VERSION));
2751 return iommu_domain_identity_map(domain, start, end);
2754 static int iommu_prepare_identity_map(struct device *dev,
2755 unsigned long long start,
2756 unsigned long long end)
2758 struct dmar_domain *domain;
2761 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2765 ret = domain_prepare_identity_map(dev, domain, start, end);
2767 domain_exit(domain);
2772 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2775 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2777 return iommu_prepare_identity_map(dev, rmrr->base_address,
2781 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2782 static inline void iommu_prepare_isa(void)
2784 struct pci_dev *pdev;
2787 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2791 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2792 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2795 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2800 static inline void iommu_prepare_isa(void)
2804 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2806 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2808 static int __init si_domain_init(int hw)
2812 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2816 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2817 domain_exit(si_domain);
2821 pr_debug("Identity mapping domain allocated\n");
2826 for_each_online_node(nid) {
2827 unsigned long start_pfn, end_pfn;
2830 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2831 ret = iommu_domain_identity_map(si_domain,
2832 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
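/*
 * Example (illustrative): for a node whose only usable range is
 * [start_pfn, end_pfn) = [0x1, 0x80000), the loop above identity-maps
 * PFN_PHYS(0x1) = 0x1000 up to PFN_PHYS(0x80000) = 2GiB into
 * si_domain, so 1:1 DMA keeps working for every page of RAM.
 */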
2841 static int identity_mapping(struct device *dev)
2843 struct device_domain_info *info;
2845 if (likely(!iommu_identity_mapping))
2848 info = dev->archdata.iommu;
2849 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2850 return (info->domain == si_domain);
2855 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2857 struct dmar_domain *ndomain;
2858 struct intel_iommu *iommu;
2861 iommu = device_to_iommu(dev, &bus, &devfn);
2865 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2866 if (ndomain != domain)
2872 static bool device_has_rmrr(struct device *dev)
2874 struct dmar_rmrr_unit *rmrr;
2879 for_each_rmrr_units(rmrr) {
2881 * Return TRUE if this RMRR contains the device that we are checking
2884 for_each_active_dev_scope(rmrr->devices,
2885 rmrr->devices_cnt, i, tmp)
2896 * There are a couple cases where we need to restrict the functionality of
2897 * devices associated with RMRRs. The first is when evaluating a device for
2898 * identity mapping because problems exist when devices are moved in and out
2899 * of domains and their respective RMRR information is lost. This means that
2900 * a device with associated RMRRs will never be in a "passthrough" domain.
2901 * The second is use of the device through the IOMMU API. This interface
2902 * expects to have full control of the IOVA space for the device. We cannot
2903 * satisfy both the requirement that RMRR access is maintained and have an
2904 * unencumbered IOVA space. We also have no ability to quiesce the device's
2905 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2906 * We therefore prevent devices associated with an RMRR from participating in
2907 * the IOMMU API, which eliminates them from device assignment.
2909 * In both cases we assume that PCI USB devices with RMRRs have them largely
2910 * for historical reasons and that the RMRR space is not actively used post
2911 * boot. This exclusion may change if vendors begin to abuse it.
2913 * The same exception is made for graphics devices, with the requirement that
2914 * any use of the RMRR regions will be torn down before assigning the device
2917 static bool device_is_rmrr_locked(struct device *dev)
2919 if (!device_has_rmrr(dev))
2922 if (dev_is_pci(dev)) {
2923 struct pci_dev *pdev = to_pci_dev(dev);
2925 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2932 static int iommu_should_identity_map(struct device *dev, int startup)
2934 if (dev_is_pci(dev)) {
2935 struct pci_dev *pdev = to_pci_dev(dev);
2937 if (device_is_rmrr_locked(dev))
2941 * Prevent any device marked as untrusted from getting
2942 * placed into the statically identity mapping domain.
2944 if (pdev->untrusted)
2947 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2950 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2953 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2957 * We want to start off with all devices in the 1:1 domain, and
2958 * take them out later if we find they can't access all of memory.
2960 * However, we can't do this for PCI devices behind bridges,
2961 * because all PCI devices behind the same bridge will end up
2962 * with the same source-id on their transactions.
2964 * Practically speaking, we can't change things around for these
2965 * devices at run-time, because we can't be sure there'll be no
2966 * DMA transactions in flight for any of their siblings.
2968 * So PCI devices (unless they're on the root bus) as well as
2969 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2970 * the 1:1 domain, just in _case_ one of their siblings turns out
2971 * not to be able to map all of memory.
2973 if (!pci_is_pcie(pdev)) {
2974 if (!pci_is_root_bus(pdev->bus))
2976 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2978 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2981 if (device_has_rmrr(dev))
2986 * At boot time, we don't yet know if devices will be 64-bit capable.
2987 * Assume that they will -- if they turn out not to be, then we can
2988 * take them out of the 1:1 domain later.
2992 * If the device's dma_mask is less than the system's memory
2993 * size then this is not a candidate for identity mapping.
2995 u64 dma_mask = *dev->dma_mask;
2997 if (dev->coherent_dma_mask &&
2998 dev->coherent_dma_mask < dma_mask)
2999 dma_mask = dev->coherent_dma_mask;
3001 return dma_mask >= dma_get_required_mask(dev);
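/*
 * Worked example (editorial): a device with a 32-bit dma_mask on a
 * host with 8GiB of RAM sees dma_get_required_mask() of roughly
 * DMA_BIT_MASK(33), so the comparison fails and the device stays out
 * of the 1:1 domain; it will use the translated path instead.
 */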
3007 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3011 if (!iommu_should_identity_map(dev, 1))
3014 ret = domain_add_dev_info(si_domain, dev);
3016 dev_info(dev, "%s identity mapping\n",
3017 hw ? "Hardware" : "Software");
3018 else if (ret == -ENODEV)
3019 /* device not associated with an iommu */
3026 static int __init iommu_prepare_static_identity_mapping(int hw)
3028 struct pci_dev *pdev = NULL;
3029 struct dmar_drhd_unit *drhd;
3030 struct intel_iommu *iommu;
3035 for_each_pci_dev(pdev) {
3036 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3041 for_each_active_iommu(iommu, drhd)
3042 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3043 struct acpi_device_physical_node *pn;
3044 struct acpi_device *adev;
3046 if (dev->bus != &acpi_bus_type)
3049 adev = to_acpi_device(dev);
3050 mutex_lock(&adev->physical_node_lock);
3051 list_for_each_entry(pn, &adev->physical_node_list, node) {
3052 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3056 mutex_unlock(&adev->physical_node_lock);
3064 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3067 * Start from a sane iommu hardware state.
3068 * If queued invalidation was already initialized by us
3069 * (for example, while enabling interrupt remapping) then
3070 * things are already rolling from a sane state.
3074 * Clear any previous faults.
3076 dmar_fault(-1, iommu);
3078 * Disable queued invalidation if supported and already enabled
3079 * before OS handover.
3081 dmar_disable_qi(iommu);
3084 if (dmar_enable_qi(iommu)) {
3086 * Queued invalidation is not enabled; use register-based invalidation
3088 iommu->flush.flush_context = __iommu_flush_context;
3089 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3090 pr_info("%s: Using Register based invalidation\n",
3093 iommu->flush.flush_context = qi_flush_context;
3094 iommu->flush.flush_iotlb = qi_flush_iotlb;
3095 pr_info("%s: Using Queued invalidation\n", iommu->name);
3099 static int copy_context_table(struct intel_iommu *iommu,
3100 struct root_entry *old_re,
3101 struct context_entry **tbl,
3104 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3105 struct context_entry *new_ce = NULL, ce;
3106 struct context_entry *old_ce = NULL;
3107 struct root_entry re;
3108 phys_addr_t old_ce_phys;
3110 tbl_idx = ext ? bus * 2 : bus;
3111 memcpy(&re, old_re, sizeof(re));
3113 for (devfn = 0; devfn < 256; devfn++) {
3114 /* First calculate the correct index */
3115 idx = (ext ? devfn * 2 : devfn) % 256;
3118 /* First save what we may have and clean up */
3120 tbl[tbl_idx] = new_ce;
3121 __iommu_flush_cache(iommu, new_ce,
3131 old_ce_phys = root_entry_lctp(&re);
3133 old_ce_phys = root_entry_uctp(&re);
3136 if (ext && devfn == 0) {
3137 /* No LCTP, try UCTP */
3146 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3151 new_ce = alloc_pgtable_page(iommu->node);
3158 /* Now copy the context entry */
3159 memcpy(&ce, old_ce + idx, sizeof(ce));
3161 if (!__context_present(&ce))
3164 did = context_domain_id(&ce);
3165 if (did >= 0 && did < cap_ndoms(iommu->cap))
3166 set_bit(did, iommu->domain_ids);
3169 * We need a marker for copied context entries. This
3170 * marker needs to work for the old format as well as
3171 * for extended context entries.
3173 * Bit 67 of the context entry is used. In the old
3174 * format this bit is available to software, in the
3175 * extended format it is the PGE bit, but PGE is ignored
3176 * by HW if PASIDs are disabled (and thus still available).
3179 * So disable PASIDs first and then mark the entry
3180 * copied. This means that we don't copy PASID
3181 * translations from the old kernel, but this is fine as
3182 * faults there are not fatal.
3184 context_clear_pasid_enable(&ce);
3185 context_set_copied(&ce);
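/*
 * Illustrative sketch (assumed helper layout, not copied from the
 * header): with the context entry stored as two u64 words, bit 67 of
 * the entry lives in bit (67 - 64) == 3 of the high word, so the
 * marker helper plausibly reduces to:
 *
 *	static inline void context_set_copied(struct context_entry *ce)
 *	{
 *		ce->hi |= 1ULL << 3;
 *	}
 */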
3190 tbl[tbl_idx + pos] = new_ce;
3192 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3201 static int copy_translation_tables(struct intel_iommu *iommu)
3203 struct context_entry **ctxt_tbls;
3204 struct root_entry *old_rt;
3205 phys_addr_t old_rt_phys;
3206 int ctxt_table_entries;
3207 unsigned long flags;
3212 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3213 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3214 new_ext = !!ecap_ecs(iommu->ecap);
3217 * The RTT bit can only be changed when translation is disabled,
3218 * but disabling translation would open a window for data
3219 * corruption. So bail out and don't copy anything if we would
3220 * have to change the bit.
3225 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3229 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3233 /* This is too big for the stack - allocate it from slab */
3234 ctxt_table_entries = ext ? 512 : 256;
3236 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3240 for (bus = 0; bus < 256; bus++) {
3241 ret = copy_context_table(iommu, &old_rt[bus],
3242 ctxt_tbls, bus, ext);
3244 pr_err("%s: Failed to copy context table for bus %d\n",
3250 spin_lock_irqsave(&iommu->lock, flags);
3252 /* Context tables are copied, now write them to the root_entry table */
3253 for (bus = 0; bus < 256; bus++) {
3254 int idx = ext ? bus * 2 : bus;
3257 if (ctxt_tbls[idx]) {
3258 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3259 iommu->root_entry[bus].lo = val;
3262 if (!ext || !ctxt_tbls[idx + 1])
3265 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3266 iommu->root_entry[bus].hi = val;
3269 spin_unlock_irqrestore(&iommu->lock, flags);
3273 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
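/*
 * Example (editorial): with an extended root table (ext == true), bus
 * 0x42 uses idx = 0x42 * 2 = 0x84 for devfn 0-127 (root_entry.lo) and
 * idx + 1 = 0x85 for devfn 128-255 (root_entry.hi); bit 0 of each
 * written value is the present bit, hence the "| 1" above.
 */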
3283 static int __init init_dmars(void)
3285 struct dmar_drhd_unit *drhd;
3286 struct dmar_rmrr_unit *rmrr;
3287 bool copied_tables = false;
3289 struct intel_iommu *iommu;
3295 * initialize and program root entry to not present
3298 for_each_drhd_unit(drhd) {
3300 * lock not needed as this is only incremented in the single-
3301 * threaded kernel __init code path; all other accesses are read-only
3304 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3308 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3311 /* Preallocate enough resources for IOMMU hot-addition */
3312 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3313 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3315 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3318 pr_err("Allocating global iommu array failed\n");
3323 for_each_active_iommu(iommu, drhd) {
3325 * Find the max pasid size of all IOMMUs in the system.
3326 * We need to ensure the system pasid table is no bigger
3327 * than the smallest supported.
3329 if (pasid_supported(iommu)) {
3330 u32 temp = 2 << ecap_pss(iommu->ecap);
3332 intel_pasid_max_id = min_t(u32, temp,
3333 intel_pasid_max_id);
3336 g_iommus[iommu->seq_id] = iommu;
3338 intel_iommu_init_qi(iommu);
3340 ret = iommu_init_domains(iommu);
3344 init_translation_status(iommu);
3346 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3347 iommu_disable_translation(iommu);
3348 clear_translation_pre_enabled(iommu);
3349 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3355 * we could share the same root & context tables
3356 * among all IOMMUs; this needs to be split out later.
3358 ret = iommu_alloc_root_entry(iommu);
3362 if (translation_pre_enabled(iommu)) {
3363 pr_info("Translation already enabled - trying to copy translation structures\n");
3365 ret = copy_translation_tables(iommu);
3368 * We found the IOMMU with translation
3369 * enabled - but failed to copy over the
3370 * old root-entry table. Try to proceed
3371 * by disabling translation now and
3372 * allocating a clean root-entry table.
3373 * This might cause DMAR faults, but
3374 * probably the dump will still succeed.
3376 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3378 iommu_disable_translation(iommu);
3379 clear_translation_pre_enabled(iommu);
3381 pr_info("Copied translation tables from previous kernel for %s\n",
3383 copied_tables = true;
3387 if (!ecap_pass_through(iommu->ecap))
3388 hw_pass_through = 0;
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390 if (pasid_supported(iommu))
3391 intel_svm_init(iommu);
3396 * Now that qi is enabled on all iommus, set the root entry and flush
3397 * caches. This is required on some Intel X58 chipsets; otherwise the
3398 * flush_context function will loop forever and the boot hangs.
3400 for_each_active_iommu(iommu, drhd) {
3401 iommu_flush_write_buffer(iommu);
3402 iommu_set_root_entry(iommu);
3403 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3404 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3407 if (iommu_pass_through)
3408 iommu_identity_mapping |= IDENTMAP_ALL;
3410 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3411 iommu_identity_mapping |= IDENTMAP_GFX;
3414 check_tylersburg_isoch();
3416 if (iommu_identity_mapping) {
3417 ret = si_domain_init(hw_pass_through);
3424 * If we copied translations from a previous kernel in the kdump
3425 * case, we cannot assign the devices to domains now, as that
3426 * would eliminate the old mappings. So skip this part and defer
3427 * the assignment to device driver initialization time.
3433 * If pass-through is not set or not enabled, set up context entries for
3434 * identity mappings for rmrr, gfx, and isa, and possibly fall back to static
3435 * identity mapping if iommu_identity_mapping is set.
3437 if (iommu_identity_mapping) {
3438 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3440 pr_crit("Failed to setup IOMMU pass-through\n");
3446 * for each dev attached to rmrr
3448 * locate drhd for dev, alloc domain for dev
3449 * allocate free domain
3450 * allocate page table entries for rmrr
3451 * if context not allocated for bus
3452 * allocate and init context
3453 * set present in root table for this bus
3454 * init context with domain, translation etc
3458 pr_info("Setting RMRR:\n");
3459 for_each_rmrr_units(rmrr) {
3460 /* some BIOSes list non-existent devices in the DMAR table. */
3461 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3463 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3465 pr_err("Mapping reserved region failed\n");
3469 iommu_prepare_isa();
3476 * global invalidate context cache
3477 * global invalidate iotlb
3478 * enable translation
3480 for_each_iommu(iommu, drhd) {
3481 if (drhd->ignored) {
3483 * we always have to disable PMRs or DMA may fail on this device
3487 iommu_disable_protect_mem_regions(iommu);
3491 iommu_flush_write_buffer(iommu);
3493 #ifdef CONFIG_INTEL_IOMMU_SVM
3494 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3495 ret = intel_svm_enable_prq(iommu);
3500 ret = dmar_set_interrupt(iommu);
3504 if (!translation_pre_enabled(iommu))
3505 iommu_enable_translation(iommu);
3507 iommu_disable_protect_mem_regions(iommu);
3513 for_each_active_iommu(iommu, drhd) {
3514 disable_dmar_iommu(iommu);
3515 free_dmar_iommu(iommu);
3524 /* This takes a number of _MM_ pages, not VTD pages */
3525 static unsigned long intel_alloc_iova(struct device *dev,
3526 struct dmar_domain *domain,
3527 unsigned long nrpages, uint64_t dma_mask)
3529 unsigned long iova_pfn;
3531 /* Restrict dma_mask to the width that the iommu can handle */
3532 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3533 /* Ensure we reserve the whole size-aligned region */
3534 nrpages = __roundup_pow_of_two(nrpages);
3536 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3538 * First try to allocate an io virtual address in
3539 * DMA_BIT_MASK(32) and if that fails then try allocating
3542 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3543 IOVA_PFN(DMA_BIT_MASK(32)), false);
3547 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3548 IOVA_PFN(dma_mask), true);
3549 if (unlikely(!iova_pfn)) {
3550 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
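/*
 * Example (editorial): a 3-page request is rounded up by
 * __roundup_pow_of_two() to 4 pages, which keeps the allocated IOVA
 * region size-aligned as required above; a device with a mask wider
 * than 32 bits first tries below 4GiB and only retries with the full
 * mask if that range is exhausted.
 */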
3557 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3559 struct dmar_domain *domain, *tmp;
3560 struct dmar_rmrr_unit *rmrr;
3561 struct device *i_dev;
3564 domain = find_domain(dev);
3568 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3572 /* We have a new domain - set up possible RMRRs for the device */
3574 for_each_rmrr_units(rmrr) {
3575 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3580 ret = domain_prepare_identity_map(dev, domain,
3584 dev_err(dev, "Mapping reserved region failed\n");
3589 tmp = set_domain_for_dev(dev, domain);
3590 if (!tmp || domain != tmp) {
3591 domain_exit(domain);
3598 dev_err(dev, "Allocating domain failed\n");
3604 /* Check if the device needs to go through the non-identity map and unmap process. */
3605 static int iommu_no_mapping(struct device *dev)
3609 if (iommu_dummy(dev))
3612 if (!iommu_identity_mapping)
3615 found = identity_mapping(dev);
3617 if (iommu_should_identity_map(dev, 0))
3621 * The device does 32-bit DMA: remove it from si_domain and fall
3622 * back to a non-identity mapping.
3624 dmar_remove_one_dev_info(si_domain, dev);
3625 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3630 * If a 64-bit DMA device was detached from a VM, the device
3631 * is put back into si_domain for identity mapping.
3633 if (iommu_should_identity_map(dev, 0)) {
3635 ret = domain_add_dev_info(si_domain, dev);
3637 dev_info(dev, "64bit DMA uses identity mapping\n");
3646 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3647 size_t size, int dir, u64 dma_mask)
3649 struct dmar_domain *domain;
3650 phys_addr_t start_paddr;
3651 unsigned long iova_pfn;
3654 struct intel_iommu *iommu;
3655 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3657 BUG_ON(dir == DMA_NONE);
3659 if (iommu_no_mapping(dev))
3662 domain = get_valid_domain_for_dev(dev);
3664 return DMA_MAPPING_ERROR;
3666 iommu = domain_get_iommu(domain);
3667 size = aligned_nrpages(paddr, size);
3669 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3674 * Check if DMAR supports zero-length reads on write-only
3677 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3678 !cap_zlr(iommu->cap))
3679 prot |= DMA_PTE_READ;
3680 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3681 prot |= DMA_PTE_WRITE;
3683 * The range paddr..(paddr + size) might span a partial page, so map the whole
3684 * page. Note: if two parts of one page are mapped separately, we
3685 * might have two guest_addr mappings to the same host paddr, but this
3686 * is not a big problem
3688 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3689 mm_to_dma_pfn(paddr_pfn), size, prot);
3693 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3694 start_paddr += paddr & ~PAGE_MASK;
3699 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3700 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3701 size, (unsigned long long)paddr, dir);
3702 return DMA_MAPPING_ERROR;
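/*
 * Worked example (editorial): mapping paddr 0x12345678 with size 0x100
 * stays within one 4KiB page, so aligned_nrpages() yields 1; the whole
 * page at 0x12345000 gets mapped, and the returned handle preserves
 * the byte offset because start_paddr += paddr & ~PAGE_MASK.
 */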
3705 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3706 unsigned long offset, size_t size,
3707 enum dma_data_direction dir,
3708 unsigned long attrs)
3710 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3711 dir, *dev->dma_mask);
3714 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3715 size_t size, enum dma_data_direction dir,
3716 unsigned long attrs)
3718 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3721 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3723 struct dmar_domain *domain;
3724 unsigned long start_pfn, last_pfn;
3725 unsigned long nrpages;
3726 unsigned long iova_pfn;
3727 struct intel_iommu *iommu;
3728 struct page *freelist;
3730 if (iommu_no_mapping(dev))
3733 domain = find_domain(dev);
3736 iommu = domain_get_iommu(domain);
3738 iova_pfn = IOVA_PFN(dev_addr);
3740 nrpages = aligned_nrpages(dev_addr, size);
3741 start_pfn = mm_to_dma_pfn(iova_pfn);
3742 last_pfn = start_pfn + nrpages - 1;
3744 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3746 freelist = domain_unmap(domain, start_pfn, last_pfn);
3748 if (intel_iommu_strict) {
3749 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3750 nrpages, !freelist, 0);
3752 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3753 dma_free_pagelist(freelist);
3755 queue_iova(&domain->iovad, iova_pfn, nrpages,
3756 (unsigned long)freelist);
3758 * queue up the release of the unmap to save roughly 1/6th of the
3759 * CPU time used up by the iotlb flush operation...
3764 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3765 size_t size, enum dma_data_direction dir,
3766 unsigned long attrs)
3768 intel_unmap(dev, dev_addr, size);
3771 static void *intel_alloc_coherent(struct device *dev, size_t size,
3772 dma_addr_t *dma_handle, gfp_t flags,
3773 unsigned long attrs)
3775 struct page *page = NULL;
3778 size = PAGE_ALIGN(size);
3779 order = get_order(size);
3781 if (!iommu_no_mapping(dev))
3782 flags &= ~(GFP_DMA | GFP_DMA32);
3783 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3784 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3790 if (gfpflags_allow_blocking(flags)) {
3791 unsigned int count = size >> PAGE_SHIFT;
3793 page = dma_alloc_from_contiguous(dev, count, order,
3794 flags & __GFP_NOWARN);
3795 if (page && iommu_no_mapping(dev) &&
3796 page_to_phys(page) + size > dev->coherent_dma_mask) {
3797 dma_release_from_contiguous(dev, page, count);
3803 page = alloc_pages(flags, order);
3806 memset(page_address(page), 0, size);
3808 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3810 dev->coherent_dma_mask);
3811 if (*dma_handle != DMA_MAPPING_ERROR)
3812 return page_address(page);
3813 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3814 __free_pages(page, order);
3819 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3820 dma_addr_t dma_handle, unsigned long attrs)
3823 struct page *page = virt_to_page(vaddr);
3825 size = PAGE_ALIGN(size);
3826 order = get_order(size);
3828 intel_unmap(dev, dma_handle, size);
3829 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3830 __free_pages(page, order);
3833 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3834 int nelems, enum dma_data_direction dir,
3835 unsigned long attrs)
3837 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3838 unsigned long nrpages = 0;
3839 struct scatterlist *sg;
3842 for_each_sg(sglist, sg, nelems, i) {
3843 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3846 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3849 static int intel_nontranslate_map_sg(struct device *hddev,
3850 struct scatterlist *sglist, int nelems, int dir)
3853 struct scatterlist *sg;
3855 for_each_sg(sglist, sg, nelems, i) {
3856 BUG_ON(!sg_page(sg));
3857 sg->dma_address = sg_phys(sg);
3858 sg->dma_length = sg->length;
3863 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3864 enum dma_data_direction dir, unsigned long attrs)
3867 struct dmar_domain *domain;
3870 unsigned long iova_pfn;
3872 struct scatterlist *sg;
3873 unsigned long start_vpfn;
3874 struct intel_iommu *iommu;
3876 BUG_ON(dir == DMA_NONE);
3877 if (iommu_no_mapping(dev))
3878 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3880 domain = get_valid_domain_for_dev(dev);
3884 iommu = domain_get_iommu(domain);
3886 for_each_sg(sglist, sg, nelems, i)
3887 size += aligned_nrpages(sg->offset, sg->length);
3889 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3892 sglist->dma_length = 0;
3897 * Check if DMAR supports zero-length reads on write-only
3900 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3901 !cap_zlr(iommu->cap))
3902 prot |= DMA_PTE_READ;
3903 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3904 prot |= DMA_PTE_WRITE;
3906 start_vpfn = mm_to_dma_pfn(iova_pfn);
3908 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3909 if (unlikely(ret)) {
3910 dma_pte_free_pagetable(domain, start_vpfn,
3911 start_vpfn + size - 1,
3912 agaw_to_level(domain->agaw) + 1);
3913 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3920 static const struct dma_map_ops intel_dma_ops = {
3921 .alloc = intel_alloc_coherent,
3922 .free = intel_free_coherent,
3923 .map_sg = intel_map_sg,
3924 .unmap_sg = intel_unmap_sg,
3925 .map_page = intel_map_page,
3926 .unmap_page = intel_unmap_page,
3927 .map_resource = intel_map_resource,
3928 .unmap_resource = intel_unmap_page,
3929 .dma_supported = dma_direct_supported,
3932 static inline int iommu_domain_cache_init(void)
3936 iommu_domain_cache = kmem_cache_create("iommu_domain",
3937 sizeof(struct dmar_domain),
3942 if (!iommu_domain_cache) {
3943 pr_err("Couldn't create iommu_domain cache\n");
3950 static inline int iommu_devinfo_cache_init(void)
3954 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3955 sizeof(struct device_domain_info),
3959 if (!iommu_devinfo_cache) {
3960 pr_err("Couldn't create devinfo cache\n");
3967 static int __init iommu_init_mempool(void)
3970 ret = iova_cache_get();
3974 ret = iommu_domain_cache_init();
3978 ret = iommu_devinfo_cache_init();
3982 kmem_cache_destroy(iommu_domain_cache);
3989 static void __init iommu_exit_mempool(void)
3991 kmem_cache_destroy(iommu_devinfo_cache);
3992 kmem_cache_destroy(iommu_domain_cache);
3996 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3998 struct dmar_drhd_unit *drhd;
4002 /* We know that this device on this chipset has its own IOMMU.
4003 * If we find it under a different IOMMU, then the BIOS is lying
4004 * to us. Hope that the IOMMU for this device is actually
4005 * disabled, and it needs no translation...
4007 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4009 /* "can't" happen */
4010 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4013 vtbar &= 0xffff0000;
4015 /* we know that this iommu should be at offset 0xa000 from vtbar */
4016 drhd = dmar_find_matched_drhd_unit(pdev);
4017 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4018 TAINT_FIRMWARE_WORKAROUND,
4019 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4020 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4022 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4024 static void __init init_no_remapping_devices(void)
4026 struct dmar_drhd_unit *drhd;
4030 for_each_drhd_unit(drhd) {
4031 if (!drhd->include_all) {
4032 for_each_active_dev_scope(drhd->devices,
4033 drhd->devices_cnt, i, dev)
4035 /* ignore DMAR unit if no devices exist */
4036 if (i == drhd->devices_cnt)
4041 for_each_active_drhd_unit(drhd) {
4042 if (drhd->include_all)
4045 for_each_active_dev_scope(drhd->devices,
4046 drhd->devices_cnt, i, dev)
4047 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4049 if (i < drhd->devices_cnt)
4052 /* This IOMMU has *only* gfx devices. Either bypass it or
4053 set the gfx_mapped flag, as appropriate */
4055 intel_iommu_gfx_mapped = 1;
4058 for_each_active_dev_scope(drhd->devices,
4059 drhd->devices_cnt, i, dev)
4060 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4065 #ifdef CONFIG_SUSPEND
4066 static int init_iommu_hw(void)
4068 struct dmar_drhd_unit *drhd;
4069 struct intel_iommu *iommu = NULL;
4071 for_each_active_iommu(iommu, drhd)
4073 dmar_reenable_qi(iommu);
4075 for_each_iommu(iommu, drhd) {
4076 if (drhd->ignored) {
4078 * we always have to disable PMRs or DMA may fail on this device
4082 iommu_disable_protect_mem_regions(iommu);
4086 iommu_flush_write_buffer(iommu);
4088 iommu_set_root_entry(iommu);
4090 iommu->flush.flush_context(iommu, 0, 0, 0,
4091 DMA_CCMD_GLOBAL_INVL);
4092 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4093 iommu_enable_translation(iommu);
4094 iommu_disable_protect_mem_regions(iommu);
4100 static void iommu_flush_all(void)
4102 struct dmar_drhd_unit *drhd;
4103 struct intel_iommu *iommu;
4105 for_each_active_iommu(iommu, drhd) {
4106 iommu->flush.flush_context(iommu, 0, 0, 0,
4107 DMA_CCMD_GLOBAL_INVL);
4108 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4109 DMA_TLB_GLOBAL_FLUSH);
4113 static int iommu_suspend(void)
4115 struct dmar_drhd_unit *drhd;
4116 struct intel_iommu *iommu = NULL;
4119 for_each_active_iommu(iommu, drhd) {
4120 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4122 if (!iommu->iommu_state)
4128 for_each_active_iommu(iommu, drhd) {
4129 iommu_disable_translation(iommu);
4131 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4133 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4134 readl(iommu->reg + DMAR_FECTL_REG);
4135 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4136 readl(iommu->reg + DMAR_FEDATA_REG);
4137 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4138 readl(iommu->reg + DMAR_FEADDR_REG);
4139 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4140 readl(iommu->reg + DMAR_FEUADDR_REG);
4142 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4147 for_each_active_iommu(iommu, drhd)
4148 kfree(iommu->iommu_state);
4153 static void iommu_resume(void)
4155 struct dmar_drhd_unit *drhd;
4156 struct intel_iommu *iommu = NULL;
4159 if (init_iommu_hw()) {
4161 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4163 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4167 for_each_active_iommu(iommu, drhd) {
4169 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4171 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4172 iommu->reg + DMAR_FECTL_REG);
4173 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4174 iommu->reg + DMAR_FEDATA_REG);
4175 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4176 iommu->reg + DMAR_FEADDR_REG);
4177 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4178 iommu->reg + DMAR_FEUADDR_REG);
4180 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4183 for_each_active_iommu(iommu, drhd)
4184 kfree(iommu->iommu_state);
4187 static struct syscore_ops iommu_syscore_ops = {
4188 .resume = iommu_resume,
4189 .suspend = iommu_suspend,
4192 static void __init init_iommu_pm_ops(void)
4194 register_syscore_ops(&iommu_syscore_ops);
4198 static inline void init_iommu_pm_ops(void) {}
4199 #endif /* CONFIG_SUSPEND */
4202 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4204 struct acpi_dmar_reserved_memory *rmrr;
4205 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4206 struct dmar_rmrr_unit *rmrru;
4209 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4213 rmrru->hdr = header;
4214 rmrr = (struct acpi_dmar_reserved_memory *)header;
4215 rmrru->base_address = rmrr->base_address;
4216 rmrru->end_address = rmrr->end_address;
4218 length = rmrr->end_address - rmrr->base_address + 1;
4219 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4224 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4225 ((void *)rmrr) + rmrr->header.length,
4226 &rmrru->devices_cnt);
4227 if (rmrru->devices_cnt && rmrru->devices == NULL)
4230 list_add(&rmrru->list, &dmar_rmrr_units);
4241 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4243 struct dmar_atsr_unit *atsru;
4244 struct acpi_dmar_atsr *tmp;
4246 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4247 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4248 if (atsr->segment != tmp->segment)
4250 if (atsr->header.length != tmp->header.length)
4252 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4259 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4261 struct acpi_dmar_atsr *atsr;
4262 struct dmar_atsr_unit *atsru;
4264 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4267 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4268 atsru = dmar_find_atsr(atsr);
4272 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4277 * If memory is allocated from slab by ACPI _DSM method, we need to
4278 * copy the memory content because the memory buffer will be freed on return
4281 atsru->hdr = (void *)(atsru + 1);
4282 memcpy(atsru->hdr, hdr, hdr->length);
4283 atsru->include_all = atsr->flags & 0x1;
4284 if (!atsru->include_all) {
4285 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4286 (void *)atsr + atsr->header.length,
4287 &atsru->devices_cnt);
4288 if (atsru->devices_cnt && atsru->devices == NULL) {
4294 list_add_rcu(&atsru->list, &dmar_atsr_units);
4299 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4301 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4305 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4307 struct acpi_dmar_atsr *atsr;
4308 struct dmar_atsr_unit *atsru;
4310 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4311 atsru = dmar_find_atsr(atsr);
4313 list_del_rcu(&atsru->list);
4315 intel_iommu_free_atsr(atsru);
4321 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325 struct acpi_dmar_atsr *atsr;
4326 struct dmar_atsr_unit *atsru;
4328 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4329 atsru = dmar_find_atsr(atsr);
4333 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4334 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4342 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4345 struct intel_iommu *iommu = dmaru->iommu;
4347 if (g_iommus[iommu->seq_id])
4350 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4351 pr_warn("%s: Doesn't support hardware pass through.\n",
4355 if (!ecap_sc_support(iommu->ecap) &&
4356 domain_update_iommu_snooping(iommu)) {
4357 pr_warn("%s: Doesn't support snooping.\n",
4361 sp = domain_update_iommu_superpage(iommu) - 1;
4362 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4363 pr_warn("%s: Doesn't support large page.\n",
4369 * Disable translation if already enabled prior to OS handover.
4371 if (iommu->gcmd & DMA_GCMD_TE)
4372 iommu_disable_translation(iommu);
4374 g_iommus[iommu->seq_id] = iommu;
4375 ret = iommu_init_domains(iommu);
4377 ret = iommu_alloc_root_entry(iommu);
4381 #ifdef CONFIG_INTEL_IOMMU_SVM
4382 if (pasid_supported(iommu))
4383 intel_svm_init(iommu);
4386 if (dmaru->ignored) {
4388 * we always have to disable PMRs or DMA may fail on this device
4391 iommu_disable_protect_mem_regions(iommu);
4395 intel_iommu_init_qi(iommu);
4396 iommu_flush_write_buffer(iommu);
4398 #ifdef CONFIG_INTEL_IOMMU_SVM
4399 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4400 ret = intel_svm_enable_prq(iommu);
4405 ret = dmar_set_interrupt(iommu);
4409 iommu_set_root_entry(iommu);
4410 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4411 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4412 iommu_enable_translation(iommu);
4414 iommu_disable_protect_mem_regions(iommu);
4418 disable_dmar_iommu(iommu);
4420 free_dmar_iommu(iommu);
4424 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4427 struct intel_iommu *iommu = dmaru->iommu;
4429 if (!intel_iommu_enabled)
4435 ret = intel_iommu_add(dmaru);
4437 disable_dmar_iommu(iommu);
4438 free_dmar_iommu(iommu);
4444 static void intel_iommu_free_dmars(void)
4446 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4447 struct dmar_atsr_unit *atsru, *atsr_n;
4449 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4450 list_del(&rmrru->list);
4451 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4456 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4457 list_del(&atsru->list);
4458 intel_iommu_free_atsr(atsru);
4462 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4465 struct pci_bus *bus;
4466 struct pci_dev *bridge = NULL;
4468 struct acpi_dmar_atsr *atsr;
4469 struct dmar_atsr_unit *atsru;
4471 dev = pci_physfn(dev);
4472 for (bus = dev->bus; bus; bus = bus->parent) {
4474 /* If it's an integrated device, allow ATS */
4477 /* Connected via non-PCIe: no ATS */
4478 if (!pci_is_pcie(bridge) ||
4479 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4481 /* If we found the root port, look it up in the ATSR */
4482 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4487 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4488 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4489 if (atsr->segment != pci_domain_nr(dev->bus))
4492 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4493 if (tmp == &bridge->dev)
4496 if (atsru->include_all)
4506 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4509 struct dmar_rmrr_unit *rmrru;
4510 struct dmar_atsr_unit *atsru;
4511 struct acpi_dmar_atsr *atsr;
4512 struct acpi_dmar_reserved_memory *rmrr;
4514 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4517 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4518 rmrr = container_of(rmrru->hdr,
4519 struct acpi_dmar_reserved_memory, header);
4520 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4521 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4522 ((void *)rmrr) + rmrr->header.length,
4523 rmrr->segment, rmrru->devices,
4524 rmrru->devices_cnt);
4527 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4528 dmar_remove_dev_scope(info, rmrr->segment,
4529 rmrru->devices, rmrru->devices_cnt);
4533 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4534 if (atsru->include_all)
4537 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4538 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4539 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4540 (void *)atsr + atsr->header.length,
4541 atsr->segment, atsru->devices,
4542 atsru->devices_cnt);
4547 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4548 if (dmar_remove_dev_scope(info, atsr->segment,
4549 atsru->devices, atsru->devices_cnt))
4558 * Here we only respond to a device being unbound from its driver.
4560 * A newly added device is not attached to its DMAR domain here yet; that will
4561 * happen when the device is first mapped to an iova.
4563 static int device_notifier(struct notifier_block *nb,
4564 unsigned long action, void *data)
4566 struct device *dev = data;
4567 struct dmar_domain *domain;
4569 if (iommu_dummy(dev))
4572 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4575 domain = find_domain(dev);
4579 dmar_remove_one_dev_info(domain, dev);
4580 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4581 domain_exit(domain);
4586 static struct notifier_block device_nb = {
4587 .notifier_call = device_notifier,
4590 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4591 unsigned long val, void *v)
4593 struct memory_notify *mhp = v;
4594 unsigned long long start, end;
4595 unsigned long start_vpfn, last_vpfn;
4598 case MEM_GOING_ONLINE:
4599 start = mhp->start_pfn << PAGE_SHIFT;
4600 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4601 if (iommu_domain_identity_map(si_domain, start, end)) {
4602 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4609 case MEM_CANCEL_ONLINE:
4610 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4611 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4612 while (start_vpfn <= last_vpfn) {
4614 struct dmar_drhd_unit *drhd;
4615 struct intel_iommu *iommu;
4616 struct page *freelist;
4618 iova = find_iova(&si_domain->iovad, start_vpfn);
4620 pr_debug("Failed get IOVA for PFN %lx\n",
4625 iova = split_and_remove_iova(&si_domain->iovad, iova,
4626 start_vpfn, last_vpfn);
4628 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4629 start_vpfn, last_vpfn);
4633 freelist = domain_unmap(si_domain, iova->pfn_lo,
4637 for_each_active_iommu(iommu, drhd)
4638 iommu_flush_iotlb_psi(iommu, si_domain,
4639 iova->pfn_lo, iova_size(iova),
4642 dma_free_pagelist(freelist);
4644 start_vpfn = iova->pfn_hi + 1;
4645 free_iova_mem(iova);
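/*
 * Example (editorial): onlining a 128MiB section with start_pfn
 * 0x80000 and nr_pages 0x8000 identity-maps [0x80000000, 0x87ffffff]
 * above; the offline path walks the same range iova by iova, unmapping
 * and flushing each piece before handing it back to the allocator.
 */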
4653 static struct notifier_block intel_iommu_memory_nb = {
4654 .notifier_call = intel_iommu_memory_notifier,
4658 static void free_all_cpu_cached_iovas(unsigned int cpu)
4662 for (i = 0; i < g_num_of_iommus; i++) {
4663 struct intel_iommu *iommu = g_iommus[i];
4664 struct dmar_domain *domain;
4670 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4671 domain = get_iommu_domain(iommu, (u16)did);
4675 free_cpu_cached_iovas(cpu, &domain->iovad);
4680 static int intel_iommu_cpu_dead(unsigned int cpu)
4682 free_all_cpu_cached_iovas(cpu);
4686 static void intel_disable_iommus(void)
4688 struct intel_iommu *iommu = NULL;
4689 struct dmar_drhd_unit *drhd;
4691 for_each_iommu(iommu, drhd)
4692 iommu_disable_translation(iommu);
4695 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4697 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4699 return container_of(iommu_dev, struct intel_iommu, iommu);
4702 static ssize_t intel_iommu_show_version(struct device *dev,
4703 struct device_attribute *attr,
4706 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4707 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4708 return sprintf(buf, "%d:%d\n",
4709 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4711 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4713 static ssize_t intel_iommu_show_address(struct device *dev,
4714 struct device_attribute *attr,
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%llx\n", iommu->reg_phys);
4720 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4722 static ssize_t intel_iommu_show_cap(struct device *dev,
4723 struct device_attribute *attr,
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%llx\n", iommu->cap);
4729 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4731 static ssize_t intel_iommu_show_ecap(struct device *dev,
4732 struct device_attribute *attr,
4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 return sprintf(buf, "%llx\n", iommu->ecap);
4738 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4740 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4741 struct device_attribute *attr,
4744 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4745 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4747 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4749 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4750 struct device_attribute *attr,
4753 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4754 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4755 cap_ndoms(iommu->cap)));
4757 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
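/*
 * Usage note (editorial; exact paths depend on the sysfs registration
 * done at init time): once iommu_device_sysfs_add() has run, these
 * attributes are expected to appear under something like
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used} for userspace tools to read.
 */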
4759 static struct attribute *intel_iommu_attrs[] = {
4760 &dev_attr_version.attr,
4761 &dev_attr_address.attr,
4763 &dev_attr_ecap.attr,
4764 &dev_attr_domains_supported.attr,
4765 &dev_attr_domains_used.attr,
4769 static struct attribute_group intel_iommu_group = {
4770 .name = "intel-iommu",
4771 .attrs = intel_iommu_attrs,
4774 const struct attribute_group *intel_iommu_groups[] = {
4779 static int __init platform_optin_force_iommu(void)
4781 struct pci_dev *pdev = NULL;
4782 bool has_untrusted_dev = false;
4784 if (!dmar_platform_optin() || no_platform_optin)
4787 for_each_pci_dev(pdev) {
4788 if (pdev->untrusted) {
4789 has_untrusted_dev = true;
4794 if (!has_untrusted_dev)
4797 if (no_iommu || dmar_disabled)
4798 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4801 * If Intel-IOMMU is disabled by default, we will apply the identity
4802 * map to all devices except those marked as untrusted.
4805 iommu_identity_mapping |= IDENTMAP_ALL;
4808 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4816 int __init intel_iommu_init(void)
4819 struct dmar_drhd_unit *drhd;
4820 struct intel_iommu *iommu;
4823 * Intel IOMMU is required for a TXT/tboot launch or platform
4824 * opt in, so enforce that.
4826 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4828 if (iommu_init_mempool()) {
4830 panic("tboot: Failed to initialize iommu memory\n");
4834 down_write(&dmar_global_lock);
4835 if (dmar_table_init()) {
4837 panic("tboot: Failed to initialize DMAR table\n");
4841 if (dmar_dev_scope_init() < 0) {
4843 panic("tboot: Failed to initialize DMAR device scope\n");
4847 up_write(&dmar_global_lock);
4850 * The bus notifier takes the dmar_global_lock, so lockdep will
4851 * complain later when we register it under the lock.
4853 dmar_register_bus_notifier();
4855 down_write(&dmar_global_lock);
4857 if (no_iommu || dmar_disabled) {
4859 * We exit the function here to ensure the IOMMU's remapping and
4860 * mempool aren't set up, which means that the IOMMU's PMRs
4861 * won't be disabled via the call to init_dmars(). So disable
4862 * them explicitly here. The PMRs were set up by tboot prior to
4863 * calling SENTER, but the kernel is expected to reset/tear
4866 if (intel_iommu_tboot_noforce) {
4867 for_each_iommu(iommu, drhd)
4868 iommu_disable_protect_mem_regions(iommu);
4872 * Make sure the IOMMUs are switched off, even when we
4873 * boot into a kexec kernel and the previous kernel left them enabled
4876 intel_disable_iommus();
4880 if (list_empty(&dmar_rmrr_units))
4881 pr_info("No RMRR found\n");
4883 if (list_empty(&dmar_atsr_units))
4884 pr_info("No ATSR found\n");
4886 if (dmar_init_reserved_ranges()) {
4888 panic("tboot: Failed to reserve iommu ranges\n");
4889 goto out_free_reserved_range;
4892 init_no_remapping_devices();
4897 panic("tboot: Failed to initialize DMARs\n");
4898 pr_err("Initialization failed\n");
4899 goto out_free_reserved_range;
4901 up_write(&dmar_global_lock);
4902 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4904 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4907 dma_ops = &intel_dma_ops;
4909 init_iommu_pm_ops();
4911 for_each_active_iommu(iommu, drhd) {
4912 iommu_device_sysfs_add(&iommu->iommu, NULL,
4915 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4916 iommu_device_register(&iommu->iommu);
4919 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4920 bus_register_notifier(&pci_bus_type, &device_nb);
4921 if (si_domain && !hw_pass_through)
4922 register_memory_notifier(&intel_iommu_memory_nb);
4923 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4924 intel_iommu_cpu_dead);
4925 intel_iommu_enabled = 1;
4926 intel_iommu_debugfs_init();
4930 out_free_reserved_range:
4931 put_iova_domain(&reserved_iova_list);
4933 intel_iommu_free_dmars();
4934 up_write(&dmar_global_lock);
4935 iommu_exit_mempool();
4939 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4941 struct intel_iommu *iommu = opaque;
4943 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}
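/* Unlink and free a device_domain_info; caller holds device_domain_lock. */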
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
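/*
 * Minimal setup for a domain created through the IOMMU API: reserve the
 * special IOVA ranges, derive the AGAW from the requested guest width
 * and allocate the top-level page directory.
 */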
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
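/*
 * Only IOMMU_DOMAIN_UNMANAGED domains (e.g. for VFIO) are handed out
 * here; DMA API mappings go through the driver's own internal domains.
 */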
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
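/*
 * Attach may have to undo an existing (e.g. identity) context mapping
 * for the device, and to drop page-table levels so the domain's AGAW
 * fits what this IOMMU supports.
 */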
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
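/*
 * IOMMU_READ/WRITE/CACHE prot flags are translated into VT-d PTE bits
 * below; DMA_PTE_SNP is only set when the domain supports snooping.
 */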
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
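/* Walk the page table for one IOVA; an unmapped address yields 0. */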
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
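/*
 * IOMMU core hook: link the new device to its IOMMU in sysfs and place
 * it in the appropriate IOMMU group.
 */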
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
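/*
 * Report the device's RMRR ranges plus the IOAPIC window as reserved
 * regions, so users of the IOMMU API leave them alone.
 */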
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
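/*
 * Enable the PASID bit in the device's context entry and record the
 * device-IOTLB parameters that the SVM code will need.
 */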
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
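/*
 * Device quirks: a handful of Intel chipsets have broken or misreported
 * DMAR support for their integrated graphics and need special handling.
 */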
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
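/*
 * Ironlake/Calpella: if the BIOS left no stolen memory for a shadow
 * GTT (GGC_MEMORY_VT_ENABLED clear in the GGC register), graphics
 * translation cannot work, so turn the graphics IOMMU off.
 */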
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}