2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
57 #define ROOT_SIZE VTD_PAGE_SIZE
58 #define CONTEXT_SIZE VTD_PAGE_SIZE
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
65 #define IOAPIC_RANGE_START (0xfee00000)
66 #define IOAPIC_RANGE_END (0xfeefffff)
67 #define IOVA_START_ADDR (0x1000)
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
74 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
80 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
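/*
 * Worked example (illustrative, not used by the driver): with a guest
 * address width of 48 bits and VTD_PAGE_SHIFT == 12,
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 * and on a 32-bit kernel DOMAIN_MAX_PFN() additionally clamps the result
 * to ULONG_MAX so that PFNs always fit in an unsigned long.
 */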
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN (1)
86 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
88 /* page table handling */
89 #define LEVEL_STRIDE (9)
90 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
93 * This bitmap is used to advertise the page sizes our hardware supports
94 * to the IOMMU core, which will then use this information to split
95 * physically contiguous memory regions it is mapping into the page sizes we support.
98 * Traditionally the IOMMU core just handed us the mappings directly,
99 * after making sure the size was a power-of-two multiple of 4KiB and
100 * that the mapping had natural alignment.
102 * To retain this behavior, we currently advertise that we support
103 * all page sizes that are a power-of-two multiple of 4KiB.
105 * If at some point we'd like to utilize the IOMMU core's new behavior,
106 * we could change this to advertise the real page sizes we support.
108 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
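/*
 * Reading the bitmap above (a sketch of the convention, assuming the core
 * treats bit N as "2^N byte pages supported"): ~0xFFFUL has bits 12 and
 * up set, i.e. 4KiB, 8KiB, 16KiB, ... are all advertised, which matches
 * the "every power-of-two multiple of 4KiB" behaviour described above.
 */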
110 static inline int agaw_to_level(int agaw)
115 static inline int agaw_to_width(int agaw)
117 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 static inline int width_to_agaw(int width)
122 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 static inline unsigned int level_to_offset_bits(int level)
127 return (level - 1) * LEVEL_STRIDE;
130 static inline int pfn_level_offset(unsigned long pfn, int level)
132 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 static inline unsigned long level_mask(int level)
137 return -1UL << level_to_offset_bits(level);
140 static inline unsigned long level_size(int level)
142 return 1UL << level_to_offset_bits(level);
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
147 return (pfn + level_size(level) - 1) & level_mask(level);
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
152 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
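/*
 * How the helpers above fit together (worked example, illustrative only):
 * a 48-bit address width gives width_to_agaw(48) == DIV_ROUND_UP(18, 9)
 * == 2 and agaw_to_width(2) == 48; level_to_offset_bits(2) == 9, so
 * pfn_level_offset(pfn, 2) selects bits 9..17 of the page frame number,
 * and lvl_to_nr_pages(2) == 512 small pages per level-2 entry.
 */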
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156 are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
159 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
164 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
168 return mm_to_dma_pfn(page_to_pfn(pg));
170 static inline unsigned long virt_to_dma_pfn(void *p)
172 return page_to_dma_pfn(virt_to_page(p));
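/*
 * Illustration: with 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12)
 * the two PFN spaces coincide and these conversions are identities; if MM
 * pages were larger, e.g. 64KiB, one MM PFN would cover 16 VT-d PFNs,
 * which is why mm_to_dma_pfn() shifts left and dma_to_mm_pfn() shifts
 * right by the difference of the two page shifts.
 */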
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
182 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
183 * (used when the kernel is launched with TXT)
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
194 static phys_addr_t root_entry_lctp(struct root_entry *re)
199 return re->lo & VTD_PAGE_MASK;
203 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
206 static phys_addr_t root_entry_uctp(struct root_entry *re)
211 return re->hi & VTD_PAGE_MASK;
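/*
 * Layout note (a sketch based on the accessors above and on
 * free_context_table() below): in legacy mode a root entry's lo field
 * points at one context table covering all 256 devfns of the bus, while
 * in scalable mode the lower pointer covers devfn 0x00-0x7f and the
 * upper pointer covers devfn 0x80-0xff.
 */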
214 static inline void context_clear_pasid_enable(struct context_entry *context)
216 context->lo &= ~(1ULL << 11);
219 static inline bool context_pasid_enabled(struct context_entry *context)
221 return !!(context->lo & (1ULL << 11));
224 static inline void context_set_copied(struct context_entry *context)
226 context->hi |= (1ull << 3);
229 static inline bool context_copied(struct context_entry *context)
231 return !!(context->hi & (1ULL << 3));
234 static inline bool __context_present(struct context_entry *context)
236 return (context->lo & 1);
239 bool context_present(struct context_entry *context)
241 return context_pasid_enabled(context) ?
242 __context_present(context) :
243 __context_present(context) && !context_copied(context);
246 static inline void context_set_present(struct context_entry *context)
251 static inline void context_set_fault_enable(struct context_entry *context)
253 context->lo &= (((u64)-1) << 2) | 1;
256 static inline void context_set_translation_type(struct context_entry *context,
259 context->lo &= (((u64)-1) << 4) | 3;
260 context->lo |= (value & 3) << 2;
263 static inline void context_set_address_root(struct context_entry *context,
266 context->lo &= ~VTD_PAGE_MASK;
267 context->lo |= value & VTD_PAGE_MASK;
270 static inline void context_set_address_width(struct context_entry *context,
273 context->hi |= value & 7;
276 static inline void context_set_domain_id(struct context_entry *context,
279 context->hi |= (value & ((1 << 16) - 1)) << 8;
282 static inline int context_domain_id(struct context_entry *c)
284 return((c->hi >> 8) & 0xffff);
287 static inline void context_clear_entry(struct context_entry *context)
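/*
 * Bit layout sketch, derived purely from the accessors above (see the
 * VT-d specification for the authoritative definition):
 *	lo[0]		present
 *	lo[1]		fault processing disable
 *	lo[3:2]		translation type
 *	lo[11]		PASID enable (as tested by context_pasid_enabled())
 *	lo[63:12]	page-table root address set by context_set_address_root()
 *	hi[2:0]		address width (AGAW)
 *	hi[3]		"copied" software flag used for the kdump handover
 *	hi[23:8]	domain identifier
 */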
294 * This domain is a static identity mapping domain.
295 * 1. This domain creates a static 1:1 mapping to all usable memory.
296 * 2. It maps to each iommu if successful.
297 * 3. Each iommu maps to this domain if successful.
299 static struct dmar_domain *si_domain;
300 static int hw_pass_through = 1;
303 * A domain represents a virtual machine; more than one device
304 * across iommus may be owned by one domain, e.g. a kvm guest.
306 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
308 /* si_domain contains multiple devices */
309 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
311 #define for_each_domain_iommu(idx, domain) \
312 for (idx = 0; idx < g_num_of_iommus; idx++) \
313 if (domain->iommu_refcnt[idx])
315 struct dmar_rmrr_unit {
316 struct list_head list; /* list of rmrr units */
317 struct acpi_dmar_header *hdr; /* ACPI header */
318 u64 base_address; /* reserved base address*/
319 u64 end_address; /* reserved end address */
320 struct dmar_dev_scope *devices; /* target devices */
321 int devices_cnt; /* target device count */
322 struct iommu_resv_region *resv; /* reserved region handle */
325 struct dmar_atsr_unit {
326 struct list_head list; /* list of ATSR units */
327 struct acpi_dmar_header *hdr; /* ACPI header */
328 struct dmar_dev_scope *devices; /* target devices */
329 int devices_cnt; /* target device count */
330 u8 include_all:1; /* include all ports */
333 static LIST_HEAD(dmar_atsr_units);
334 static LIST_HEAD(dmar_rmrr_units);
336 #define for_each_rmrr_units(rmrr) \
337 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
339 /* number of IOMMUs, used to size per-IOMMU arrays such as g_iommus */
340 static int g_num_of_iommus;
342 static void domain_exit(struct dmar_domain *domain);
343 static void domain_remove_dev_info(struct dmar_domain *domain);
344 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
346 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
347 static void domain_context_clear(struct intel_iommu *iommu,
349 static int domain_detach_iommu(struct dmar_domain *domain,
350 struct intel_iommu *iommu);
352 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
353 int dmar_disabled = 0;
355 int dmar_disabled = 1;
356 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
358 int intel_iommu_enabled = 0;
359 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
361 static int dmar_map_gfx = 1;
362 static int dmar_forcedac;
363 static int intel_iommu_strict;
364 static int intel_iommu_superpage = 1;
365 static int intel_iommu_sm = 1;
366 static int iommu_identity_mapping;
368 #define IDENTMAP_ALL 1
369 #define IDENTMAP_GFX 2
370 #define IDENTMAP_AZALIA 4
372 #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
373 #define pasid_supported(iommu) (sm_supported(iommu) && \
374 ecap_pasid((iommu)->ecap))
376 int intel_iommu_gfx_mapped;
377 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
379 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
384 * Iterate over elements in device_domain_list and call the specified
385 * callback @fn against each element.
387 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
388 void *data), void *data)
392 struct device_domain_info *info;
394 spin_lock_irqsave(&device_domain_lock, flags);
395 list_for_each_entry(info, &device_domain_list, global) {
396 ret = fn(info, data);
398 spin_unlock_irqrestore(&device_domain_lock, flags);
402 spin_unlock_irqrestore(&device_domain_lock, flags);
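/*
 * Usage sketch for the iterator above (hypothetical helper, not part of
 * the driver): a non-zero return from the callback stops the walk early.
 */
static int __maybe_unused count_device_domain_entries(struct device_domain_info *info,
						      void *data)
{
	(*(int *)data)++;	/* count every tracked device */
	return 0;		/* keep iterating */
}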
407 const struct iommu_ops intel_iommu_ops;
409 static bool translation_pre_enabled(struct intel_iommu *iommu)
411 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
414 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
416 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
419 static void init_translation_status(struct intel_iommu *iommu)
423 gsts = readl(iommu->reg + DMAR_GSTS_REG);
424 if (gsts & DMA_GSTS_TES)
425 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
428 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
429 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
431 return container_of(dom, struct dmar_domain, domain);
434 static int __init intel_iommu_setup(char *str)
439 if (!strncmp(str, "on", 2)) {
441 pr_info("IOMMU enabled\n");
442 } else if (!strncmp(str, "off", 3)) {
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_off", 6)) {
458 pr_info("Intel-IOMMU: disable scalable mode support\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
462 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 intel_iommu_tboot_noforce = 1;
466 str += strcspn(str, ",");
472 __setup("intel_iommu=", intel_iommu_setup);
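/*
 * Example kernel command line (illustrative): options can be combined
 * with commas, e.g.
 *	intel_iommu=on,strict,sp_off
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * super-page support; the strcspn() step above advances the parser to
 * the next comma-separated token.
 */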
474 static struct kmem_cache *iommu_domain_cache;
475 static struct kmem_cache *iommu_devinfo_cache;
477 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
479 struct dmar_domain **domains;
482 domains = iommu->domains[idx];
486 return domains[did & 0xff];
489 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
490 struct dmar_domain *domain)
492 struct dmar_domain **domains;
495 if (!iommu->domains[idx]) {
496 size_t size = 256 * sizeof(struct dmar_domain *);
497 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 domains = iommu->domains[idx];
501 if (WARN_ON(!domains))
504 domains[did & 0xff] = domain;
507 void *alloc_pgtable_page(int node)
512 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
514 vaddr = page_address(page);
518 void free_pgtable_page(void *vaddr)
520 free_page((unsigned long)vaddr);
523 static inline void *alloc_domain_mem(void)
525 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 static void free_domain_mem(void *vaddr)
530 kmem_cache_free(iommu_domain_cache, vaddr);
533 static inline void * alloc_devinfo_mem(void)
535 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 static inline void free_devinfo_mem(void *vaddr)
540 kmem_cache_free(iommu_devinfo_cache, vaddr);
543 static inline int domain_type_is_vm(struct dmar_domain *domain)
545 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
548 static inline int domain_type_is_si(struct dmar_domain *domain)
550 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
555 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
556 DOMAIN_FLAG_STATIC_IDENTITY);
559 static inline int domain_pfn_supported(struct dmar_domain *domain,
562 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
564 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
567 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
572 sagaw = cap_sagaw(iommu->cap);
573 for (agaw = width_to_agaw(max_gaw);
575 if (test_bit(agaw, &sagaw))
583 * Calculate max SAGAW for each iommu.
585 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
587 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
591 * Calculate the agaw for each iommu.
592 * "SAGAW" may differ across iommus, so use a default agaw and fall back
593 * to a smaller supported agaw for iommus that don't support the default.
595 int iommu_calculate_agaw(struct intel_iommu *iommu)
597 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
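/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57
 * the search starts at width_to_agaw(57) == 3 and walks downwards through
 * the bits of cap_sagaw() until one the hardware reports is found, so an
 * IOMMU that only supports 48-bit (four-level) tables ends up with
 * agaw == 2.
 */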
600 /* This function only returns a single iommu for a domain */
601 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
605 /* si_domain and vm domain should not get here. */
606 BUG_ON(domain_type_is_vm_or_si(domain));
607 for_each_domain_iommu(iommu_id, domain)
610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
613 return g_iommus[iommu_id];
616 static void domain_update_iommu_coherency(struct dmar_domain *domain)
618 struct dmar_drhd_unit *drhd;
619 struct intel_iommu *iommu;
623 domain->iommu_coherency = 1;
625 for_each_domain_iommu(i, domain) {
627 if (!ecap_coherent(g_iommus[i]->ecap)) {
628 domain->iommu_coherency = 0;
635 /* No hardware attached; use lowest common denominator */
637 for_each_active_iommu(iommu, drhd) {
638 if (!ecap_coherent(iommu->ecap)) {
639 domain->iommu_coherency = 0;
646 static int domain_update_iommu_snooping(struct intel_iommu *skip)
648 struct dmar_drhd_unit *drhd;
649 struct intel_iommu *iommu;
653 for_each_active_iommu(iommu, drhd) {
655 if (!ecap_sc_support(iommu->ecap)) {
666 static int domain_update_iommu_superpage(struct intel_iommu *skip)
668 struct dmar_drhd_unit *drhd;
669 struct intel_iommu *iommu;
672 if (!intel_iommu_superpage) {
676 /* set iommu_superpage to the smallest common denominator */
678 for_each_active_iommu(iommu, drhd) {
680 mask &= cap_super_page_val(iommu->cap);
690 /* Some capabilities may be different across iommus */
691 static void domain_update_iommu_cap(struct dmar_domain *domain)
693 domain_update_iommu_coherency(domain);
694 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
695 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
698 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
701 struct root_entry *root = &iommu->root_entry[bus];
702 struct context_entry *context;
706 if (sm_supported(iommu)) {
714 context = phys_to_virt(*entry & VTD_PAGE_MASK);
716 unsigned long phy_addr;
720 context = alloc_pgtable_page(iommu->node);
724 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
725 phy_addr = virt_to_phys((void *)context);
726 *entry = phy_addr | 1;
727 __iommu_flush_cache(iommu, entry, sizeof(*entry));
729 return &context[devfn];
732 static int iommu_dummy(struct device *dev)
734 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
737 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
739 struct dmar_drhd_unit *drhd = NULL;
740 struct intel_iommu *iommu;
742 struct pci_dev *ptmp, *pdev = NULL;
746 if (iommu_dummy(dev))
749 if (dev_is_pci(dev)) {
750 struct pci_dev *pf_pdev;
752 pdev = to_pci_dev(dev);
755 /* VMD child devices currently cannot be handled individually */
756 if (is_vmd(pdev->bus))
760 /* VFs aren't listed in scope tables; we need to look up
761 * the PF instead to find the IOMMU. */
762 pf_pdev = pci_physfn(pdev);
764 segment = pci_domain_nr(pdev->bus);
765 } else if (has_acpi_companion(dev))
766 dev = &ACPI_COMPANION(dev)->dev;
769 for_each_active_iommu(iommu, drhd) {
770 if (pdev && segment != drhd->segment)
773 for_each_active_dev_scope(drhd->devices,
774 drhd->devices_cnt, i, tmp) {
776 /* For a VF use its original BDF# not that of the PF
777 * which we used for the IOMMU lookup. Strictly speaking
778 * we could do this for all PCI devices; we only need to
779 * get the BDF# from the scope table for ACPI matches. */
780 if (pdev && pdev->is_virtfn)
783 *bus = drhd->devices[i].bus;
784 *devfn = drhd->devices[i].devfn;
788 if (!pdev || !dev_is_pci(tmp))
791 ptmp = to_pci_dev(tmp);
792 if (ptmp->subordinate &&
793 ptmp->subordinate->number <= pdev->bus->number &&
794 ptmp->subordinate->busn_res.end >= pdev->bus->number)
798 if (pdev && drhd->include_all) {
800 *bus = pdev->bus->number;
801 *devfn = pdev->devfn;
812 static void domain_flush_cache(struct dmar_domain *domain,
813 void *addr, int size)
815 if (!domain->iommu_coherency)
816 clflush_cache_range(addr, size);
819 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
821 struct context_entry *context;
825 spin_lock_irqsave(&iommu->lock, flags);
826 context = iommu_context_addr(iommu, bus, devfn, 0);
828 ret = context_present(context);
829 spin_unlock_irqrestore(&iommu->lock, flags);
833 static void free_context_table(struct intel_iommu *iommu)
837 struct context_entry *context;
839 spin_lock_irqsave(&iommu->lock, flags);
840 if (!iommu->root_entry) {
843 for (i = 0; i < ROOT_ENTRY_NR; i++) {
844 context = iommu_context_addr(iommu, i, 0, 0);
846 free_pgtable_page(context);
848 if (!sm_supported(iommu))
851 context = iommu_context_addr(iommu, i, 0x80, 0);
853 free_pgtable_page(context);
856 free_pgtable_page(iommu->root_entry);
857 iommu->root_entry = NULL;
859 spin_unlock_irqrestore(&iommu->lock, flags);
862 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
863 unsigned long pfn, int *target_level)
865 struct dma_pte *parent, *pte = NULL;
866 int level = agaw_to_level(domain->agaw);
869 BUG_ON(!domain->pgd);
871 if (!domain_pfn_supported(domain, pfn))
872 /* Address beyond IOMMU's addressing capabilities. */
875 parent = domain->pgd;
880 offset = pfn_level_offset(pfn, level);
881 pte = &parent[offset];
882 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
884 if (level == *target_level)
887 if (!dma_pte_present(pte)) {
890 tmp_page = alloc_pgtable_page(domain->nid);
895 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
896 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
897 if (cmpxchg64(&pte->val, 0ULL, pteval))
898 /* Someone else set it while we were thinking; use theirs. */
899 free_pgtable_page(tmp_page);
901 domain_flush_cache(domain, pte, sizeof(*pte));
906 parent = phys_to_virt(dma_pte_addr(pte));
911 *target_level = level;
917 /* return the address's pte at a specific level */
918 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
920 int level, int *large_page)
922 struct dma_pte *parent, *pte = NULL;
923 int total = agaw_to_level(domain->agaw);
926 parent = domain->pgd;
927 while (level <= total) {
928 offset = pfn_level_offset(pfn, total);
929 pte = &parent[offset];
933 if (!dma_pte_present(pte)) {
938 if (dma_pte_superpage(pte)) {
943 parent = phys_to_virt(dma_pte_addr(pte));
949 /* clear last-level ptes; a TLB flush should follow */
950 static void dma_pte_clear_range(struct dmar_domain *domain,
951 unsigned long start_pfn,
952 unsigned long last_pfn)
954 unsigned int large_page = 1;
955 struct dma_pte *first_pte, *pte;
957 BUG_ON(!domain_pfn_supported(domain, start_pfn));
958 BUG_ON(!domain_pfn_supported(domain, last_pfn));
959 BUG_ON(start_pfn > last_pfn);
961 /* we don't need lock here; nobody else touches the iova range */
964 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
966 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
971 start_pfn += lvl_to_nr_pages(large_page);
973 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
975 domain_flush_cache(domain, first_pte,
976 (void *)pte - (void *)first_pte);
978 } while (start_pfn && start_pfn <= last_pfn);
981 static void dma_pte_free_level(struct dmar_domain *domain, int level,
982 int retain_level, struct dma_pte *pte,
983 unsigned long pfn, unsigned long start_pfn,
984 unsigned long last_pfn)
986 pfn = max(start_pfn, pfn);
987 pte = &pte[pfn_level_offset(pfn, level)];
990 unsigned long level_pfn;
991 struct dma_pte *level_pte;
993 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
996 level_pfn = pfn & level_mask(level);
997 level_pte = phys_to_virt(dma_pte_addr(pte));
1000 dma_pte_free_level(domain, level - 1, retain_level,
1001 level_pte, level_pfn, start_pfn,
1006 * Free the page table if we're below the level we want to
1007 * retain and the range covers the entire table.
1009 if (level < retain_level && !(start_pfn > level_pfn ||
1010 last_pfn < level_pfn + level_size(level) - 1)) {
1012 domain_flush_cache(domain, pte, sizeof(*pte));
1013 free_pgtable_page(level_pte);
1016 pfn += level_size(level);
1017 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1021 * clear last level (leaf) ptes and free page table pages below the
1022 * level we wish to keep intact.
1024 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1025 unsigned long start_pfn,
1026 unsigned long last_pfn,
1029 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1030 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1031 BUG_ON(start_pfn > last_pfn);
1033 dma_pte_clear_range(domain, start_pfn, last_pfn);
1035 /* We don't need lock here; nobody else touches the iova range */
1036 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1037 domain->pgd, 0, start_pfn, last_pfn);
1040 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1041 free_pgtable_page(domain->pgd);
1046 /* When a page at a given level is being unlinked from its parent, we don't
1047 need to *modify* it at all. All we need to do is make a list of all the
1048 pages which can be freed just as soon as we've flushed the IOTLB and we
1049 know the hardware page-walk will no longer touch them.
1050 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1052 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1053 int level, struct dma_pte *pte,
1054 struct page *freelist)
1058 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1059 pg->freelist = freelist;
1065 pte = page_address(pg);
1067 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1068 freelist = dma_pte_list_pagetables(domain, level - 1,
1071 } while (!first_pte_in_page(pte));
1076 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1077 struct dma_pte *pte, unsigned long pfn,
1078 unsigned long start_pfn,
1079 unsigned long last_pfn,
1080 struct page *freelist)
1082 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1084 pfn = max(start_pfn, pfn);
1085 pte = &pte[pfn_level_offset(pfn, level)];
1088 unsigned long level_pfn;
1090 if (!dma_pte_present(pte))
1093 level_pfn = pfn & level_mask(level);
1095 /* If range covers entire pagetable, free it */
1096 if (start_pfn <= level_pfn &&
1097 last_pfn >= level_pfn + level_size(level) - 1) {
1098 /* These subordinate page tables are going away entirely. Don't
1099 bother to clear them; we're just going to *free* them. */
1100 if (level > 1 && !dma_pte_superpage(pte))
1101 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1107 } else if (level > 1) {
1108 /* Recurse down into a level that isn't *entirely* obsolete */
1109 freelist = dma_pte_clear_level(domain, level - 1,
1110 phys_to_virt(dma_pte_addr(pte)),
1111 level_pfn, start_pfn, last_pfn,
1115 pfn += level_size(level);
1116 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1119 domain_flush_cache(domain, first_pte,
1120 (void *)++last_pte - (void *)first_pte);
1125 /* We can't just free the pages because the IOMMU may still be walking
1126 the page tables, and may have cached the intermediate levels. The
1127 pages can only be freed after the IOTLB flush has been done. */
1128 static struct page *domain_unmap(struct dmar_domain *domain,
1129 unsigned long start_pfn,
1130 unsigned long last_pfn)
1132 struct page *freelist = NULL;
1134 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1135 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1136 BUG_ON(start_pfn > last_pfn);
1138 /* we don't need lock here; nobody else touches the iova range */
1139 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1140 domain->pgd, 0, start_pfn, last_pfn, NULL);
1143 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1144 struct page *pgd_page = virt_to_page(domain->pgd);
1145 pgd_page->freelist = freelist;
1146 freelist = pgd_page;
1154 static void dma_free_pagelist(struct page *freelist)
1158 while ((pg = freelist)) {
1159 freelist = pg->freelist;
1160 free_pgtable_page(page_address(pg));
1164 static void iova_entry_free(unsigned long data)
1166 struct page *freelist = (struct page *)data;
1168 dma_free_pagelist(freelist);
1171 /* iommu handling */
1172 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1174 struct root_entry *root;
1175 unsigned long flags;
1177 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1179 pr_err("Allocating root entry for %s failed\n",
1184 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1186 spin_lock_irqsave(&iommu->lock, flags);
1187 iommu->root_entry = root;
1188 spin_unlock_irqrestore(&iommu->lock, flags);
1193 static void iommu_set_root_entry(struct intel_iommu *iommu)
1199 addr = virt_to_phys(iommu->root_entry);
1201 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1202 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1204 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1206 /* Make sure hardware complete it */
1207 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1208 readl, (sts & DMA_GSTS_RTPS), sts);
1210 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1213 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1218 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1224 /* Make sure hardware complete it */
1225 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1226 readl, (!(val & DMA_GSTS_WBFS)), val);
1228 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231 /* return value determines if we need a write buffer flush */
1232 static void __iommu_flush_context(struct intel_iommu *iommu,
1233 u16 did, u16 source_id, u8 function_mask,
1240 case DMA_CCMD_GLOBAL_INVL:
1241 val = DMA_CCMD_GLOBAL_INVL;
1243 case DMA_CCMD_DOMAIN_INVL:
1244 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1246 case DMA_CCMD_DEVICE_INVL:
1247 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1248 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1253 val |= DMA_CCMD_ICC;
1255 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1256 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1258 /* Make sure hardware complete it */
1259 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1260 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 /* return value determines if we need a write buffer flush */
1266 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1267 u64 addr, unsigned int size_order, u64 type)
1269 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1270 u64 val = 0, val_iva = 0;
1274 case DMA_TLB_GLOBAL_FLUSH:
1275 /* a global flush doesn't need to set IVA_REG */
1276 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1278 case DMA_TLB_DSI_FLUSH:
1279 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1281 case DMA_TLB_PSI_FLUSH:
1282 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1283 /* IH bit is passed in as part of address */
1284 val_iva = size_order | addr;
1289 /* Note: set drain read/write */
1292 * This is probably only done to be extra safe; it looks like we can
1293 * ignore it without any impact.
1295 if (cap_read_drain(iommu->cap))
1296 val |= DMA_TLB_READ_DRAIN;
1298 if (cap_write_drain(iommu->cap))
1299 val |= DMA_TLB_WRITE_DRAIN;
1301 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302 /* Note: Only uses first TLB reg currently */
1304 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1305 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1307 /* Make sure hardware complete it */
1308 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1309 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1313 /* check IOTLB invalidation granularity */
1314 if (DMA_TLB_IAIG(val) == 0)
1315 pr_err("Flush IOTLB failed\n");
1316 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1317 pr_debug("TLB flush request %Lx, actual %Lx\n",
1318 (unsigned long long)DMA_TLB_IIRG(type),
1319 (unsigned long long)DMA_TLB_IAIG(val));
1322 static struct device_domain_info *
1323 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1326 struct device_domain_info *info;
1328 assert_spin_locked(&device_domain_lock);
1333 list_for_each_entry(info, &domain->devices, link)
1334 if (info->iommu == iommu && info->bus == bus &&
1335 info->devfn == devfn) {
1336 if (info->ats_supported && info->dev)
1344 static void domain_update_iotlb(struct dmar_domain *domain)
1346 struct device_domain_info *info;
1347 bool has_iotlb_device = false;
1349 assert_spin_locked(&device_domain_lock);
1351 list_for_each_entry(info, &domain->devices, link) {
1352 struct pci_dev *pdev;
1354 if (!info->dev || !dev_is_pci(info->dev))
1357 pdev = to_pci_dev(info->dev);
1358 if (pdev->ats_enabled) {
1359 has_iotlb_device = true;
1364 domain->has_iotlb_device = has_iotlb_device;
1367 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1369 struct pci_dev *pdev;
1371 assert_spin_locked(&device_domain_lock);
1373 if (!info || !dev_is_pci(info->dev))
1376 pdev = to_pci_dev(info->dev);
1377 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1378 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1379 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1380 * reserved, which should be set to 0.
1382 if (!ecap_dit(info->iommu->ecap))
1385 struct pci_dev *pf_pdev;
1387 /* pdev will be returned if device is not a vf */
1388 pf_pdev = pci_physfn(pdev);
1389 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1392 #ifdef CONFIG_INTEL_IOMMU_SVM
1393 /* The PCIe spec, in its wisdom, declares that the behaviour of
1394 the device if you enable PASID support after ATS support is
1395 undefined. So always enable PASID support on devices which
1396 have it, even if we can't yet know if we're ever going to use it. */
1398 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1399 info->pasid_enabled = 1;
1401 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1402 info->pri_enabled = 1;
1404 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1405 info->ats_enabled = 1;
1406 domain_update_iotlb(info->domain);
1407 info->ats_qdep = pci_ats_queue_depth(pdev);
1411 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1413 struct pci_dev *pdev;
1415 assert_spin_locked(&device_domain_lock);
1417 if (!dev_is_pci(info->dev))
1420 pdev = to_pci_dev(info->dev);
1422 if (info->ats_enabled) {
1423 pci_disable_ats(pdev);
1424 info->ats_enabled = 0;
1425 domain_update_iotlb(info->domain);
1427 #ifdef CONFIG_INTEL_IOMMU_SVM
1428 if (info->pri_enabled) {
1429 pci_disable_pri(pdev);
1430 info->pri_enabled = 0;
1432 if (info->pasid_enabled) {
1433 pci_disable_pasid(pdev);
1434 info->pasid_enabled = 0;
1439 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1440 u64 addr, unsigned mask)
1443 unsigned long flags;
1444 struct device_domain_info *info;
1446 if (!domain->has_iotlb_device)
1449 spin_lock_irqsave(&device_domain_lock, flags);
1450 list_for_each_entry(info, &domain->devices, link) {
1451 if (!info->ats_enabled)
1454 sid = info->bus << 8 | info->devfn;
1455 qdep = info->ats_qdep;
1456 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1459 spin_unlock_irqrestore(&device_domain_lock, flags);
1462 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1463 struct dmar_domain *domain,
1464 unsigned long pfn, unsigned int pages,
1467 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1468 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1469 u16 did = domain->iommu_did[iommu->seq_id];
1476 * Fall back to a domain-selective flush if there is no PSI support or
1478 * the size is too big. PSI requires the page count to be a power of two
1479 * and the base address to be naturally aligned to the size.
1481 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1482 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1485 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1489 * In caching mode, changes of pages from non-present to present require
1490 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1492 if (!cap_caching_mode(iommu->cap) || !map)
1493 iommu_flush_dev_iotlb(domain, addr, mask);
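/*
 * Worked example for the mask computation above (illustrative): flushing
 * 5 pages gives mask == ilog2(__roundup_pow_of_two(5)) == 3, i.e. the PSI
 * covers an 8-page (32KiB) naturally aligned region, which is why a mask
 * larger than cap_max_amask_val() falls back to a domain-selective flush.
 */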
1496 /* Notification for newly created mappings */
1497 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1498 struct dmar_domain *domain,
1499 unsigned long pfn, unsigned int pages)
1501 /* It's a non-present to present mapping. Only flush if caching mode */
1502 if (cap_caching_mode(iommu->cap))
1503 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1505 iommu_flush_write_buffer(iommu);
1508 static void iommu_flush_iova(struct iova_domain *iovad)
1510 struct dmar_domain *domain;
1513 domain = container_of(iovad, struct dmar_domain, iovad);
1515 for_each_domain_iommu(idx, domain) {
1516 struct intel_iommu *iommu = g_iommus[idx];
1517 u16 did = domain->iommu_did[iommu->seq_id];
1519 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1521 if (!cap_caching_mode(iommu->cap))
1522 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1523 0, MAX_AGAW_PFN_WIDTH);
1527 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1530 unsigned long flags;
1532 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1533 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1534 pmen &= ~DMA_PMEN_EPM;
1535 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1537 /* wait for the protected region status bit to clear */
1538 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1539 readl, !(pmen & DMA_PMEN_PRS), pmen);
1541 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1544 static void iommu_enable_translation(struct intel_iommu *iommu)
1547 unsigned long flags;
1549 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1550 iommu->gcmd |= DMA_GCMD_TE;
1551 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1553 /* Make sure hardware complete it */
1554 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1555 readl, (sts & DMA_GSTS_TES), sts);
1557 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1560 static void iommu_disable_translation(struct intel_iommu *iommu)
1565 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1566 iommu->gcmd &= ~DMA_GCMD_TE;
1567 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1569 /* Make sure hardware complete it */
1570 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571 readl, (!(sts & DMA_GSTS_TES)), sts);
1573 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1577 static int iommu_init_domains(struct intel_iommu *iommu)
1579 u32 ndomains, nlongs;
1582 ndomains = cap_ndoms(iommu->cap);
1583 pr_debug("%s: Number of Domains supported <%d>\n",
1584 iommu->name, ndomains);
1585 nlongs = BITS_TO_LONGS(ndomains);
1587 spin_lock_init(&iommu->lock);
1589 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1590 if (!iommu->domain_ids) {
1591 pr_err("%s: Allocating domain id array failed\n",
1596 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1597 iommu->domains = kzalloc(size, GFP_KERNEL);
1599 if (iommu->domains) {
1600 size = 256 * sizeof(struct dmar_domain *);
1601 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1604 if (!iommu->domains || !iommu->domains[0]) {
1605 pr_err("%s: Allocating domain array failed\n",
1607 kfree(iommu->domain_ids);
1608 kfree(iommu->domains);
1609 iommu->domain_ids = NULL;
1610 iommu->domains = NULL;
1617 * If Caching mode is set, then invalid translations are tagged
1618 * with domain-id 0, hence we need to pre-allocate it. We also
1619 * use domain-id 0 as a marker for non-allocated domain-id, so
1620 * make sure it is not used for a real domain.
1622 set_bit(0, iommu->domain_ids);
1625 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1626 * entry for first-level or pass-through translation modes should
1627 * be programmed with a domain id different from those used for
1628 * second-level or nested translation. We reserve a domain id for
1631 if (sm_supported(iommu))
1632 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1637 static void disable_dmar_iommu(struct intel_iommu *iommu)
1639 struct device_domain_info *info, *tmp;
1640 unsigned long flags;
1642 if (!iommu->domains || !iommu->domain_ids)
1646 spin_lock_irqsave(&device_domain_lock, flags);
1647 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1648 struct dmar_domain *domain;
1650 if (info->iommu != iommu)
1653 if (!info->dev || !info->domain)
1656 domain = info->domain;
1658 __dmar_remove_one_dev_info(info);
1660 if (!domain_type_is_vm_or_si(domain)) {
1662 * The domain_exit() function can't be called under
1663 * device_domain_lock, as it takes this lock itself.
1664 * So release the lock here and re-run the loop
1667 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 domain_exit(domain);
1672 spin_unlock_irqrestore(&device_domain_lock, flags);
1674 if (iommu->gcmd & DMA_GCMD_TE)
1675 iommu_disable_translation(iommu);
1678 static void free_dmar_iommu(struct intel_iommu *iommu)
1680 if ((iommu->domains) && (iommu->domain_ids)) {
1681 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1684 for (i = 0; i < elems; i++)
1685 kfree(iommu->domains[i]);
1686 kfree(iommu->domains);
1687 kfree(iommu->domain_ids);
1688 iommu->domains = NULL;
1689 iommu->domain_ids = NULL;
1692 g_iommus[iommu->seq_id] = NULL;
1694 /* free context mapping */
1695 free_context_table(iommu);
1697 #ifdef CONFIG_INTEL_IOMMU_SVM
1698 if (pasid_supported(iommu)) {
1699 if (ecap_prs(iommu->ecap))
1700 intel_svm_finish_prq(iommu);
1701 intel_svm_exit(iommu);
1706 static struct dmar_domain *alloc_domain(int flags)
1708 struct dmar_domain *domain;
1710 domain = alloc_domain_mem();
1714 memset(domain, 0, sizeof(*domain));
1716 domain->flags = flags;
1717 domain->has_iotlb_device = false;
1718 INIT_LIST_HEAD(&domain->devices);
1723 /* Must be called with iommu->lock held */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725 struct intel_iommu *iommu)
1727 unsigned long ndomains;
1730 assert_spin_locked(&device_domain_lock);
1731 assert_spin_locked(&iommu->lock);
1733 domain->iommu_refcnt[iommu->seq_id] += 1;
1734 domain->iommu_count += 1;
1735 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736 ndomains = cap_ndoms(iommu->cap);
1737 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1739 if (num >= ndomains) {
1740 pr_err("%s: No free domain ids\n", iommu->name);
1741 domain->iommu_refcnt[iommu->seq_id] -= 1;
1742 domain->iommu_count -= 1;
1746 set_bit(num, iommu->domain_ids);
1747 set_iommu_domain(iommu, num, domain);
1749 domain->iommu_did[iommu->seq_id] = num;
1750 domain->nid = iommu->node;
1752 domain_update_iommu_cap(domain);
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759 struct intel_iommu *iommu)
1761 int num, count = INT_MAX;
1763 assert_spin_locked(&device_domain_lock);
1764 assert_spin_locked(&iommu->lock);
1766 domain->iommu_refcnt[iommu->seq_id] -= 1;
1767 count = --domain->iommu_count;
1768 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769 num = domain->iommu_did[iommu->seq_id];
1770 clear_bit(num, iommu->domain_ids);
1771 set_iommu_domain(iommu, num, NULL);
1773 domain_update_iommu_cap(domain);
1774 domain->iommu_did[iommu->seq_id] = 0;
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1783 static int dmar_init_reserved_ranges(void)
1785 struct pci_dev *pdev = NULL;
1789 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1791 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792 &reserved_rbtree_key);
1794 /* IOAPIC ranges shouldn't be accessed by DMA */
1795 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796 IOVA_PFN(IOAPIC_RANGE_END));
1798 pr_err("Reserve IOAPIC range failed\n");
1802 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803 for_each_pci_dev(pdev) {
1806 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807 r = &pdev->resource[i];
1808 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1810 iova = reserve_iova(&reserved_iova_list,
1814 pr_err("Reserve iova failed\n");
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1824 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1830 int r = (gaw - 12) % 9;
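/*
 * Example (illustrative, assuming the usual round-up-to-a-level-boundary
 * behaviour of this helper): a 50-bit guest width has (50 - 12) % 9 == 2,
 * so it is adjusted up to 57 bits, the next width at which (gaw - 12) is
 * a whole number of 9-bit page-table levels.
 */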
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1844 int adjust_width, agaw;
1845 unsigned long sagaw;
1848 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1850 err = init_iova_flush_queue(&domain->iovad,
1851 iommu_flush_iova, iova_entry_free);
1855 domain_reserve_special_ranges(domain);
1857 /* calculate AGAW */
1858 if (guest_width > cap_mgaw(iommu->cap))
1859 guest_width = cap_mgaw(iommu->cap);
1860 domain->gaw = guest_width;
1861 adjust_width = guestwidth_to_adjustwidth(guest_width);
1862 agaw = width_to_agaw(adjust_width);
1863 sagaw = cap_sagaw(iommu->cap);
1864 if (!test_bit(agaw, &sagaw)) {
1865 /* hardware doesn't support it, choose a bigger one */
1866 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867 agaw = find_next_bit(&sagaw, 5, agaw);
1871 domain->agaw = agaw;
1873 if (ecap_coherent(iommu->ecap))
1874 domain->iommu_coherency = 1;
1876 domain->iommu_coherency = 0;
1878 if (ecap_sc_support(iommu->ecap))
1879 domain->iommu_snooping = 1;
1881 domain->iommu_snooping = 0;
1883 if (intel_iommu_superpage)
1884 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1886 domain->iommu_superpage = 0;
1888 domain->nid = iommu->node;
1890 /* always allocate the top pgd */
1891 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1894 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1898 static void domain_exit(struct dmar_domain *domain)
1900 struct page *freelist = NULL;
1902 /* Domain 0 is reserved, so don't process it */
1906 /* Remove associated devices and clear attached or cached domains */
1908 domain_remove_dev_info(domain);
1912 put_iova_domain(&domain->iovad);
1914 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1916 dma_free_pagelist(freelist);
1918 free_domain_mem(domain);
1921 static int domain_context_mapping_one(struct dmar_domain *domain,
1922 struct intel_iommu *iommu,
1925 u16 did = domain->iommu_did[iommu->seq_id];
1926 int translation = CONTEXT_TT_MULTI_LEVEL;
1927 struct device_domain_info *info = NULL;
1928 struct context_entry *context;
1929 unsigned long flags;
1930 struct dma_pte *pgd;
1935 if (hw_pass_through && domain_type_is_si(domain))
1936 translation = CONTEXT_TT_PASS_THROUGH;
1938 pr_debug("Set context mapping for %02x:%02x.%d\n",
1939 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1941 BUG_ON(!domain->pgd);
1943 spin_lock_irqsave(&device_domain_lock, flags);
1944 spin_lock(&iommu->lock);
1947 context = iommu_context_addr(iommu, bus, devfn, 1);
1952 if (context_present(context))
1956 * For kdump cases, old valid entries may be cached due to the
1957 * in-flight DMA and copied pgtable, but there is no unmapping
1958 * behaviour for them, thus we need an explicit cache flush for
1959 * the newly-mapped device. For kdump, at this point, the device
1960 * is supposed to finish reset at its driver probe stage, so no
1961 * in-flight DMA will exist, and we don't need to worry anymore
1964 if (context_copied(context)) {
1965 u16 did_old = context_domain_id(context);
1967 if (did_old < cap_ndoms(iommu->cap)) {
1968 iommu->flush.flush_context(iommu, did_old,
1969 (((u16)bus) << 8) | devfn,
1970 DMA_CCMD_MASK_NOBIT,
1971 DMA_CCMD_DEVICE_INVL);
1972 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1979 context_clear_entry(context);
1980 context_set_domain_id(context, did);
1983 * Skip top levels of the page tables for an iommu whose agaw is
1984 * smaller than the default. Unnecessary for PT mode.
1986 if (translation != CONTEXT_TT_PASS_THROUGH) {
1987 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1989 pgd = phys_to_virt(dma_pte_addr(pgd));
1990 if (!dma_pte_present(pgd))
1994 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1995 if (info && info->ats_supported)
1996 translation = CONTEXT_TT_DEV_IOTLB;
1998 translation = CONTEXT_TT_MULTI_LEVEL;
2000 context_set_address_root(context, virt_to_phys(pgd));
2001 context_set_address_width(context, agaw);
2004 * In pass through mode, AW must be programmed to
2005 * indicate the largest AGAW value supported by
2006 * hardware. And ASR is ignored by hardware.
2008 context_set_address_width(context, iommu->msagaw);
2011 context_set_translation_type(context, translation);
2012 context_set_fault_enable(context);
2013 context_set_present(context);
2014 domain_flush_cache(domain, context, sizeof(*context));
2017 * It's a non-present to present mapping. If the hardware doesn't cache
2018 * non-present entries we only need to flush the write-buffer. If it
2019 * _does_ cache non-present entries, then it does so in the special
2020 * domain #0, which we have to flush:
2022 if (cap_caching_mode(iommu->cap)) {
2023 iommu->flush.flush_context(iommu, 0,
2024 (((u16)bus) << 8) | devfn,
2025 DMA_CCMD_MASK_NOBIT,
2026 DMA_CCMD_DEVICE_INVL);
2027 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2029 iommu_flush_write_buffer(iommu);
2031 iommu_enable_dev_iotlb(info);
2036 spin_unlock(&iommu->lock);
2037 spin_unlock_irqrestore(&device_domain_lock, flags);
2042 struct domain_context_mapping_data {
2043 struct dmar_domain *domain;
2044 struct intel_iommu *iommu;
2047 static int domain_context_mapping_cb(struct pci_dev *pdev,
2048 u16 alias, void *opaque)
2050 struct domain_context_mapping_data *data = opaque;
2052 return domain_context_mapping_one(data->domain, data->iommu,
2053 PCI_BUS_NUM(alias), alias & 0xff);
2057 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2059 struct intel_iommu *iommu;
2061 struct domain_context_mapping_data data;
2063 iommu = device_to_iommu(dev, &bus, &devfn);
2067 if (!dev_is_pci(dev))
2068 return domain_context_mapping_one(domain, iommu, bus, devfn);
2070 data.domain = domain;
2073 return pci_for_each_dma_alias(to_pci_dev(dev),
2074 &domain_context_mapping_cb, &data);
2077 static int domain_context_mapped_cb(struct pci_dev *pdev,
2078 u16 alias, void *opaque)
2080 struct intel_iommu *iommu = opaque;
2082 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2085 static int domain_context_mapped(struct device *dev)
2087 struct intel_iommu *iommu;
2090 iommu = device_to_iommu(dev, &bus, &devfn);
2094 if (!dev_is_pci(dev))
2095 return device_context_mapped(iommu, bus, devfn);
2097 return !pci_for_each_dma_alias(to_pci_dev(dev),
2098 domain_context_mapped_cb, iommu);
2101 /* Returns a number of VTD pages, but aligned to MM page size */
2102 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
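/*
 * Example (illustrative): aligned_nrpages(0x1ffc, 0x10) keeps only the
 * in-page offset (0xffc with 4KiB MM pages), rounds 0xffc + 0x10 up to
 * 0x2000 and converts to VT-d pages, i.e. a 16-byte buffer straddling a
 * page boundary still costs two IOVA pages.
 */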
2109 /* Return largest possible superpage level for a given mapping */
2110 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2118 support = domain->iommu_superpage;
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
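/*
 * Example (illustrative): mapping 512 contiguous 4KiB pages whose IOVA
 * and physical PFNs are both 2MiB aligned (low nine PFN bits clear) lets
 * the loop above take one VTD_STRIDE_SHIFT step, so a domain with
 * superpage support can use a single 2MiB PTE instead of 512 leaf PTEs.
 */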
2137 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2138 struct scatterlist *sg, unsigned long phys_pfn,
2139 unsigned long nr_pages, int prot)
2141 struct dma_pte *first_pte = NULL, *pte = NULL;
2142 phys_addr_t uninitialized_var(pteval);
2143 unsigned long sg_res = 0;
2144 unsigned int largepage_lvl = 0;
2145 unsigned long lvl_pages = 0;
2147 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2149 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2152 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2156 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2159 while (nr_pages > 0) {
2163 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2165 sg_res = aligned_nrpages(sg->offset, sg->length);
2166 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2167 sg->dma_length = sg->length;
2168 pteval = (sg_phys(sg) - pgoff) | prot;
2169 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2173 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2175 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2178 /* It is a large page */
2179 if (largepage_lvl > 1) {
2180 unsigned long nr_superpages, end_pfn;
2182 pteval |= DMA_PTE_LARGE_PAGE;
2183 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2185 nr_superpages = sg_res / lvl_pages;
2186 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2189 * Ensure that old small page tables are
2190 * removed to make room for superpage(s).
2191 * We're adding new large pages, so make sure
2192 * we don't remove their parent tables.
2194 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2197 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2201 /* We don't need lock here, nobody else
2202 * touches the iova range
2204 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2206 static int dumps = 5;
2207 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2208 iov_pfn, tmp, (unsigned long long)pteval);
2211 debug_dma_dump_mappings(NULL);
2216 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2218 BUG_ON(nr_pages < lvl_pages);
2219 BUG_ON(sg_res < lvl_pages);
2221 nr_pages -= lvl_pages;
2222 iov_pfn += lvl_pages;
2223 phys_pfn += lvl_pages;
2224 pteval += lvl_pages * VTD_PAGE_SIZE;
2225 sg_res -= lvl_pages;
2227 /* If the next PTE would be the first in a new page, then we
2228 need to flush the cache on the entries we've just written.
2229 And then we'll need to recalculate 'pte', so clear it and
2230 let it get set again in the if (!pte) block above.
2232 If we're done (!nr_pages) we need to flush the cache too.
2234 Also if we've been setting superpages, we may need to
2235 recalculate 'pte' and switch back to smaller pages for the
2236 end of the mapping, if the trailing size is not enough to
2237 use another superpage (i.e. sg_res < lvl_pages). */
2239 if (!nr_pages || first_pte_in_page(pte) ||
2240 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2241 domain_flush_cache(domain, first_pte,
2242 (void *)pte - (void *)first_pte);
2246 if (!sg_res && nr_pages)
2252 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2253 struct scatterlist *sg, unsigned long phys_pfn,
2254 unsigned long nr_pages, int prot)
2257 struct intel_iommu *iommu;
2259 /* Do the real mapping first */
2260 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2264 /* Notify about the new mapping */
2265 if (domain_type_is_vm(domain)) {
2266 /* VM typed domains can have more than one IOMMU */
2268 for_each_domain_iommu(iommu_id, domain) {
2269 iommu = g_iommus[iommu_id];
2270 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2273 /* General domains only have one IOMMU */
2274 iommu = domain_get_iommu(domain);
2275 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2281 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2282 struct scatterlist *sg, unsigned long nr_pages,
2285 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2288 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2289 unsigned long phys_pfn, unsigned long nr_pages,
2292 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2295 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2297 unsigned long flags;
2298 struct context_entry *context;
2304 spin_lock_irqsave(&iommu->lock, flags);
2305 context = iommu_context_addr(iommu, bus, devfn, 0);
2307 spin_unlock_irqrestore(&iommu->lock, flags);
2310 did_old = context_domain_id(context);
2311 context_clear_entry(context);
2312 __iommu_flush_cache(iommu, context, sizeof(*context));
2313 spin_unlock_irqrestore(&iommu->lock, flags);
2314 iommu->flush.flush_context(iommu,
2316 (((u16)bus) << 8) | devfn,
2317 DMA_CCMD_MASK_NOBIT,
2318 DMA_CCMD_DEVICE_INVL);
2319 iommu->flush.flush_iotlb(iommu,
2326 static inline void unlink_domain_info(struct device_domain_info *info)
2328 assert_spin_locked(&device_domain_lock);
2329 list_del(&info->link);
2330 list_del(&info->global);
2332 info->dev->archdata.iommu = NULL;
2335 static void domain_remove_dev_info(struct dmar_domain *domain)
2337 struct device_domain_info *info, *tmp;
2338 unsigned long flags;
2340 spin_lock_irqsave(&device_domain_lock, flags);
2341 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2342 __dmar_remove_one_dev_info(info);
2343 spin_unlock_irqrestore(&device_domain_lock, flags);
2348 * Note: we use struct device->archdata.iommu to store the info
2350 static struct dmar_domain *find_domain(struct device *dev)
2352 struct device_domain_info *info;
2354 /* No lock here, assumes no domain exit in normal case */
2355 info = dev->archdata.iommu;
2357 return info->domain;
2361 static inline struct device_domain_info *
2362 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2364 struct device_domain_info *info;
2366 list_for_each_entry(info, &device_domain_list, global)
2367 if (info->iommu->segment == segment && info->bus == bus &&
2368 info->devfn == devfn)
2374 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2377 struct dmar_domain *domain)
2379 struct dmar_domain *found = NULL;
2380 struct device_domain_info *info;
2381 unsigned long flags;
2384 info = alloc_devinfo_mem();
2389 info->devfn = devfn;
2390 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2391 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2394 info->domain = domain;
2395 info->iommu = iommu;
2396 info->pasid_table = NULL;
2398 if (dev && dev_is_pci(dev)) {
2399 struct pci_dev *pdev = to_pci_dev(info->dev);
2401 if (!pci_ats_disabled() &&
2402 ecap_dev_iotlb_support(iommu->ecap) &&
2403 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2404 dmar_find_matched_atsr_unit(pdev))
2405 info->ats_supported = 1;
2407 if (sm_supported(iommu)) {
2408 if (pasid_supported(iommu)) {
2409 int features = pci_pasid_features(pdev);
2411 info->pasid_supported = features | 1;
2414 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2415 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2416 info->pri_supported = 1;
2420 spin_lock_irqsave(&device_domain_lock, flags);
2422 found = find_domain(dev);
2425 struct device_domain_info *info2;
2426 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2428 found = info2->domain;
2434 spin_unlock_irqrestore(&device_domain_lock, flags);
2435 free_devinfo_mem(info);
2436 /* Caller must free the original domain */
2440 spin_lock(&iommu->lock);
2441 ret = domain_attach_iommu(domain, iommu);
2442 spin_unlock(&iommu->lock);
2445 spin_unlock_irqrestore(&device_domain_lock, flags);
2446 free_devinfo_mem(info);
2450 list_add(&info->link, &domain->devices);
2451 list_add(&info->global, &device_domain_list);
2453 dev->archdata.iommu = info;
2454 spin_unlock_irqrestore(&device_domain_lock, flags);
2456 /* PASID table is mandatory for a PCI device in scalable mode. */
2457 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2458 ret = intel_pasid_alloc_table(dev);
2460 pr_err("PASID table allocation for %s failed\n",
2462 dmar_remove_one_dev_info(domain, dev);
2467 if (dev && domain_context_mapping(domain, dev)) {
2468 pr_err("Domain context map for %s failed\n", dev_name(dev));
2469 dmar_remove_one_dev_info(domain, dev);
2476 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2478 *(u16 *)opaque = alias;
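/*
 * Illustrative usage (a sketch based on the callers below, not new driver
 * logic): get_last_alias() just records each alias it is handed and lets
 * pci_for_each_dma_alias() keep walking, so once the walk finishes the
 * opaque u16 holds the device's topmost (last-visited) DMA alias:
 *
 *	u16 dma_alias = 0;
 *
 *	pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
 *
 * PCI_BUS_NUM(dma_alias) and dma_alias & 0xff are then usable as the
 * bus/devfn of that alias, as in find_or_alloc_domain() below.
 */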
2482 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2484 struct device_domain_info *info = NULL;
2485 struct dmar_domain *domain = NULL;
2486 struct intel_iommu *iommu;
2488 unsigned long flags;
2491 iommu = device_to_iommu(dev, &bus, &devfn);
2495 if (dev_is_pci(dev)) {
2496 struct pci_dev *pdev = to_pci_dev(dev);
2498 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2500 spin_lock_irqsave(&device_domain_lock, flags);
2501 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2502 PCI_BUS_NUM(dma_alias),
2505 iommu = info->iommu;
2506 domain = info->domain;
2508 spin_unlock_irqrestore(&device_domain_lock, flags);
2510 /* DMA alias already has a domain, use it */
2515 /* Allocate and initialize new domain for the device */
2516 domain = alloc_domain(0);
2519 if (domain_init(domain, iommu, gaw)) {
2520 domain_exit(domain);
2529 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2530 struct dmar_domain *domain)
2532 struct intel_iommu *iommu;
2533 struct dmar_domain *tmp;
2534 u16 req_id, dma_alias;
2537 iommu = device_to_iommu(dev, &bus, &devfn);
2541 req_id = ((u16)bus << 8) | devfn;
2543 if (dev_is_pci(dev)) {
2544 struct pci_dev *pdev = to_pci_dev(dev);
2546 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2548 /* register PCI DMA alias device */
2549 if (req_id != dma_alias) {
2550 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2551 dma_alias & 0xff, NULL, domain);
2553 if (!tmp || tmp != domain)
2558 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2559 if (!tmp || tmp != domain)
2565 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2567 struct dmar_domain *domain, *tmp;
2569 domain = find_domain(dev);
2573 domain = find_or_alloc_domain(dev, gaw);
2577 tmp = set_domain_for_dev(dev, domain);
2578 if (!tmp || domain != tmp) {
2579 domain_exit(domain);
2588 static int iommu_domain_identity_map(struct dmar_domain *domain,
2589 unsigned long long start,
2590 unsigned long long end)
2592 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2593 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2595 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2596 dma_to_mm_pfn(last_vpfn))) {
2597 pr_err("Reserving iova failed\n");
2601 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2603 * RMRR range might overlap the physical memory range; clear it first.
2606 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2608 return __domain_mapping(domain, first_vpfn, NULL,
2609 first_vpfn, last_vpfn - first_vpfn + 1,
2610 DMA_PTE_READ|DMA_PTE_WRITE);
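/*
 * Worked example (hypothetical values, mirroring the ISA workaround further
 * below): identity-mapping the 0-16MiB window with the helper above covers
 * VT-d pfns 0x0-0xfff, since with 4KiB VT-d pages
 *
 *	first_vpfn = 0x000000 >> VTD_PAGE_SHIFT = 0x0
 *	last_vpfn  = 0xffffff >> VTD_PAGE_SHIFT = 0xfff
 *
 * so __domain_mapping() installs 0x1000 read/write PTEs whose physical pfns
 * equal their IOVA pfns:
 *
 *	iommu_domain_identity_map(si_domain, 0, 16*1024*1024 - 1);
 */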
2613 static int domain_prepare_identity_map(struct device *dev,
2614 struct dmar_domain *domain,
2615 unsigned long long start,
2616 unsigned long long end)
2618 /* For _hardware_ passthrough, don't bother. But for software
2619 passthrough, we do it anyway -- it may indicate a memory
2620 range which is reserved in E820, so which didn't get set
2621 up to start with in si_domain */
2622 if (domain == si_domain && hw_pass_through) {
2623 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2624 dev_name(dev), start, end);
2628 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2629 dev_name(dev), start, end);
2632 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2633 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2634 dmi_get_system_info(DMI_BIOS_VENDOR),
2635 dmi_get_system_info(DMI_BIOS_VERSION),
2636 dmi_get_system_info(DMI_PRODUCT_VERSION));
2640 if (end >> agaw_to_width(domain->agaw)) {
2641 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2642 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2643 agaw_to_width(domain->agaw),
2644 dmi_get_system_info(DMI_BIOS_VENDOR),
2645 dmi_get_system_info(DMI_BIOS_VERSION),
2646 dmi_get_system_info(DMI_PRODUCT_VERSION));
2650 return iommu_domain_identity_map(domain, start, end);
2653 static int iommu_prepare_identity_map(struct device *dev,
2654 unsigned long long start,
2655 unsigned long long end)
2657 struct dmar_domain *domain;
2660 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2664 ret = domain_prepare_identity_map(dev, domain, start, end);
2666 domain_exit(domain);
2671 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2674 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2676 return iommu_prepare_identity_map(dev, rmrr->base_address,
2680 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2681 static inline void iommu_prepare_isa(void)
2683 struct pci_dev *pdev;
2686 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2690 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2691 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2694 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2699 static inline void iommu_prepare_isa(void)
2703 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2705 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2707 static int __init si_domain_init(int hw)
2711 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2715 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2716 domain_exit(si_domain);
2720 pr_debug("Identity mapping domain allocated\n");
2725 for_each_online_node(nid) {
2726 unsigned long start_pfn, end_pfn;
2729 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2730 ret = iommu_domain_identity_map(si_domain,
2731 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2740 static int identity_mapping(struct device *dev)
2742 struct device_domain_info *info;
2744 if (likely(!iommu_identity_mapping))
2747 info = dev->archdata.iommu;
2748 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2749 return (info->domain == si_domain);
2754 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2756 struct dmar_domain *ndomain;
2757 struct intel_iommu *iommu;
2760 iommu = device_to_iommu(dev, &bus, &devfn);
2764 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2765 if (ndomain != domain)
2771 static bool device_has_rmrr(struct device *dev)
2773 struct dmar_rmrr_unit *rmrr;
2778 for_each_rmrr_units(rmrr) {
2780 * Return TRUE if this RMRR contains the device that is passed in.
2783 for_each_active_dev_scope(rmrr->devices,
2784 rmrr->devices_cnt, i, tmp)
2795 * There are a couple cases where we need to restrict the functionality of
2796 * devices associated with RMRRs. The first is when evaluating a device for
2797 * identity mapping because problems exist when devices are moved in and out
2798 * of domains and their respective RMRR information is lost. This means that
2799 * a device with associated RMRRs will never be in a "passthrough" domain.
2800 * The second is use of the device through the IOMMU API. This interface
2801 * expects to have full control of the IOVA space for the device. We cannot
2802 * satisfy both the requirement that RMRR access is maintained and have an
2803 * unencumbered IOVA space. We also have no ability to quiesce the device's
2804 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2805 * We therefore prevent devices associated with an RMRR from participating in
2806 * the IOMMU API, which eliminates them from device assignment.
2808 * In both cases we assume that PCI USB devices with RMRRs have them largely
2809 * for historical reasons and that the RMRR space is not actively used post
2810 * boot. This exclusion may change if vendors begin to abuse it.
2812 * The same exception is made for graphics devices, with the requirement that
2813 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2816 static bool device_is_rmrr_locked(struct device *dev)
2818 if (!device_has_rmrr(dev))
2821 if (dev_is_pci(dev)) {
2822 struct pci_dev *pdev = to_pci_dev(dev);
2824 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2831 static int iommu_should_identity_map(struct device *dev, int startup)
2834 if (dev_is_pci(dev)) {
2835 struct pci_dev *pdev = to_pci_dev(dev);
2837 if (device_is_rmrr_locked(dev))
2840 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2843 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2846 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2850 * We want to start off with all devices in the 1:1 domain, and
2851 * take them out later if we find they can't access all of memory.
2853 * However, we can't do this for PCI devices behind bridges,
2854 * because all PCI devices behind the same bridge will end up
2855 * with the same source-id on their transactions.
2857 * Practically speaking, we can't change things around for these
2858 * devices at run-time, because we can't be sure there'll be no
2859 * DMA transactions in flight for any of their siblings.
2861 * So PCI devices (unless they're on the root bus) as well as
2862 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2863 * the 1:1 domain, just in _case_ one of their siblings turns out
2864 * not to be able to map all of memory.
2866 if (!pci_is_pcie(pdev)) {
2867 if (!pci_is_root_bus(pdev->bus))
2869 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2871 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2874 if (device_has_rmrr(dev))
2879 * At boot time, we don't yet know if devices will be 64-bit capable.
2880 * Assume that they will — if they turn out not to be, then we can
2881 * take them out of the 1:1 domain later.
2885 * If the device's dma_mask is less than the system's memory
2886 * size then this is not a candidate for identity mapping.
2888 u64 dma_mask = *dev->dma_mask;
2890 if (dev->coherent_dma_mask &&
2891 dev->coherent_dma_mask < dma_mask)
2892 dma_mask = dev->coherent_dma_mask;
2894 return dma_mask >= dma_get_required_mask(dev);
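/*
 * Worked example (hypothetical device, not a statement about any specific
 * hardware): on a host whose RAM tops out above 4GiB,
 * dma_get_required_mask() reports at least DMA_BIT_MASK(33). A device whose
 * effective mask (dma_mask, or a smaller coherent_dma_mask) is only
 * DMA_BIT_MASK(32) then fails the comparison above and is left out of the
 * 1:1 domain, while a 64-bit capable device passes and stays
 * identity-mapped.
 */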
2900 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2904 if (!iommu_should_identity_map(dev, 1))
2907 ret = domain_add_dev_info(si_domain, dev);
2909 pr_info("%s identity mapping for device %s\n",
2910 hw ? "Hardware" : "Software", dev_name(dev));
2911 else if (ret == -ENODEV)
2912 /* device not associated with an iommu */
2919 static int __init iommu_prepare_static_identity_mapping(int hw)
2921 struct pci_dev *pdev = NULL;
2922 struct dmar_drhd_unit *drhd;
2923 struct intel_iommu *iommu;
2928 for_each_pci_dev(pdev) {
2929 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2934 for_each_active_iommu(iommu, drhd)
2935 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2936 struct acpi_device_physical_node *pn;
2937 struct acpi_device *adev;
2939 if (dev->bus != &acpi_bus_type)
2942 adev = to_acpi_device(dev);
2943 mutex_lock(&adev->physical_node_lock);
2944 list_for_each_entry(pn, &adev->physical_node_list, node) {
2945 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2949 mutex_unlock(&adev->physical_node_lock);
2957 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2960 * Start from the sane iommu hardware state.
2961 * If the queued invalidation is already initialized by us
2962 * (for example, while enabling interrupt-remapping) then
2963 * we already have things rolling from a sane state.
2967 * Clear any previous faults.
2969 dmar_fault(-1, iommu);
2971 * Disable queued invalidation if supported and already enabled
2972 * before OS handover.
2974 dmar_disable_qi(iommu);
2977 if (dmar_enable_qi(iommu)) {
2979 * Queued Invalidate not enabled, use Register Based Invalidate
2981 iommu->flush.flush_context = __iommu_flush_context;
2982 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2983 pr_info("%s: Using Register based invalidation\n",
2986 iommu->flush.flush_context = qi_flush_context;
2987 iommu->flush.flush_iotlb = qi_flush_iotlb;
2988 pr_info("%s: Using Queued invalidation\n", iommu->name);
2992 static int copy_context_table(struct intel_iommu *iommu,
2993 struct root_entry *old_re,
2994 struct context_entry **tbl,
2997 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2998 struct context_entry *new_ce = NULL, ce;
2999 struct context_entry *old_ce = NULL;
3000 struct root_entry re;
3001 phys_addr_t old_ce_phys;
3003 tbl_idx = ext ? bus * 2 : bus;
3004 memcpy(&re, old_re, sizeof(re));
3006 for (devfn = 0; devfn < 256; devfn++) {
3007 /* First calculate the correct index */
3008 idx = (ext ? devfn * 2 : devfn) % 256;
3011 /* First save what we may have and clean up */
3013 tbl[tbl_idx] = new_ce;
3014 __iommu_flush_cache(iommu, new_ce,
3024 old_ce_phys = root_entry_lctp(&re);
3026 old_ce_phys = root_entry_uctp(&re);
3029 if (ext && devfn == 0) {
3030 /* No LCTP, try UCTP */
3039 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3044 new_ce = alloc_pgtable_page(iommu->node);
3051 /* Now copy the context entry */
3052 memcpy(&ce, old_ce + idx, sizeof(ce));
3054 if (!__context_present(&ce))
3057 did = context_domain_id(&ce);
3058 if (did >= 0 && did < cap_ndoms(iommu->cap))
3059 set_bit(did, iommu->domain_ids);
3062 * We need a marker for copied context entries. This
3063 * marker needs to work for the old format as well as
3064 * for extended context entries.
3066 * Bit 67 of the context entry is used. In the old
3067 * format this bit is available to software; in the
3068 * extended format it is the PGE bit, but PGE is ignored
3069 * by HW if PASIDs are disabled (and thus still available).
3072 * So disable PASIDs first and then mark the entry
3073 * copied. This means that we don't copy PASID
3074 * translations from the old kernel, but this is fine as
3075 * faults there are not fatal.
3077 context_clear_pasid_enable(&ce);
3078 context_set_copied(&ce);
3083 tbl[tbl_idx + pos] = new_ce;
3085 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3094 static int copy_translation_tables(struct intel_iommu *iommu)
3096 struct context_entry **ctxt_tbls;
3097 struct root_entry *old_rt;
3098 phys_addr_t old_rt_phys;
3099 int ctxt_table_entries;
3100 unsigned long flags;
3105 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3106 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3107 new_ext = !!ecap_ecs(iommu->ecap);
3110 * The RTT bit can only be changed when translation is disabled,
3111 * but disabling translation would open a window for data
3112 * corruption. So bail out and don't copy anything if we would
3113 * have to change the bit.
3118 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3122 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3126 /* This is too big for the stack - allocate it from slab */
3127 ctxt_table_entries = ext ? 512 : 256;
3129 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3133 for (bus = 0; bus < 256; bus++) {
3134 ret = copy_context_table(iommu, &old_rt[bus],
3135 ctxt_tbls, bus, ext);
3137 pr_err("%s: Failed to copy context table for bus %d\n",
3143 spin_lock_irqsave(&iommu->lock, flags);
3145 /* Context tables are copied, now write them to the root_entry table */
3146 for (bus = 0; bus < 256; bus++) {
3147 int idx = ext ? bus * 2 : bus;
3150 if (ctxt_tbls[idx]) {
3151 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3152 iommu->root_entry[bus].lo = val;
3155 if (!ext || !ctxt_tbls[idx + 1])
3158 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3159 iommu->root_entry[bus].hi = val;
3162 spin_unlock_irqrestore(&iommu->lock, flags);
3166 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
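/*
 * Illustrative example (hypothetical bus number): in extended root/context
 * mode each bus gets two copied context tables, one per half of the devfn
 * space, so for bus 3 the loop above uses idx = 3 * 2 = 6 and writes
 *
 *	root_entry[3].lo = virt_to_phys(ctxt_tbls[6]) | 1;	(devfn 0-127)
 *	root_entry[3].hi = virt_to_phys(ctxt_tbls[7]) | 1;	(devfn 128-255)
 *
 * with the low bit acting as the present bit; in legacy mode only .lo is
 * populated, from ctxt_tbls[3].
 */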
3176 static int __init init_dmars(void)
3178 struct dmar_drhd_unit *drhd;
3179 struct dmar_rmrr_unit *rmrr;
3180 bool copied_tables = false;
3182 struct intel_iommu *iommu;
3188 * initialize and program root entry to not present
3191 for_each_drhd_unit(drhd) {
3193 * lock not needed as this is only incremented in the single-
3194 * threaded kernel __init code path; all other accesses are read-only
3197 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3201 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3204 /* Preallocate enough resources for IOMMU hot-addition */
3205 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3206 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3208 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3211 pr_err("Allocating global iommu array failed\n");
3216 for_each_active_iommu(iommu, drhd) {
3218 * Find the max pasid size of all IOMMUs in the system.
3219 * We need to ensure the system pasid table is no bigger
3220 * than the smallest supported.
3222 if (pasid_supported(iommu)) {
3223 u32 temp = 2 << ecap_pss(iommu->ecap);
3225 intel_pasid_max_id = min_t(u32, temp,
3226 intel_pasid_max_id);
3229 g_iommus[iommu->seq_id] = iommu;
3231 intel_iommu_init_qi(iommu);
3233 ret = iommu_init_domains(iommu);
3237 init_translation_status(iommu);
3239 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240 iommu_disable_translation(iommu);
3241 clear_translation_pre_enabled(iommu);
3242 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3248 * we could share the same root & context tables
3249 * among all IOMMUs; we need to split it later.
3251 ret = iommu_alloc_root_entry(iommu);
3255 if (translation_pre_enabled(iommu)) {
3256 pr_info("Translation already enabled - trying to copy translation structures\n");
3258 ret = copy_translation_tables(iommu);
3261 * We found the IOMMU with translation
3262 * enabled - but failed to copy over the
3263 * old root-entry table. Try to proceed
3264 * by disabling translation now and
3265 * allocating a clean root-entry table.
3266 * This might cause DMAR faults, but
3267 * probably the dump will still succeed.
3269 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3271 iommu_disable_translation(iommu);
3272 clear_translation_pre_enabled(iommu);
3274 pr_info("Copied translation tables from previous kernel for %s\n",
3276 copied_tables = true;
3280 if (!ecap_pass_through(iommu->ecap))
3281 hw_pass_through = 0;
3282 #ifdef CONFIG_INTEL_IOMMU_SVM
3283 if (pasid_supported(iommu))
3284 intel_svm_init(iommu);
3289 * Now that qi is enabled on all iommus, set the root entry and flush
3290 * caches. This is required on some Intel X58 chipsets, otherwise the
3291 * flush_context function will loop forever and the boot hangs.
3293 for_each_active_iommu(iommu, drhd) {
3294 iommu_flush_write_buffer(iommu);
3295 iommu_set_root_entry(iommu);
3296 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3297 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3300 if (iommu_pass_through)
3301 iommu_identity_mapping |= IDENTMAP_ALL;
3303 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3304 iommu_identity_mapping |= IDENTMAP_GFX;
3307 check_tylersburg_isoch();
3309 if (iommu_identity_mapping) {
3310 ret = si_domain_init(hw_pass_through);
3317 * If we copied translations from a previous kernel in the kdump
3318 * case, we cannot assign the devices to domains now, as that
3319 * would eliminate the old mappings. So skip this part and defer
3320 * the assignment to device driver initialization time.
3326 * If pass-through is not set or not enabled, set up context entries for
3327 * identity mappings for rmrr, gfx, and isa, possibly falling back to static
3328 * identity mapping if iommu_identity_mapping is set.
3330 if (iommu_identity_mapping) {
3331 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3333 pr_crit("Failed to setup IOMMU pass-through\n");
3339 * for each dev attached to rmrr
3341 * locate drhd for dev, alloc domain for dev
3342 * allocate free domain
3343 * allocate page table entries for rmrr
3344 * if context not allocated for bus
3345 * allocate and init context
3346 * set present in root table for this bus
3347 * init context with domain, translation etc
3351 pr_info("Setting RMRR:\n");
3352 for_each_rmrr_units(rmrr) {
3353 /* some BIOSes list non-existent devices in the DMAR table. */
3354 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3356 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3358 pr_err("Mapping reserved region failed\n");
3362 iommu_prepare_isa();
3369 * global invalidate context cache
3370 * global invalidate iotlb
3371 * enable translation
3373 for_each_iommu(iommu, drhd) {
3374 if (drhd->ignored) {
3376 * we always have to disable PMRs or DMA may fail on this device
3380 iommu_disable_protect_mem_regions(iommu);
3384 iommu_flush_write_buffer(iommu);
3386 #ifdef CONFIG_INTEL_IOMMU_SVM
3387 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3388 ret = intel_svm_enable_prq(iommu);
3393 ret = dmar_set_interrupt(iommu);
3397 if (!translation_pre_enabled(iommu))
3398 iommu_enable_translation(iommu);
3400 iommu_disable_protect_mem_regions(iommu);
3406 for_each_active_iommu(iommu, drhd) {
3407 disable_dmar_iommu(iommu);
3408 free_dmar_iommu(iommu);
3417 /* This takes a number of _MM_ pages, not VTD pages */
3418 static unsigned long intel_alloc_iova(struct device *dev,
3419 struct dmar_domain *domain,
3420 unsigned long nrpages, uint64_t dma_mask)
3422 unsigned long iova_pfn = 0;
3424 /* Restrict dma_mask to the width that the iommu can handle */
3425 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3426 /* Ensure we reserve the whole size-aligned region */
3427 nrpages = __roundup_pow_of_two(nrpages);
3429 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3431 * First try to allocate an io virtual address in
3432 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range
3435 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3436 IOVA_PFN(DMA_BIT_MASK(32)), false);
3440 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3441 IOVA_PFN(dma_mask), true);
3442 if (unlikely(!iova_pfn)) {
3443 pr_err("Allocating %ld-page iova for %s failed",
3444 nrpages, dev_name(dev));
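/*
 * Illustrative example (hypothetical request): asking for 3 MM pages is
 * rounded up to a size-aligned region of 4 pages by __roundup_pow_of_two(),
 * and for a 64-bit dma_mask (and no forcedac) the allocator first tries to
 * place the IOVA below 4GiB before retrying with the full mask:
 *
 *	iova_pfn = intel_alloc_iova(dev, domain, 3, DMA_BIT_MASK(64));
 */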
3451 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3453 struct dmar_domain *domain, *tmp;
3454 struct dmar_rmrr_unit *rmrr;
3455 struct device *i_dev;
3458 domain = find_domain(dev);
3462 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3466 /* We have a new domain - setup possible RMRRs for the device */
3468 for_each_rmrr_units(rmrr) {
3469 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3474 ret = domain_prepare_identity_map(dev, domain,
3478 dev_err(dev, "Mapping reserved region failed\n");
3483 tmp = set_domain_for_dev(dev, domain);
3484 if (!tmp || domain != tmp) {
3485 domain_exit(domain);
3492 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3498 /* Check if the dev needs to go through the non-identity map and unmap process. */
3499 static int iommu_no_mapping(struct device *dev)
3503 if (iommu_dummy(dev))
3506 if (!iommu_identity_mapping)
3509 found = identity_mapping(dev);
3511 if (iommu_should_identity_map(dev, 0))
3515 * The 32-bit DMA device is removed from si_domain and falls back
3516 * to non-identity mapping.
3518 dmar_remove_one_dev_info(si_domain, dev);
3519 pr_info("32bit %s uses non-identity mapping\n",
3525 * In case a 64-bit DMA device is detached from a VM, the device
3526 * is put into si_domain for identity mapping.
3528 if (iommu_should_identity_map(dev, 0)) {
3530 ret = domain_add_dev_info(si_domain, dev);
3532 pr_info("64bit %s uses identity mapping\n",
3542 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3543 size_t size, int dir, u64 dma_mask)
3545 struct dmar_domain *domain;
3546 phys_addr_t start_paddr;
3547 unsigned long iova_pfn;
3550 struct intel_iommu *iommu;
3551 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3553 BUG_ON(dir == DMA_NONE);
3555 if (iommu_no_mapping(dev))
3558 domain = get_valid_domain_for_dev(dev);
3562 iommu = domain_get_iommu(domain);
3563 size = aligned_nrpages(paddr, size);
3565 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3570 * Check if DMAR supports zero-length reads on write-only mappings
3573 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3574 !cap_zlr(iommu->cap))
3575 prot |= DMA_PTE_READ;
3576 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3577 prot |= DMA_PTE_WRITE;
3579 * paddr to (paddr + size) might span a partial page, so we should map
3580 * the whole page. Note: if two parts of one page are mapped separately,
3581 * we might end up with two guest addresses mapping to the same host paddr,
3582 * but this is not a big problem
3584 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3585 mm_to_dma_pfn(paddr_pfn), size, prot);
3589 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3590 start_paddr += paddr & ~PAGE_MASK;
3595 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3596 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3597 dev_name(dev), size, (unsigned long long)paddr, dir);
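/*
 * Worked example (hypothetical values): mapping size 0x2000 starting at
 * paddr 0x1234 touches three 4KiB pages, so aligned_nrpages(0x1234, 0x2000)
 * yields 3 and three PTEs are installed; the returned handle is
 *
 *	start_paddr = ((phys_addr_t)iova_pfn << PAGE_SHIFT) + 0x234
 *
 * i.e. the sub-page offset (paddr & ~PAGE_MASK) is added back so the caller
 * sees the same offset it passed in.
 */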
3601 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3602 unsigned long offset, size_t size,
3603 enum dma_data_direction dir,
3604 unsigned long attrs)
3606 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3607 dir, *dev->dma_mask);
3610 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3612 struct dmar_domain *domain;
3613 unsigned long start_pfn, last_pfn;
3614 unsigned long nrpages;
3615 unsigned long iova_pfn;
3616 struct intel_iommu *iommu;
3617 struct page *freelist;
3619 if (iommu_no_mapping(dev))
3622 domain = find_domain(dev);
3625 iommu = domain_get_iommu(domain);
3627 iova_pfn = IOVA_PFN(dev_addr);
3629 nrpages = aligned_nrpages(dev_addr, size);
3630 start_pfn = mm_to_dma_pfn(iova_pfn);
3631 last_pfn = start_pfn + nrpages - 1;
3633 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3634 dev_name(dev), start_pfn, last_pfn);
3636 freelist = domain_unmap(domain, start_pfn, last_pfn);
3638 if (intel_iommu_strict) {
3639 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3640 nrpages, !freelist, 0);
3642 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3643 dma_free_pagelist(freelist);
3645 queue_iova(&domain->iovad, iova_pfn, nrpages,
3646 (unsigned long)freelist);
3648 * queue up the release of the unmap to save the 1/6th of the
3649 * cpu used up by the iotlb flush operation...
3654 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3655 size_t size, enum dma_data_direction dir,
3656 unsigned long attrs)
3658 intel_unmap(dev, dev_addr, size);
3661 static void *intel_alloc_coherent(struct device *dev, size_t size,
3662 dma_addr_t *dma_handle, gfp_t flags,
3663 unsigned long attrs)
3665 struct page *page = NULL;
3668 size = PAGE_ALIGN(size);
3669 order = get_order(size);
3671 if (!iommu_no_mapping(dev))
3672 flags &= ~(GFP_DMA | GFP_DMA32);
3673 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3674 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3680 if (gfpflags_allow_blocking(flags)) {
3681 unsigned int count = size >> PAGE_SHIFT;
3683 page = dma_alloc_from_contiguous(dev, count, order,
3684 flags & __GFP_NOWARN);
3685 if (page && iommu_no_mapping(dev) &&
3686 page_to_phys(page) + size > dev->coherent_dma_mask) {
3687 dma_release_from_contiguous(dev, page, count);
3693 page = alloc_pages(flags, order);
3696 memset(page_address(page), 0, size);
3698 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3700 dev->coherent_dma_mask);
3702 return page_address(page);
3703 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3704 __free_pages(page, order);
3709 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3710 dma_addr_t dma_handle, unsigned long attrs)
3713 struct page *page = virt_to_page(vaddr);
3715 size = PAGE_ALIGN(size);
3716 order = get_order(size);
3718 intel_unmap(dev, dma_handle, size);
3719 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3720 __free_pages(page, order);
3723 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3724 int nelems, enum dma_data_direction dir,
3725 unsigned long attrs)
3727 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3728 unsigned long nrpages = 0;
3729 struct scatterlist *sg;
3732 for_each_sg(sglist, sg, nelems, i) {
3733 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3736 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3739 static int intel_nontranslate_map_sg(struct device *hddev,
3740 struct scatterlist *sglist, int nelems, int dir)
3743 struct scatterlist *sg;
3745 for_each_sg(sglist, sg, nelems, i) {
3746 BUG_ON(!sg_page(sg));
3747 sg->dma_address = sg_phys(sg);
3748 sg->dma_length = sg->length;
3753 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3754 enum dma_data_direction dir, unsigned long attrs)
3757 struct dmar_domain *domain;
3760 unsigned long iova_pfn;
3762 struct scatterlist *sg;
3763 unsigned long start_vpfn;
3764 struct intel_iommu *iommu;
3766 BUG_ON(dir == DMA_NONE);
3767 if (iommu_no_mapping(dev))
3768 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3770 domain = get_valid_domain_for_dev(dev);
3774 iommu = domain_get_iommu(domain);
3776 for_each_sg(sglist, sg, nelems, i)
3777 size += aligned_nrpages(sg->offset, sg->length);
3779 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3782 sglist->dma_length = 0;
3787 * Check if DMAR supports zero-length reads on write-only mappings
3790 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3791 !cap_zlr(iommu->cap))
3792 prot |= DMA_PTE_READ;
3793 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3794 prot |= DMA_PTE_WRITE;
3796 start_vpfn = mm_to_dma_pfn(iova_pfn);
3798 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3799 if (unlikely(ret)) {
3800 dma_pte_free_pagetable(domain, start_vpfn,
3801 start_vpfn + size - 1,
3802 agaw_to_level(domain->agaw) + 1);
3803 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3810 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3815 static const struct dma_map_ops intel_dma_ops = {
3816 .alloc = intel_alloc_coherent,
3817 .free = intel_free_coherent,
3818 .map_sg = intel_map_sg,
3819 .unmap_sg = intel_unmap_sg,
3820 .map_page = intel_map_page,
3821 .unmap_page = intel_unmap_page,
3822 .mapping_error = intel_mapping_error,
3823 .dma_supported = dma_direct_supported,
3826 static inline int iommu_domain_cache_init(void)
3830 iommu_domain_cache = kmem_cache_create("iommu_domain",
3831 sizeof(struct dmar_domain),
3836 if (!iommu_domain_cache) {
3837 pr_err("Couldn't create iommu_domain cache\n");
3844 static inline int iommu_devinfo_cache_init(void)
3848 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3849 sizeof(struct device_domain_info),
3853 if (!iommu_devinfo_cache) {
3854 pr_err("Couldn't create devinfo cache\n");
3861 static int __init iommu_init_mempool(void)
3864 ret = iova_cache_get();
3868 ret = iommu_domain_cache_init();
3872 ret = iommu_devinfo_cache_init();
3876 kmem_cache_destroy(iommu_domain_cache);
3883 static void __init iommu_exit_mempool(void)
3885 kmem_cache_destroy(iommu_devinfo_cache);
3886 kmem_cache_destroy(iommu_domain_cache);
3890 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3892 struct dmar_drhd_unit *drhd;
3896 /* We know that this device on this chipset has its own IOMMU.
3897 * If we find it under a different IOMMU, then the BIOS is lying
3898 * to us. Hope that the IOMMU for this device is actually
3899 * disabled, and it needs no translation...
3901 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3903 /* "can't" happen */
3904 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3907 vtbar &= 0xffff0000;
3909 /* we know that this iommu should be at offset 0xa000 from vtbar */
3910 drhd = dmar_find_matched_drhd_unit(pdev);
3911 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3912 TAINT_FIRMWARE_WORKAROUND,
3913 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3914 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3916 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3918 static void __init init_no_remapping_devices(void)
3920 struct dmar_drhd_unit *drhd;
3924 for_each_drhd_unit(drhd) {
3925 if (!drhd->include_all) {
3926 for_each_active_dev_scope(drhd->devices,
3927 drhd->devices_cnt, i, dev)
3929 /* ignore DMAR unit if no devices exist */
3930 if (i == drhd->devices_cnt)
3935 for_each_active_drhd_unit(drhd) {
3936 if (drhd->include_all)
3939 for_each_active_dev_scope(drhd->devices,
3940 drhd->devices_cnt, i, dev)
3941 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3943 if (i < drhd->devices_cnt)
3946 /* This IOMMU has *only* gfx devices. Either bypass it or
3947 set the gfx_mapped flag, as appropriate */
3949 intel_iommu_gfx_mapped = 1;
3952 for_each_active_dev_scope(drhd->devices,
3953 drhd->devices_cnt, i, dev)
3954 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3959 #ifdef CONFIG_SUSPEND
3960 static int init_iommu_hw(void)
3962 struct dmar_drhd_unit *drhd;
3963 struct intel_iommu *iommu = NULL;
3965 for_each_active_iommu(iommu, drhd)
3967 dmar_reenable_qi(iommu);
3969 for_each_iommu(iommu, drhd) {
3970 if (drhd->ignored) {
3972 * we always have to disable PMRs or DMA may fail on this device
3976 iommu_disable_protect_mem_regions(iommu);
3980 iommu_flush_write_buffer(iommu);
3982 iommu_set_root_entry(iommu);
3984 iommu->flush.flush_context(iommu, 0, 0, 0,
3985 DMA_CCMD_GLOBAL_INVL);
3986 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3987 iommu_enable_translation(iommu);
3988 iommu_disable_protect_mem_regions(iommu);
3994 static void iommu_flush_all(void)
3996 struct dmar_drhd_unit *drhd;
3997 struct intel_iommu *iommu;
3999 for_each_active_iommu(iommu, drhd) {
4000 iommu->flush.flush_context(iommu, 0, 0, 0,
4001 DMA_CCMD_GLOBAL_INVL);
4002 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4003 DMA_TLB_GLOBAL_FLUSH);
4007 static int iommu_suspend(void)
4009 struct dmar_drhd_unit *drhd;
4010 struct intel_iommu *iommu = NULL;
4013 for_each_active_iommu(iommu, drhd) {
4014 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4016 if (!iommu->iommu_state)
4022 for_each_active_iommu(iommu, drhd) {
4023 iommu_disable_translation(iommu);
4025 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4027 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4028 readl(iommu->reg + DMAR_FECTL_REG);
4029 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4030 readl(iommu->reg + DMAR_FEDATA_REG);
4031 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4032 readl(iommu->reg + DMAR_FEADDR_REG);
4033 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4034 readl(iommu->reg + DMAR_FEUADDR_REG);
4036 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4041 for_each_active_iommu(iommu, drhd)
4042 kfree(iommu->iommu_state);
4047 static void iommu_resume(void)
4049 struct dmar_drhd_unit *drhd;
4050 struct intel_iommu *iommu = NULL;
4053 if (init_iommu_hw()) {
4055 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4057 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4061 for_each_active_iommu(iommu, drhd) {
4063 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4065 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4066 iommu->reg + DMAR_FECTL_REG);
4067 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4068 iommu->reg + DMAR_FEDATA_REG);
4069 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4070 iommu->reg + DMAR_FEADDR_REG);
4071 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4072 iommu->reg + DMAR_FEUADDR_REG);
4074 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4077 for_each_active_iommu(iommu, drhd)
4078 kfree(iommu->iommu_state);
4081 static struct syscore_ops iommu_syscore_ops = {
4082 .resume = iommu_resume,
4083 .suspend = iommu_suspend,
4086 static void __init init_iommu_pm_ops(void)
4088 register_syscore_ops(&iommu_syscore_ops);
4092 static inline void init_iommu_pm_ops(void) {}
4093 #endif /* CONFIG_PM */
4096 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4098 struct acpi_dmar_reserved_memory *rmrr;
4099 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4100 struct dmar_rmrr_unit *rmrru;
4103 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4107 rmrru->hdr = header;
4108 rmrr = (struct acpi_dmar_reserved_memory *)header;
4109 rmrru->base_address = rmrr->base_address;
4110 rmrru->end_address = rmrr->end_address;
4112 length = rmrr->end_address - rmrr->base_address + 1;
4113 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4118 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4119 ((void *)rmrr) + rmrr->header.length,
4120 &rmrru->devices_cnt);
4121 if (rmrru->devices_cnt && rmrru->devices == NULL)
4124 list_add(&rmrru->list, &dmar_rmrr_units);
4135 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4137 struct dmar_atsr_unit *atsru;
4138 struct acpi_dmar_atsr *tmp;
4140 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4141 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4142 if (atsr->segment != tmp->segment)
4144 if (atsr->header.length != tmp->header.length)
4146 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4155 struct acpi_dmar_atsr *atsr;
4156 struct dmar_atsr_unit *atsru;
4158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4161 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4162 atsru = dmar_find_atsr(atsr);
4166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4171 * If memory is allocated from slab by ACPI _DSM method, we need to
4172 * copy the memory content because the memory buffer will be freed on exit.
4175 atsru->hdr = (void *)(atsru + 1);
4176 memcpy(atsru->hdr, hdr, hdr->length);
4177 atsru->include_all = atsr->flags & 0x1;
4178 if (!atsru->include_all) {
4179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4180 (void *)atsr + atsr->header.length,
4181 &atsru->devices_cnt);
4182 if (atsru->devices_cnt && atsru->devices == NULL) {
4188 list_add_rcu(&atsru->list, &dmar_atsr_units);
4193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4201 struct acpi_dmar_atsr *atsr;
4202 struct dmar_atsr_unit *atsru;
4204 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4205 atsru = dmar_find_atsr(atsr);
4207 list_del_rcu(&atsru->list);
4209 intel_iommu_free_atsr(atsru);
4215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4219 struct acpi_dmar_atsr *atsr;
4220 struct dmar_atsr_unit *atsru;
4222 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4223 atsru = dmar_find_atsr(atsr);
4227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4236 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4239 struct intel_iommu *iommu = dmaru->iommu;
4241 if (g_iommus[iommu->seq_id])
4244 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4245 pr_warn("%s: Doesn't support hardware pass through.\n",
4249 if (!ecap_sc_support(iommu->ecap) &&
4250 domain_update_iommu_snooping(iommu)) {
4251 pr_warn("%s: Doesn't support snooping.\n",
4255 sp = domain_update_iommu_superpage(iommu) - 1;
4256 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4257 pr_warn("%s: Doesn't support large page.\n",
4263 * Disable translation if already enabled prior to OS handover.
4265 if (iommu->gcmd & DMA_GCMD_TE)
4266 iommu_disable_translation(iommu);
4268 g_iommus[iommu->seq_id] = iommu;
4269 ret = iommu_init_domains(iommu);
4271 ret = iommu_alloc_root_entry(iommu);
4275 #ifdef CONFIG_INTEL_IOMMU_SVM
4276 if (pasid_supported(iommu))
4277 intel_svm_init(iommu);
4280 if (dmaru->ignored) {
4282 * we always have to disable PMRs or DMA may fail on this device
4285 iommu_disable_protect_mem_regions(iommu);
4289 intel_iommu_init_qi(iommu);
4290 iommu_flush_write_buffer(iommu);
4292 #ifdef CONFIG_INTEL_IOMMU_SVM
4293 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4294 ret = intel_svm_enable_prq(iommu);
4299 ret = dmar_set_interrupt(iommu);
4303 iommu_set_root_entry(iommu);
4304 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4305 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4306 iommu_enable_translation(iommu);
4308 iommu_disable_protect_mem_regions(iommu);
4312 disable_dmar_iommu(iommu);
4314 free_dmar_iommu(iommu);
4318 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4321 struct intel_iommu *iommu = dmaru->iommu;
4323 if (!intel_iommu_enabled)
4329 ret = intel_iommu_add(dmaru);
4331 disable_dmar_iommu(iommu);
4332 free_dmar_iommu(iommu);
4338 static void intel_iommu_free_dmars(void)
4340 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4341 struct dmar_atsr_unit *atsru, *atsr_n;
4343 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4344 list_del(&rmrru->list);
4345 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4350 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4351 list_del(&atsru->list);
4352 intel_iommu_free_atsr(atsru);
4356 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4359 struct pci_bus *bus;
4360 struct pci_dev *bridge = NULL;
4362 struct acpi_dmar_atsr *atsr;
4363 struct dmar_atsr_unit *atsru;
4365 dev = pci_physfn(dev);
4366 for (bus = dev->bus; bus; bus = bus->parent) {
4368 /* If it's an integrated device, allow ATS */
4371 /* Connected via non-PCIe: no ATS */
4372 if (!pci_is_pcie(bridge) ||
4373 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4375 /* If we found the root port, look it up in the ATSR */
4376 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4381 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4382 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4383 if (atsr->segment != pci_domain_nr(dev->bus))
4386 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4387 if (tmp == &bridge->dev)
4390 if (atsru->include_all)
4400 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4403 struct dmar_rmrr_unit *rmrru;
4404 struct dmar_atsr_unit *atsru;
4405 struct acpi_dmar_atsr *atsr;
4406 struct acpi_dmar_reserved_memory *rmrr;
4408 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4411 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4412 rmrr = container_of(rmrru->hdr,
4413 struct acpi_dmar_reserved_memory, header);
4414 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4415 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4416 ((void *)rmrr) + rmrr->header.length,
4417 rmrr->segment, rmrru->devices,
4418 rmrru->devices_cnt);
4421 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4422 dmar_remove_dev_scope(info, rmrr->segment,
4423 rmrru->devices, rmrru->devices_cnt);
4427 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4428 if (atsru->include_all)
4431 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4432 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4433 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4434 (void *)atsr + atsr->header.length,
4435 atsr->segment, atsru->devices,
4436 atsru->devices_cnt);
4441 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4442 if (dmar_remove_dev_scope(info, atsr->segment,
4443 atsru->devices, atsru->devices_cnt))
4452 * Here we only respond to the action of a device being unbound from its driver.
4454 * A newly added device is not attached to its DMAR domain here yet; that will
4455 * happen when the device is mapped to an iova.
4457 static int device_notifier(struct notifier_block *nb,
4458 unsigned long action, void *data)
4460 struct device *dev = data;
4461 struct dmar_domain *domain;
4463 if (iommu_dummy(dev))
4466 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4469 domain = find_domain(dev);
4473 dmar_remove_one_dev_info(domain, dev);
4474 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4475 domain_exit(domain);
4480 static struct notifier_block device_nb = {
4481 .notifier_call = device_notifier,
4484 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4485 unsigned long val, void *v)
4487 struct memory_notify *mhp = v;
4488 unsigned long long start, end;
4489 unsigned long start_vpfn, last_vpfn;
4492 case MEM_GOING_ONLINE:
4493 start = mhp->start_pfn << PAGE_SHIFT;
4494 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4495 if (iommu_domain_identity_map(si_domain, start, end)) {
4496 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4503 case MEM_CANCEL_ONLINE:
4504 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4505 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4506 while (start_vpfn <= last_vpfn) {
4508 struct dmar_drhd_unit *drhd;
4509 struct intel_iommu *iommu;
4510 struct page *freelist;
4512 iova = find_iova(&si_domain->iovad, start_vpfn);
4514 pr_debug("Failed get IOVA for PFN %lx\n",
4519 iova = split_and_remove_iova(&si_domain->iovad, iova,
4520 start_vpfn, last_vpfn);
4522 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4523 start_vpfn, last_vpfn);
4527 freelist = domain_unmap(si_domain, iova->pfn_lo,
4531 for_each_active_iommu(iommu, drhd)
4532 iommu_flush_iotlb_psi(iommu, si_domain,
4533 iova->pfn_lo, iova_size(iova),
4536 dma_free_pagelist(freelist);
4538 start_vpfn = iova->pfn_hi + 1;
4539 free_iova_mem(iova);
4547 static struct notifier_block intel_iommu_memory_nb = {
4548 .notifier_call = intel_iommu_memory_notifier,
4552 static void free_all_cpu_cached_iovas(unsigned int cpu)
4556 for (i = 0; i < g_num_of_iommus; i++) {
4557 struct intel_iommu *iommu = g_iommus[i];
4558 struct dmar_domain *domain;
4564 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4565 domain = get_iommu_domain(iommu, (u16)did);
4569 free_cpu_cached_iovas(cpu, &domain->iovad);
4574 static int intel_iommu_cpu_dead(unsigned int cpu)
4576 free_all_cpu_cached_iovas(cpu);
4580 static void intel_disable_iommus(void)
4582 struct intel_iommu *iommu = NULL;
4583 struct dmar_drhd_unit *drhd;
4585 for_each_iommu(iommu, drhd)
4586 iommu_disable_translation(iommu);
4589 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4591 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4593 return container_of(iommu_dev, struct intel_iommu, iommu);
4596 static ssize_t intel_iommu_show_version(struct device *dev,
4597 struct device_attribute *attr,
4600 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4601 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4602 return sprintf(buf, "%d:%d\n",
4603 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4605 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4607 static ssize_t intel_iommu_show_address(struct device *dev,
4608 struct device_attribute *attr,
4611 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4612 return sprintf(buf, "%llx\n", iommu->reg_phys);
4614 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4616 static ssize_t intel_iommu_show_cap(struct device *dev,
4617 struct device_attribute *attr,
4620 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4621 return sprintf(buf, "%llx\n", iommu->cap);
4623 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4625 static ssize_t intel_iommu_show_ecap(struct device *dev,
4626 struct device_attribute *attr,
4629 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4630 return sprintf(buf, "%llx\n", iommu->ecap);
4632 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4634 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4635 struct device_attribute *attr,
4638 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4639 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4641 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4643 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4644 struct device_attribute *attr,
4647 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4648 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4649 cap_ndoms(iommu->cap)));
4651 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4653 static struct attribute *intel_iommu_attrs[] = {
4654 &dev_attr_version.attr,
4655 &dev_attr_address.attr,
4657 &dev_attr_ecap.attr,
4658 &dev_attr_domains_supported.attr,
4659 &dev_attr_domains_used.attr,
4663 static struct attribute_group intel_iommu_group = {
4664 .name = "intel-iommu",
4665 .attrs = intel_iommu_attrs,
4668 const struct attribute_group *intel_iommu_groups[] = {
4673 int __init intel_iommu_init(void)
4676 struct dmar_drhd_unit *drhd;
4677 struct intel_iommu *iommu;
4679 /* VT-d is required for a TXT/tboot launch, so enforce that */
4680 force_on = tboot_force_iommu();
4682 if (iommu_init_mempool()) {
4684 panic("tboot: Failed to initialize iommu memory\n");
4688 down_write(&dmar_global_lock);
4689 if (dmar_table_init()) {
4691 panic("tboot: Failed to initialize DMAR table\n");
4695 if (dmar_dev_scope_init() < 0) {
4697 panic("tboot: Failed to initialize DMAR device scope\n");
4701 up_write(&dmar_global_lock);
4704 * The bus notifier takes the dmar_global_lock, so lockdep will
4705 * complain later when we register it under the lock.
4707 dmar_register_bus_notifier();
4709 down_write(&dmar_global_lock);
4711 if (no_iommu || dmar_disabled) {
4713 * We exit the function here to ensure the IOMMU's remapping and
4714 * mempool aren't set up, which means that the IOMMU's PMRs
4715 * won't be disabled via the call to init_dmars(). So disable
4716 * them explicitly here. The PMRs were set up by tboot prior to
4717 * calling SENTER, but the kernel is expected to reset/tear them down.
4720 if (intel_iommu_tboot_noforce) {
4721 for_each_iommu(iommu, drhd)
4722 iommu_disable_protect_mem_regions(iommu);
4726 * Make sure the IOMMUs are switched off, even when we
4727 * boot into a kexec kernel and the previous kernel left them enabled
4730 intel_disable_iommus();
4734 if (list_empty(&dmar_rmrr_units))
4735 pr_info("No RMRR found\n");
4737 if (list_empty(&dmar_atsr_units))
4738 pr_info("No ATSR found\n");
4740 if (dmar_init_reserved_ranges()) {
4742 panic("tboot: Failed to reserve iommu ranges\n");
4743 goto out_free_reserved_range;
4746 init_no_remapping_devices();
4751 panic("tboot: Failed to initialize DMARs\n");
4752 pr_err("Initialization failed\n");
4753 goto out_free_reserved_range;
4755 up_write(&dmar_global_lock);
4756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4758 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4761 dma_ops = &intel_dma_ops;
4763 init_iommu_pm_ops();
4765 for_each_active_iommu(iommu, drhd) {
4766 iommu_device_sysfs_add(&iommu->iommu, NULL,
4769 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4770 iommu_device_register(&iommu->iommu);
4773 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4774 bus_register_notifier(&pci_bus_type, &device_nb);
4775 if (si_domain && !hw_pass_through)
4776 register_memory_notifier(&intel_iommu_memory_nb);
4777 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4778 intel_iommu_cpu_dead);
4779 intel_iommu_enabled = 1;
4780 intel_iommu_debugfs_init();
4784 out_free_reserved_range:
4785 put_iova_domain(&reserved_iova_list);
4787 intel_iommu_free_dmars();
4788 up_write(&dmar_global_lock);
4789 iommu_exit_mempool();
4793 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4795 struct intel_iommu *iommu = opaque;
4797 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4802 * NB - intel-iommu lacks any sort of reference counting for the users of
4803 * dependent devices. If multiple endpoints have intersecting dependent
4804 * devices, unbinding the driver from any one of them will possibly leave
4805 * the others unable to operate.
4807 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4809 if (!iommu || !dev || !dev_is_pci(dev))
4812 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4815 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4817 struct intel_iommu *iommu;
4818 unsigned long flags;
4820 assert_spin_locked(&device_domain_lock);
4825 iommu = info->iommu;
4828 iommu_disable_dev_iotlb(info);
4829 domain_context_clear(iommu, info->dev);
4830 intel_pasid_free_table(info->dev);
4833 unlink_domain_info(info);
4835 spin_lock_irqsave(&iommu->lock, flags);
4836 domain_detach_iommu(info->domain, iommu);
4837 spin_unlock_irqrestore(&iommu->lock, flags);
4839 free_devinfo_mem(info);
4842 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4845 struct device_domain_info *info;
4846 unsigned long flags;
4848 spin_lock_irqsave(&device_domain_lock, flags);
4849 info = dev->archdata.iommu;
4850 __dmar_remove_one_dev_info(info);
4851 spin_unlock_irqrestore(&device_domain_lock, flags);
4854 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4858 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4859 domain_reserve_special_ranges(domain);
4861 /* calculate AGAW */
4862 domain->gaw = guest_width;
4863 adjust_width = guestwidth_to_adjustwidth(guest_width);
4864 domain->agaw = width_to_agaw(adjust_width);
4866 domain->iommu_coherency = 0;
4867 domain->iommu_snooping = 0;
4868 domain->iommu_superpage = 0;
4869 domain->max_addr = 0;
4871 /* always allocate the top pgd */
4872 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4875 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4879 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4881 struct dmar_domain *dmar_domain;
4882 struct iommu_domain *domain;
4884 if (type != IOMMU_DOMAIN_UNMANAGED)
4887 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4889 pr_err("Can't allocate dmar_domain\n");
4892 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4893 pr_err("Domain initialization failed\n");
4894 domain_exit(dmar_domain);
4897 domain_update_iommu_cap(dmar_domain);
4899 domain = &dmar_domain->domain;
4900 domain->geometry.aperture_start = 0;
4901 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4902 domain->geometry.force_aperture = true;
4907 static void intel_iommu_domain_free(struct iommu_domain *domain)
4909 domain_exit(to_dmar_domain(domain));
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
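/*
 * Illustrative note: if the domain was set up with a deeper page table than
 * this IOMMU can walk (e.g. domain agaw 3 / 5 levels vs. iommu agaw 2 / 4
 * levels), the loop above discards one top level per iteration; the width
 * check before it guarantees nothing mapped lives above the retained levels.
 */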
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
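/*
 * Worked example: if the requested IOVA falls inside a 2MiB superpage, the
 * PTE lookup above reports level 2, level_to_offset_bits(2) is 9, and the
 * size is rounded up to VTD_PAGE_SIZE << 9 == 2MiB, so the whole superpage
 * is torn down and flushed even though the caller asked for less.
 */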
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
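/*
 * Illustrative sketch (not built here; 'dev' is a placeholder): consumers
 * reach the two callbacks above through the generic IOMMU API rather than
 * calling them directly, e.g.:
 *
 *	LIST_HEAD(resv_regions);
 *	struct iommu_resv_region *region;
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	list_for_each_entry(region, &resv_regions, list)
 *		dev_info(dev, "resv start %pa size %zu type %d\n",
 *			 &region->start, region->length, region->type);
 *	iommu_put_resv_regions(dev, &resv_regions);
 */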
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct device *dev)
{
	int pts, max_pasid;

	max_pasid = intel_pasid_get_dev_max_id(dev);
	pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
	if (pts < 5)
		return 0;

	return pts - 5;
}
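/*
 * Worked example (assuming the context-entry PTS field encodes a PASID
 * table of 2^(PTS + 5) entries): a device whose maximum PASID count is
 * 2^15 has bit 15 as the lowest set bit, so pts is 15 and the helper
 * returns 10, i.e. a 32768-entry table.
 */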
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
			intel_iommu_get_pts(sdev->dev);

		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
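/*
 * Illustrative sketch (never compiled here): kernel users do not call the
 * intel_iommu_* callbacks directly; they go through the generic IOMMU API,
 * which dispatches through the ops table above.  The hypothetical helper
 * below assumes 'dev' is a device already handled by this driver.
 */
#if 0
static int example_map_one_page(struct device *dev, unsigned long iova,
				phys_addr_t paddr)
{
	struct iommu_domain *dom;
	int ret;

	dom = iommu_domain_alloc(dev->bus);		/* intel_iommu_domain_alloc() */
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, dev);		/* intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	ret = iommu_map(dom, iova, paddr, PAGE_SIZE,	/* intel_iommu_map() */
			IOMMU_READ | IOMMU_WRITE);
	if (!ret)
		iommu_unmap(dom, iova, PAGE_SIZE);	/* intel_iommu_unmap() */

	iommu_detach_device(dom, dev);			/* intel_iommu_detach_device() */
out_free:
	iommu_domain_free(dom);				/* intel_iommu_domain_free() */
	return ret;
}
#endif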
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",