[linux.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/dma-direct.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/cacheflush.h>
53 #include <asm/iommu.h>
54
55 #include "irq_remapping.h"
56
57 #define ROOT_SIZE               VTD_PAGE_SIZE
58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
59
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64
65 #define IOAPIC_RANGE_START      (0xfee00000)
66 #define IOAPIC_RANGE_END        (0xfeefffff)
67 #define IOVA_START_ADDR         (0x1000)
68
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73
74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
82
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN          (1)
85
86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware supports
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is a power-of-two multiple of 4KiB and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are a power-of-two multiple of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
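/* ~0xFFFUL sets every bit from bit 12 upwards, i.e. every power-of-two
   size of at least 4KiB is advertised to the IOMMU core. */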
109
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
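/*
 * Worked example: a 48-bit guest address width gives
 * width_to_agaw(48) = DIV_ROUND_UP(48 - 30, 9) = 2, and
 * agaw_to_level(2) = 4, i.e. a 4-level page table. The driver's agaw
 * encoding is thus 0/1/2/3 <-> 30/39/48/57-bit widths (2/3/4/5 levels).
 */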
124
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
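/*
 * E.g. at level 2 (a 2MiB superpage) lvl_to_nr_pages(2) = 1 << 9 = 512
 * 4KiB pages, and at level 3 (1GiB) it is 1 << 18 = 262144 pages.
 */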
154
155 /* VT-d pages must never be _larger_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
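/* On x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so these
   conversions are no-ops; they only matter when MM pages are larger
   than the 4KiB VT-d page size. */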
174
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187
188 /*
189  * 0: Present
190  * 1-11: Reserved
191  * 12-63: Context Ptr (12 - (haw-1))
192  * 64-127: Reserved
193  */
194 struct root_entry {
195         u64     lo;
196         u64     hi;
197 };
198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
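/* A 4KiB root table holds 4096 / 16 = 256 root entries, one per PCI bus. */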
199
200 /*
201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
202  * if marked present.
203  */
204 static phys_addr_t root_entry_lctp(struct root_entry *re)
205 {
206         if (!(re->lo & 1))
207                 return 0;
208
209         return re->lo & VTD_PAGE_MASK;
210 }
211
212 /*
213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
214  * if marked present.
215  */
216 static phys_addr_t root_entry_uctp(struct root_entry *re)
217 {
218         if (!(re->hi & 1))
219                 return 0;
220
221         return re->hi & VTD_PAGE_MASK;
222 }
223 /*
224  * low 64 bits:
225  * 0: present
226  * 1: fault processing disable
227  * 2-3: translation type
228  * 12-63: address space root
229  * high 64 bits:
230  * 0-2: address width
231  * 3-6: avail
232  * 8-23: domain id
233  */
234 struct context_entry {
235         u64 lo;
236         u64 hi;
237 };
238
239 static inline void context_clear_pasid_enable(struct context_entry *context)
240 {
241         context->lo &= ~(1ULL << 11);
242 }
243
244 static inline bool context_pasid_enabled(struct context_entry *context)
245 {
246         return !!(context->lo & (1ULL << 11));
247 }
248
249 static inline void context_set_copied(struct context_entry *context)
250 {
251         context->hi |= (1ull << 3);
252 }
253
254 static inline bool context_copied(struct context_entry *context)
255 {
256         return !!(context->hi & (1ULL << 3));
257 }
258
259 static inline bool __context_present(struct context_entry *context)
260 {
261         return (context->lo & 1);
262 }
263
264 static inline bool context_present(struct context_entry *context)
265 {
266         return context_pasid_enabled(context) ?
267              __context_present(context) :
268              __context_present(context) && !context_copied(context);
269 }
270
271 static inline void context_set_present(struct context_entry *context)
272 {
273         context->lo |= 1;
274 }
275
276 static inline void context_set_fault_enable(struct context_entry *context)
277 {
278         context->lo &= (((u64)-1) << 2) | 1;
279 }
280
281 static inline void context_set_translation_type(struct context_entry *context,
282                                                 unsigned long value)
283 {
284         context->lo &= (((u64)-1) << 4) | 3;
285         context->lo |= (value & 3) << 2;
286 }
287
288 static inline void context_set_address_root(struct context_entry *context,
289                                             unsigned long value)
290 {
291         context->lo &= ~VTD_PAGE_MASK;
292         context->lo |= value & VTD_PAGE_MASK;
293 }
294
295 static inline void context_set_address_width(struct context_entry *context,
296                                              unsigned long value)
297 {
298         context->hi |= value & 7;
299 }
300
301 static inline void context_set_domain_id(struct context_entry *context,
302                                          unsigned long value)
303 {
304         context->hi |= (value & ((1 << 16) - 1)) << 8;
305 }
306
307 static inline int context_domain_id(struct context_entry *c)
308 {
309         return((c->hi >> 8) & 0xffff);
310 }
311
312 static inline void context_clear_entry(struct context_entry *context)
313 {
314         context->lo = 0;
315         context->hi = 0;
316 }
317
318 /*
319  * 0: readable
320  * 1: writable
321  * 2-6: reserved
322  * 7: super page
323  * 8-10: available
324  * 11: snoop behavior
325  * 12-63: Host physcial address
326  */
327 struct dma_pte {
328         u64 val;
329 };
330
331 static inline void dma_clear_pte(struct dma_pte *pte)
332 {
333         pte->val = 0;
334 }
335
336 static inline u64 dma_pte_addr(struct dma_pte *pte)
337 {
338 #ifdef CONFIG_64BIT
339         return pte->val & VTD_PAGE_MASK;
340 #else
341         /* Must have a full atomic 64-bit read */
342         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
343 #endif
344 }
345
346 static inline bool dma_pte_present(struct dma_pte *pte)
347 {
348         return (pte->val & 3) != 0;
349 }
350
351 static inline bool dma_pte_superpage(struct dma_pte *pte)
352 {
353         return (pte->val & DMA_PTE_LARGE_PAGE);
354 }
355
356 static inline int first_pte_in_page(struct dma_pte *pte)
357 {
358         return !((unsigned long)pte & ~VTD_PAGE_MASK);
359 }
360
361 /*
362  * This domain is a static identity mapping domain.
363  *      1. This domain creates a static 1:1 mapping of all usable memory.
364  *      2. It maps to each iommu if successful.
365  *      3. Each iommu maps to this domain if successful.
366  */
367 static struct dmar_domain *si_domain;
368 static int hw_pass_through = 1;
369
370 /*
371  * Domain represents a virtual machine; more than one device
372  * across iommus may be owned by one domain, e.g. a kvm guest.
373  */
374 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
375
376 /* si_domain contains multiple devices */
377 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
378
379 #define for_each_domain_iommu(idx, domain)                      \
380         for (idx = 0; idx < g_num_of_iommus; idx++)             \
381                 if (domain->iommu_refcnt[idx])
382
383 struct dmar_domain {
384         int     nid;                    /* node id */
385
386         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
387                                         /* Refcount of devices per iommu */
388
389
390         u16             iommu_did[DMAR_UNITS_SUPPORTED];
391                                         /* Domain ids per IOMMU. Use u16 since
392                                          * domain ids are 16 bit wide according
393                                          * to VT-d spec, section 9.3 */
394
395         bool has_iotlb_device;
396         struct list_head devices;       /* all devices' list */
397         struct iova_domain iovad;       /* iova's that belong to this domain */
398
399         struct dma_pte  *pgd;           /* virtual address */
400         int             gaw;            /* max guest address width */
401
402         /* adjusted guest address width, 0 is level 2 30-bit */
403         int             agaw;
404
405         int             flags;          /* flags to find out type of domain */
406
407         int             iommu_coherency;/* indicate coherency of iommu access */
408         int             iommu_snooping; /* indicate snooping control feature*/
409         int             iommu_count;    /* reference count of iommu */
410         int             iommu_superpage;/* Level of superpages supported:
411                                            0 == 4KiB (no superpages), 1 == 2MiB,
412                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
413         u64             max_addr;       /* maximum mapped address */
414
415         struct iommu_domain domain;     /* generic domain data structure for
416                                            iommu core */
417 };
418
419 /* PCI domain-device relationship */
420 struct device_domain_info {
421         struct list_head link;  /* link to domain siblings */
422         struct list_head global; /* link to global list */
423         u8 bus;                 /* PCI bus number */
424         u8 devfn;               /* PCI devfn number */
425         u8 pasid_supported:3;
426         u8 pasid_enabled:1;
427         u8 pri_supported:1;
428         u8 pri_enabled:1;
429         u8 ats_supported:1;
430         u8 ats_enabled:1;
431         u8 ats_qdep;
432         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
433         struct intel_iommu *iommu; /* IOMMU used by this device */
434         struct dmar_domain *domain; /* pointer to domain */
435 };
436
437 struct dmar_rmrr_unit {
438         struct list_head list;          /* list of rmrr units   */
439         struct acpi_dmar_header *hdr;   /* ACPI header          */
440         u64     base_address;           /* reserved base address*/
441         u64     end_address;            /* reserved end address */
442         struct dmar_dev_scope *devices; /* target devices */
443         int     devices_cnt;            /* target device count */
444         struct iommu_resv_region *resv; /* reserved region handle */
445 };
446
447 struct dmar_atsr_unit {
448         struct list_head list;          /* list of ATSR units */
449         struct acpi_dmar_header *hdr;   /* ACPI header */
450         struct dmar_dev_scope *devices; /* target devices */
451         int devices_cnt;                /* target device count */
452         u8 include_all:1;               /* include all ports */
453 };
454
455 static LIST_HEAD(dmar_atsr_units);
456 static LIST_HEAD(dmar_rmrr_units);
457
458 #define for_each_rmrr_units(rmrr) \
459         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
460
461 /* number of registered intel_iommus; used to size and index g_iommus */
462 static int g_num_of_iommus;
463
464 static void domain_exit(struct dmar_domain *domain);
465 static void domain_remove_dev_info(struct dmar_domain *domain);
466 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
467                                      struct device *dev);
468 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
469 static void domain_context_clear(struct intel_iommu *iommu,
470                                  struct device *dev);
471 static int domain_detach_iommu(struct dmar_domain *domain,
472                                struct intel_iommu *iommu);
473
474 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
475 int dmar_disabled = 0;
476 #else
477 int dmar_disabled = 1;
478 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
479
480 int intel_iommu_enabled = 0;
481 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
482
483 static int dmar_map_gfx = 1;
484 static int dmar_forcedac;
485 static int intel_iommu_strict;
486 static int intel_iommu_superpage = 1;
487 static int intel_iommu_ecs = 1;
488 static int iommu_identity_mapping;
489
490 #define IDENTMAP_ALL            1
491 #define IDENTMAP_GFX            2
492 #define IDENTMAP_AZALIA         4
493
494 #define ecs_enabled(iommu)      (intel_iommu_ecs && ecap_ecs(iommu->ecap))
495 #define pasid_enabled(iommu)    (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))
496
497 int intel_iommu_gfx_mapped;
498 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
499
500 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
501 static DEFINE_SPINLOCK(device_domain_lock);
502 static LIST_HEAD(device_domain_list);
503
504 const struct iommu_ops intel_iommu_ops;
505
506 static bool translation_pre_enabled(struct intel_iommu *iommu)
507 {
508         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
509 }
510
511 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
512 {
513         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
514 }
515
516 static void init_translation_status(struct intel_iommu *iommu)
517 {
518         u32 gsts;
519
520         gsts = readl(iommu->reg + DMAR_GSTS_REG);
521         if (gsts & DMA_GSTS_TES)
522                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
523 }
524
525 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
526 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
527 {
528         return container_of(dom, struct dmar_domain, domain);
529 }
530
531 static int __init intel_iommu_setup(char *str)
532 {
533         if (!str)
534                 return -EINVAL;
535         while (*str) {
536                 if (!strncmp(str, "on", 2)) {
537                         dmar_disabled = 0;
538                         pr_info("IOMMU enabled\n");
539                 } else if (!strncmp(str, "off", 3)) {
540                         dmar_disabled = 1;
541                         pr_info("IOMMU disabled\n");
542                 } else if (!strncmp(str, "igfx_off", 8)) {
543                         dmar_map_gfx = 0;
544                         pr_info("Disable GFX device mapping\n");
545                 } else if (!strncmp(str, "forcedac", 8)) {
546                         pr_info("Forcing DAC for PCI devices\n");
547                         dmar_forcedac = 1;
548                 } else if (!strncmp(str, "strict", 6)) {
549                         pr_info("Disable batched IOTLB flush\n");
550                         intel_iommu_strict = 1;
551                 } else if (!strncmp(str, "sp_off", 6)) {
552                         pr_info("Disable supported super page\n");
553                         intel_iommu_superpage = 0;
554                 } else if (!strncmp(str, "ecs_off", 7)) {
555                         printk(KERN_INFO
556                                 "Intel-IOMMU: disable extended context table support\n");
557                         intel_iommu_ecs = 0;
558                 } else if (!strncmp(str, "tboot_noforce", 13)) {
559                         printk(KERN_INFO
560                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
561                         intel_iommu_tboot_noforce = 1;
562                 }
563
564                 str += strcspn(str, ",");
565                 while (*str == ',')
566                         str++;
567         }
568         return 0;
569 }
570 __setup("intel_iommu=", intel_iommu_setup);
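/* Example: booting with "intel_iommu=on,strict,sp_off" enables DMA
   remapping, disables batched IOTLB flushing and disables superpage
   support, per the comma-separated option parsing above. */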
571
572 static struct kmem_cache *iommu_domain_cache;
573 static struct kmem_cache *iommu_devinfo_cache;
574
575 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
576 {
577         struct dmar_domain **domains;
578         int idx = did >> 8;
579
580         domains = iommu->domains[idx];
581         if (!domains)
582                 return NULL;
583
584         return domains[did & 0xff];
585 }
586
587 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
588                              struct dmar_domain *domain)
589 {
590         struct dmar_domain **domains;
591         int idx = did >> 8;
592
593         if (!iommu->domains[idx]) {
594                 size_t size = 256 * sizeof(struct dmar_domain *);
595                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
596         }
597
598         domains = iommu->domains[idx];
599         if (WARN_ON(!domains))
600                 return;
601         else
602                 domains[did & 0xff] = domain;
603 }
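/*
 * The 16-bit domain id is split into a two-level table: the top 8 bits
 * select one of iommu->domains[], each of which is a lazily allocated
 * array of 256 dmar_domain pointers indexed by the low 8 bits.
 */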
604
605 static inline void *alloc_pgtable_page(int node)
606 {
607         struct page *page;
608         void *vaddr = NULL;
609
610         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
611         if (page)
612                 vaddr = page_address(page);
613         return vaddr;
614 }
615
616 static inline void free_pgtable_page(void *vaddr)
617 {
618         free_page((unsigned long)vaddr);
619 }
620
621 static inline void *alloc_domain_mem(void)
622 {
623         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
624 }
625
626 static void free_domain_mem(void *vaddr)
627 {
628         kmem_cache_free(iommu_domain_cache, vaddr);
629 }
630
631 static inline void * alloc_devinfo_mem(void)
632 {
633         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
634 }
635
636 static inline void free_devinfo_mem(void *vaddr)
637 {
638         kmem_cache_free(iommu_devinfo_cache, vaddr);
639 }
640
641 static inline int domain_type_is_vm(struct dmar_domain *domain)
642 {
643         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
644 }
645
646 static inline int domain_type_is_si(struct dmar_domain *domain)
647 {
648         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
649 }
650
651 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
652 {
653         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
654                                 DOMAIN_FLAG_STATIC_IDENTITY);
655 }
656
657 static inline int domain_pfn_supported(struct dmar_domain *domain,
658                                        unsigned long pfn)
659 {
660         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
661
662         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
663 }
664
665 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
666 {
667         unsigned long sagaw;
668         int agaw = -1;
669
670         sagaw = cap_sagaw(iommu->cap);
671         for (agaw = width_to_agaw(max_gaw);
672              agaw >= 0; agaw--) {
673                 if (test_bit(agaw, &sagaw))
674                         break;
675         }
676
677         return agaw;
678 }
679
680 /*
681  * Calculate max SAGAW for each iommu.
682  */
683 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
684 {
685         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
686 }
687
688 /*
689  * Calculate the agaw for each iommu.
690  * "SAGAW" may be different across iommus: use a default agaw and
691  * fall back to a smaller supported agaw for iommus that don't support the default.
692  */
693 int iommu_calculate_agaw(struct intel_iommu *iommu)
694 {
695         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
696 }
697
698 /* This function only returns the single iommu in a domain */
699 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
700 {
701         int iommu_id;
702
703         /* si_domain and vm domain should not get here. */
704         BUG_ON(domain_type_is_vm_or_si(domain));
705         for_each_domain_iommu(iommu_id, domain)
706                 break;
707
708         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
709                 return NULL;
710
711         return g_iommus[iommu_id];
712 }
713
714 static void domain_update_iommu_coherency(struct dmar_domain *domain)
715 {
716         struct dmar_drhd_unit *drhd;
717         struct intel_iommu *iommu;
718         bool found = false;
719         int i;
720
721         domain->iommu_coherency = 1;
722
723         for_each_domain_iommu(i, domain) {
724                 found = true;
725                 if (!ecap_coherent(g_iommus[i]->ecap)) {
726                         domain->iommu_coherency = 0;
727                         break;
728                 }
729         }
730         if (found)
731                 return;
732
733         /* No hardware attached; use lowest common denominator */
734         rcu_read_lock();
735         for_each_active_iommu(iommu, drhd) {
736                 if (!ecap_coherent(iommu->ecap)) {
737                         domain->iommu_coherency = 0;
738                         break;
739                 }
740         }
741         rcu_read_unlock();
742 }
743
744 static int domain_update_iommu_snooping(struct intel_iommu *skip)
745 {
746         struct dmar_drhd_unit *drhd;
747         struct intel_iommu *iommu;
748         int ret = 1;
749
750         rcu_read_lock();
751         for_each_active_iommu(iommu, drhd) {
752                 if (iommu != skip) {
753                         if (!ecap_sc_support(iommu->ecap)) {
754                                 ret = 0;
755                                 break;
756                         }
757                 }
758         }
759         rcu_read_unlock();
760
761         return ret;
762 }
763
764 static int domain_update_iommu_superpage(struct intel_iommu *skip)
765 {
766         struct dmar_drhd_unit *drhd;
767         struct intel_iommu *iommu;
768         int mask = 0xf;
769
770         if (!intel_iommu_superpage) {
771                 return 0;
772         }
773
774         /* set iommu_superpage to the smallest common denominator */
775         rcu_read_lock();
776         for_each_active_iommu(iommu, drhd) {
777                 if (iommu != skip) {
778                         mask &= cap_super_page_val(iommu->cap);
779                         if (!mask)
780                                 break;
781                 }
782         }
783         rcu_read_unlock();
784
785         return fls(mask);
786 }
787
788 /* Some capabilities may be different across iommus */
789 static void domain_update_iommu_cap(struct dmar_domain *domain)
790 {
791         domain_update_iommu_coherency(domain);
792         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
793         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
794 }
795
796 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
797                                                        u8 bus, u8 devfn, int alloc)
798 {
799         struct root_entry *root = &iommu->root_entry[bus];
800         struct context_entry *context;
801         u64 *entry;
802
803         entry = &root->lo;
804         if (ecs_enabled(iommu)) {
805                 if (devfn >= 0x80) {
806                         devfn -= 0x80;
807                         entry = &root->hi;
808                 }
809                 devfn *= 2;
810         }
811         if (*entry & 1)
812                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
813         else {
814                 unsigned long phy_addr;
815                 if (!alloc)
816                         return NULL;
817
818                 context = alloc_pgtable_page(iommu->node);
819                 if (!context)
820                         return NULL;
821
822                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
823                 phy_addr = virt_to_phys((void *)context);
824                 *entry = phy_addr | 1;
825                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
826         }
827         return &context[devfn];
828 }
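/*
 * With ECS, extended context entries are twice the size of legacy ones
 * (256 vs 128 bits), so one context-table page only covers 128 devfns:
 * the root entry's low half covers devfn 0-127 and the high half covers
 * 128-255, hence the devfn adjustment and doubling above.
 */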
829
830 static int iommu_dummy(struct device *dev)
831 {
832         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
833 }
834
835 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
836 {
837         struct dmar_drhd_unit *drhd = NULL;
838         struct intel_iommu *iommu;
839         struct device *tmp;
840         struct pci_dev *ptmp, *pdev = NULL;
841         u16 segment = 0;
842         int i;
843
844         if (iommu_dummy(dev))
845                 return NULL;
846
847         if (dev_is_pci(dev)) {
848                 struct pci_dev *pf_pdev;
849
850                 pdev = to_pci_dev(dev);
851
852 #ifdef CONFIG_X86
853                 /* VMD child devices currently cannot be handled individually */
854                 if (is_vmd(pdev->bus))
855                         return NULL;
856 #endif
857
858                 /* VFs aren't listed in scope tables; we need to look up
859                  * the PF instead to find the IOMMU. */
860                 pf_pdev = pci_physfn(pdev);
861                 dev = &pf_pdev->dev;
862                 segment = pci_domain_nr(pdev->bus);
863         } else if (has_acpi_companion(dev))
864                 dev = &ACPI_COMPANION(dev)->dev;
865
866         rcu_read_lock();
867         for_each_active_iommu(iommu, drhd) {
868                 if (pdev && segment != drhd->segment)
869                         continue;
870
871                 for_each_active_dev_scope(drhd->devices,
872                                           drhd->devices_cnt, i, tmp) {
873                         if (tmp == dev) {
874                                 /* For a VF use its original BDF# not that of the PF
875                                  * which we used for the IOMMU lookup. Strictly speaking
876                                  * we could do this for all PCI devices; we only need to
877                                  * get the BDF# from the scope table for ACPI matches. */
878                                 if (pdev && pdev->is_virtfn)
879                                         goto got_pdev;
880
881                                 *bus = drhd->devices[i].bus;
882                                 *devfn = drhd->devices[i].devfn;
883                                 goto out;
884                         }
885
886                         if (!pdev || !dev_is_pci(tmp))
887                                 continue;
888
889                         ptmp = to_pci_dev(tmp);
890                         if (ptmp->subordinate &&
891                             ptmp->subordinate->number <= pdev->bus->number &&
892                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
893                                 goto got_pdev;
894                 }
895
896                 if (pdev && drhd->include_all) {
897                 got_pdev:
898                         *bus = pdev->bus->number;
899                         *devfn = pdev->devfn;
900                         goto out;
901                 }
902         }
903         iommu = NULL;
904  out:
905         rcu_read_unlock();
906
907         return iommu;
908 }
909
910 static void domain_flush_cache(struct dmar_domain *domain,
911                                void *addr, int size)
912 {
913         if (!domain->iommu_coherency)
914                 clflush_cache_range(addr, size);
915 }
916
917 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
918 {
919         struct context_entry *context;
920         int ret = 0;
921         unsigned long flags;
922
923         spin_lock_irqsave(&iommu->lock, flags);
924         context = iommu_context_addr(iommu, bus, devfn, 0);
925         if (context)
926                 ret = context_present(context);
927         spin_unlock_irqrestore(&iommu->lock, flags);
928         return ret;
929 }
930
931 static void free_context_table(struct intel_iommu *iommu)
932 {
933         int i;
934         unsigned long flags;
935         struct context_entry *context;
936
937         spin_lock_irqsave(&iommu->lock, flags);
938         if (!iommu->root_entry) {
939                 goto out;
940         }
941         for (i = 0; i < ROOT_ENTRY_NR; i++) {
942                 context = iommu_context_addr(iommu, i, 0, 0);
943                 if (context)
944                         free_pgtable_page(context);
945
946                 if (!ecs_enabled(iommu))
947                         continue;
948
949                 context = iommu_context_addr(iommu, i, 0x80, 0);
950                 if (context)
951                         free_pgtable_page(context);
952
953         }
954         free_pgtable_page(iommu->root_entry);
955         iommu->root_entry = NULL;
956 out:
957         spin_unlock_irqrestore(&iommu->lock, flags);
958 }
959
960 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
961                                       unsigned long pfn, int *target_level)
962 {
963         struct dma_pte *parent, *pte = NULL;
964         int level = agaw_to_level(domain->agaw);
965         int offset;
966
967         BUG_ON(!domain->pgd);
968
969         if (!domain_pfn_supported(domain, pfn))
970                 /* Address beyond IOMMU's addressing capabilities. */
971                 return NULL;
972
973         parent = domain->pgd;
974
975         while (1) {
976                 void *tmp_page;
977
978                 offset = pfn_level_offset(pfn, level);
979                 pte = &parent[offset];
980                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
981                         break;
982                 if (level == *target_level)
983                         break;
984
985                 if (!dma_pte_present(pte)) {
986                         uint64_t pteval;
987
988                         tmp_page = alloc_pgtable_page(domain->nid);
989
990                         if (!tmp_page)
991                                 return NULL;
992
993                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
994                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
995                         if (cmpxchg64(&pte->val, 0ULL, pteval))
996                                 /* Someone else set it while we were thinking; use theirs. */
997                                 free_pgtable_page(tmp_page);
998                         else
999                                 domain_flush_cache(domain, pte, sizeof(*pte));
1000                 }
1001                 if (level == 1)
1002                         break;
1003
1004                 parent = phys_to_virt(dma_pte_addr(pte));
1005                 level--;
1006         }
1007
1008         if (!*target_level)
1009                 *target_level = level;
1010
1011         return pte;
1012 }
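/*
 * A *target_level of 0 means "just look up the existing leaf", stopping
 * at the first non-present or superpage entry without allocating; a
 * non-zero value builds missing levels down to that level and returns
 * the PTE there. A zero *target_level is updated on return to the level
 * actually reached.
 */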
1013
1014
1015 /* return address's pte at specific level */
1016 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1017                                          unsigned long pfn,
1018                                          int level, int *large_page)
1019 {
1020         struct dma_pte *parent, *pte = NULL;
1021         int total = agaw_to_level(domain->agaw);
1022         int offset;
1023
1024         parent = domain->pgd;
1025         while (level <= total) {
1026                 offset = pfn_level_offset(pfn, total);
1027                 pte = &parent[offset];
1028                 if (level == total)
1029                         return pte;
1030
1031                 if (!dma_pte_present(pte)) {
1032                         *large_page = total;
1033                         break;
1034                 }
1035
1036                 if (dma_pte_superpage(pte)) {
1037                         *large_page = total;
1038                         return pte;
1039                 }
1040
1041                 parent = phys_to_virt(dma_pte_addr(pte));
1042                 total--;
1043         }
1044         return NULL;
1045 }
1046
1047 /* clear last level pte; a tlb flush should follow */
1048 static void dma_pte_clear_range(struct dmar_domain *domain,
1049                                 unsigned long start_pfn,
1050                                 unsigned long last_pfn)
1051 {
1052         unsigned int large_page = 1;
1053         struct dma_pte *first_pte, *pte;
1054
1055         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1056         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1057         BUG_ON(start_pfn > last_pfn);
1058
1059         /* we don't need lock here; nobody else touches the iova range */
1060         do {
1061                 large_page = 1;
1062                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1063                 if (!pte) {
1064                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1065                         continue;
1066                 }
1067                 do {
1068                         dma_clear_pte(pte);
1069                         start_pfn += lvl_to_nr_pages(large_page);
1070                         pte++;
1071                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1072
1073                 domain_flush_cache(domain, first_pte,
1074                                    (void *)pte - (void *)first_pte);
1075
1076         } while (start_pfn && start_pfn <= last_pfn);
1077 }
1078
1079 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1080                                int retain_level, struct dma_pte *pte,
1081                                unsigned long pfn, unsigned long start_pfn,
1082                                unsigned long last_pfn)
1083 {
1084         pfn = max(start_pfn, pfn);
1085         pte = &pte[pfn_level_offset(pfn, level)];
1086
1087         do {
1088                 unsigned long level_pfn;
1089                 struct dma_pte *level_pte;
1090
1091                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1092                         goto next;
1093
1094                 level_pfn = pfn & level_mask(level);
1095                 level_pte = phys_to_virt(dma_pte_addr(pte));
1096
1097                 if (level > 2) {
1098                         dma_pte_free_level(domain, level - 1, retain_level,
1099                                            level_pte, level_pfn, start_pfn,
1100                                            last_pfn);
1101                 }
1102
1103                 /*
1104                  * Free the page table if we're below the level we want to
1105                  * retain and the range covers the entire table.
1106                  */
1107                 if (level < retain_level && !(start_pfn > level_pfn ||
1108                       last_pfn < level_pfn + level_size(level) - 1)) {
1109                         dma_clear_pte(pte);
1110                         domain_flush_cache(domain, pte, sizeof(*pte));
1111                         free_pgtable_page(level_pte);
1112                 }
1113 next:
1114                 pfn += level_size(level);
1115         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1116 }
1117
1118 /*
1119  * clear last level (leaf) ptes and free page table pages below the
1120  * level we wish to keep intact.
1121  */
1122 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1123                                    unsigned long start_pfn,
1124                                    unsigned long last_pfn,
1125                                    int retain_level)
1126 {
1127         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1128         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1129         BUG_ON(start_pfn > last_pfn);
1130
1131         dma_pte_clear_range(domain, start_pfn, last_pfn);
1132
1133         /* We don't need lock here; nobody else touches the iova range */
1134         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1135                            domain->pgd, 0, start_pfn, last_pfn);
1136
1137         /* free pgd */
1138         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1139                 free_pgtable_page(domain->pgd);
1140                 domain->pgd = NULL;
1141         }
1142 }
1143
1144 /* When a page at a given level is being unlinked from its parent, we don't
1145    need to *modify* it at all. All we need to do is make a list of all the
1146    pages which can be freed just as soon as we've flushed the IOTLB and we
1147    know the hardware page-walk will no longer touch them.
1148    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1149    be freed. */
1150 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1151                                             int level, struct dma_pte *pte,
1152                                             struct page *freelist)
1153 {
1154         struct page *pg;
1155
1156         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1157         pg->freelist = freelist;
1158         freelist = pg;
1159
1160         if (level == 1)
1161                 return freelist;
1162
1163         pte = page_address(pg);
1164         do {
1165                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1166                         freelist = dma_pte_list_pagetables(domain, level - 1,
1167                                                            pte, freelist);
1168                 pte++;
1169         } while (!first_pte_in_page(pte));
1170
1171         return freelist;
1172 }
1173
1174 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1175                                         struct dma_pte *pte, unsigned long pfn,
1176                                         unsigned long start_pfn,
1177                                         unsigned long last_pfn,
1178                                         struct page *freelist)
1179 {
1180         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1181
1182         pfn = max(start_pfn, pfn);
1183         pte = &pte[pfn_level_offset(pfn, level)];
1184
1185         do {
1186                 unsigned long level_pfn;
1187
1188                 if (!dma_pte_present(pte))
1189                         goto next;
1190
1191                 level_pfn = pfn & level_mask(level);
1192
1193                 /* If range covers entire pagetable, free it */
1194                 if (start_pfn <= level_pfn &&
1195                     last_pfn >= level_pfn + level_size(level) - 1) {
1196                         /* These subordinate page tables are going away entirely. Don't
1197                            bother to clear them; we're just going to *free* them. */
1198                         if (level > 1 && !dma_pte_superpage(pte))
1199                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1200
1201                         dma_clear_pte(pte);
1202                         if (!first_pte)
1203                                 first_pte = pte;
1204                         last_pte = pte;
1205                 } else if (level > 1) {
1206                         /* Recurse down into a level that isn't *entirely* obsolete */
1207                         freelist = dma_pte_clear_level(domain, level - 1,
1208                                                        phys_to_virt(dma_pte_addr(pte)),
1209                                                        level_pfn, start_pfn, last_pfn,
1210                                                        freelist);
1211                 }
1212 next:
1213                 pfn += level_size(level);
1214         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1215
1216         if (first_pte)
1217                 domain_flush_cache(domain, first_pte,
1218                                    (void *)++last_pte - (void *)first_pte);
1219
1220         return freelist;
1221 }
1222
1223 /* We can't just free the pages because the IOMMU may still be walking
1224    the page tables, and may have cached the intermediate levels. The
1225    pages can only be freed after the IOTLB flush has been done. */
1226 static struct page *domain_unmap(struct dmar_domain *domain,
1227                                  unsigned long start_pfn,
1228                                  unsigned long last_pfn)
1229 {
1230         struct page *freelist = NULL;
1231
1232         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1233         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1234         BUG_ON(start_pfn > last_pfn);
1235
1236         /* we don't need lock here; nobody else touches the iova range */
1237         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1238                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1239
1240         /* free pgd */
1241         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1242                 struct page *pgd_page = virt_to_page(domain->pgd);
1243                 pgd_page->freelist = freelist;
1244                 freelist = pgd_page;
1245
1246                 domain->pgd = NULL;
1247         }
1248
1249         return freelist;
1250 }
1251
1252 static void dma_free_pagelist(struct page *freelist)
1253 {
1254         struct page *pg;
1255
1256         while ((pg = freelist)) {
1257                 freelist = pg->freelist;
1258                 free_pgtable_page(page_address(pg));
1259         }
1260 }
1261
1262 static void iova_entry_free(unsigned long data)
1263 {
1264         struct page *freelist = (struct page *)data;
1265
1266         dma_free_pagelist(freelist);
1267 }
1268
1269 /* iommu handling */
1270 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1271 {
1272         struct root_entry *root;
1273         unsigned long flags;
1274
1275         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1276         if (!root) {
1277                 pr_err("Allocating root entry for %s failed\n",
1278                         iommu->name);
1279                 return -ENOMEM;
1280         }
1281
1282         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1283
1284         spin_lock_irqsave(&iommu->lock, flags);
1285         iommu->root_entry = root;
1286         spin_unlock_irqrestore(&iommu->lock, flags);
1287
1288         return 0;
1289 }
1290
1291 static void iommu_set_root_entry(struct intel_iommu *iommu)
1292 {
1293         u64 addr;
1294         u32 sts;
1295         unsigned long flag;
1296
1297         addr = virt_to_phys(iommu->root_entry);
1298         if (ecs_enabled(iommu))
1299                 addr |= DMA_RTADDR_RTT;
1300
1301         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1303
1304         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1305
1306         /* Make sure hardware completes it */
1307         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1308                       readl, (sts & DMA_GSTS_RTPS), sts);
1309
1310         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1311 }
1312
1313 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1314 {
1315         u32 val;
1316         unsigned long flag;
1317
1318         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1319                 return;
1320
1321         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1323
1324         /* Make sure hardware completes it */
1325         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1326                       readl, (!(val & DMA_GSTS_WBFS)), val);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329 }
1330
1331 /* return value determines if we need a write buffer flush */
1332 static void __iommu_flush_context(struct intel_iommu *iommu,
1333                                   u16 did, u16 source_id, u8 function_mask,
1334                                   u64 type)
1335 {
1336         u64 val = 0;
1337         unsigned long flag;
1338
1339         switch (type) {
1340         case DMA_CCMD_GLOBAL_INVL:
1341                 val = DMA_CCMD_GLOBAL_INVL;
1342                 break;
1343         case DMA_CCMD_DOMAIN_INVL:
1344                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1345                 break;
1346         case DMA_CCMD_DEVICE_INVL:
1347                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1348                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1349                 break;
1350         default:
1351                 BUG();
1352         }
1353         val |= DMA_CCMD_ICC;
1354
1355         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1356         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1357
1358         /* Make sure hardware completes it */
1359         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1360                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1361
1362         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1363 }
1364
1365 /* return value determines if we need a write buffer flush */
1366 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1367                                 u64 addr, unsigned int size_order, u64 type)
1368 {
1369         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1370         u64 val = 0, val_iva = 0;
1371         unsigned long flag;
1372
1373         switch (type) {
1374         case DMA_TLB_GLOBAL_FLUSH:
1375                 /* global flush doesn't need set IVA_REG */
1376                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1377                 break;
1378         case DMA_TLB_DSI_FLUSH:
1379                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1380                 break;
1381         case DMA_TLB_PSI_FLUSH:
1382                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1383                 /* IH bit is passed in as part of address */
1384                 val_iva = size_order | addr;
1385                 break;
1386         default:
1387                 BUG();
1388         }
1389         /* Note: set drain read/write */
1390 #if 0
1391         /*
1392          * This is probably only there to be extra safe. It looks like we
1393          * can ignore it without any impact.
1394          */
1395         if (cap_read_drain(iommu->cap))
1396                 val |= DMA_TLB_READ_DRAIN;
1397 #endif
1398         if (cap_write_drain(iommu->cap))
1399                 val |= DMA_TLB_WRITE_DRAIN;
1400
1401         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1402         /* Note: Only uses first TLB reg currently */
1403         if (val_iva)
1404                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1405         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1406
1407         /* Make sure hardware completes it */
1408         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1409                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1410
1411         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1412
1413         /* check IOTLB invalidation granularity */
1414         if (DMA_TLB_IAIG(val) == 0)
1415                 pr_err("Flush IOTLB failed\n");
1416         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1417                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1418                         (unsigned long long)DMA_TLB_IIRG(type),
1419                         (unsigned long long)DMA_TLB_IAIG(val));
1420 }
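/*
 * For page-selective (PSI) flushes the invalidate-address value carries
 * the address-mask order in its low bits (with the IH hint at bit 6
 * already folded into addr by the caller), which is why size_order is
 * simply OR'd into val_iva above.
 */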
1421
1422 static struct device_domain_info *
1423 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1424                          u8 bus, u8 devfn)
1425 {
1426         struct device_domain_info *info;
1427
1428         assert_spin_locked(&device_domain_lock);
1429
1430         if (!iommu->qi)
1431                 return NULL;
1432
1433         list_for_each_entry(info, &domain->devices, link)
1434                 if (info->iommu == iommu && info->bus == bus &&
1435                     info->devfn == devfn) {
1436                         if (info->ats_supported && info->dev)
1437                                 return info;
1438                         break;
1439                 }
1440
1441         return NULL;
1442 }
1443
1444 static void domain_update_iotlb(struct dmar_domain *domain)
1445 {
1446         struct device_domain_info *info;
1447         bool has_iotlb_device = false;
1448
1449         assert_spin_locked(&device_domain_lock);
1450
1451         list_for_each_entry(info, &domain->devices, link) {
1452                 struct pci_dev *pdev;
1453
1454                 if (!info->dev || !dev_is_pci(info->dev))
1455                         continue;
1456
1457                 pdev = to_pci_dev(info->dev);
1458                 if (pdev->ats_enabled) {
1459                         has_iotlb_device = true;
1460                         break;
1461                 }
1462         }
1463
1464         domain->has_iotlb_device = has_iotlb_device;
1465 }
1466
1467 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1468 {
1469         struct pci_dev *pdev;
1470
1471         assert_spin_locked(&device_domain_lock);
1472
1473         if (!info || !dev_is_pci(info->dev))
1474                 return;
1475
1476         pdev = to_pci_dev(info->dev);
1477
1478 #ifdef CONFIG_INTEL_IOMMU_SVM
1479         /* The PCIe spec, in its wisdom, declares that the behaviour of
1480            the device if you enable PASID support after ATS support is
1481            undefined. So always enable PASID support on devices which
1482            have it, even if we can't yet know if we're ever going to
1483            use it. */
1484         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1485                 info->pasid_enabled = 1;
1486
1487         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1488                 info->pri_enabled = 1;
1489 #endif
1490         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1491                 info->ats_enabled = 1;
1492                 domain_update_iotlb(info->domain);
1493                 info->ats_qdep = pci_ats_queue_depth(pdev);
1494         }
1495 }
1496
1497 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1498 {
1499         struct pci_dev *pdev;
1500
1501         assert_spin_locked(&device_domain_lock);
1502
1503         if (!dev_is_pci(info->dev))
1504                 return;
1505
1506         pdev = to_pci_dev(info->dev);
1507
1508         if (info->ats_enabled) {
1509                 pci_disable_ats(pdev);
1510                 info->ats_enabled = 0;
1511                 domain_update_iotlb(info->domain);
1512         }
1513 #ifdef CONFIG_INTEL_IOMMU_SVM
1514         if (info->pri_enabled) {
1515                 pci_disable_pri(pdev);
1516                 info->pri_enabled = 0;
1517         }
1518         if (info->pasid_enabled) {
1519                 pci_disable_pasid(pdev);
1520                 info->pasid_enabled = 0;
1521         }
1522 #endif
1523 }
1524
1525 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1526                                   u64 addr, unsigned mask)
1527 {
1528         u16 sid, qdep;
1529         unsigned long flags;
1530         struct device_domain_info *info;
1531
1532         if (!domain->has_iotlb_device)
1533                 return;
1534
1535         spin_lock_irqsave(&device_domain_lock, flags);
1536         list_for_each_entry(info, &domain->devices, link) {
1537                 if (!info->ats_enabled)
1538                         continue;
1539
1540                 sid = info->bus << 8 | info->devfn;
1541                 qdep = info->ats_qdep;
1542                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1543         }
1544         spin_unlock_irqrestore(&device_domain_lock, flags);
1545 }
1546
1547 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1548                                   struct dmar_domain *domain,
1549                                   unsigned long pfn, unsigned int pages,
1550                                   int ih, int map)
1551 {
1552         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1553         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1554         u16 did = domain->iommu_did[iommu->seq_id];
1555
1556         BUG_ON(pages == 0);
1557
1558         if (ih)
1559                 ih = 1 << 6;
1560         /*
1561          * Fall back to domain-selective flush if there is no PSI support or
1562          * the size is too big.
1563          * PSI requires the page size to be 2^x and the base address to be
1564          * naturally aligned to the size.
1565          */
1566         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1567                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1568                                                 DMA_TLB_DSI_FLUSH);
1569         else
1570                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1571                                                 DMA_TLB_PSI_FLUSH);
1572
1573         /*
1574          * In caching mode, changes of pages from non-present to present require
1575          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1576          */
1577         if (!cap_caching_mode(iommu->cap) || !map)
1578                 iommu_flush_dev_iotlb(domain, addr, mask);
1579 }
1580
1581 /* Notification for newly created mappings */
1582 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1583                                         struct dmar_domain *domain,
1584                                         unsigned long pfn, unsigned int pages)
1585 {
1586         /* It's a non-present to present mapping. Only flush if caching mode */
1587         if (cap_caching_mode(iommu->cap))
1588                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1589         else
1590                 iommu_flush_write_buffer(iommu);
1591 }
1592
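     /*
      * Flush callback for the domain's deferred IOVA free queue (see
      * init_iova_flush_queue() in domain_init()): perform a domain-selective
      * IOTLB flush on every IOMMU the domain is attached to, plus a full
      * device-IOTLB flush when the IOMMU is not in caching mode.
      */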
1593 static void iommu_flush_iova(struct iova_domain *iovad)
1594 {
1595         struct dmar_domain *domain;
1596         int idx;
1597
1598         domain = container_of(iovad, struct dmar_domain, iovad);
1599
1600         for_each_domain_iommu(idx, domain) {
1601                 struct intel_iommu *iommu = g_iommus[idx];
1602                 u16 did = domain->iommu_did[iommu->seq_id];
1603
1604                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1605
1606                 if (!cap_caching_mode(iommu->cap))
1607                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1608                                               0, MAX_AGAW_PFN_WIDTH);
1609         }
1610 }
1611
1612 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1613 {
1614         u32 pmen;
1615         unsigned long flags;
1616
1617         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1618         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1619         pmen &= ~DMA_PMEN_EPM;
1620         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1621
1622         /* wait for the protected region status bit to clear */
1623         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1624                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1625
1626         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1627 }
1628
1629 static void iommu_enable_translation(struct intel_iommu *iommu)
1630 {
1631         u32 sts;
1632         unsigned long flags;
1633
1634         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1635         iommu->gcmd |= DMA_GCMD_TE;
1636         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1637
1638         /* Make sure hardware completes it */
1639         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1640                       readl, (sts & DMA_GSTS_TES), sts);
1641
1642         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643 }
1644
1645 static void iommu_disable_translation(struct intel_iommu *iommu)
1646 {
1647         u32 sts;
1648         unsigned long flag;
1649
1650         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1651         iommu->gcmd &= ~DMA_GCMD_TE;
1652         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654         /* Make sure hardware completes it */
1655         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656                       readl, (!(sts & DMA_GSTS_TES)), sts);
1657
1658         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1659 }
1660
1661
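     /*
      * Set up this IOMMU's domain bookkeeping: a bitmap of domain-ids and a
      * two-level array of dmar_domain pointers (one pointer per possible
      * domain-id, grouped in chunks of 256; only the first chunk is allocated
      * up front). Domain-id 0 is reserved below.
      */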
1662 static int iommu_init_domains(struct intel_iommu *iommu)
1663 {
1664         u32 ndomains, nlongs;
1665         size_t size;
1666
1667         ndomains = cap_ndoms(iommu->cap);
1668         pr_debug("%s: Number of Domains supported <%d>\n",
1669                  iommu->name, ndomains);
1670         nlongs = BITS_TO_LONGS(ndomains);
1671
1672         spin_lock_init(&iommu->lock);
1673
1674         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1675         if (!iommu->domain_ids) {
1676                 pr_err("%s: Allocating domain id array failed\n",
1677                        iommu->name);
1678                 return -ENOMEM;
1679         }
1680
1681         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1682         iommu->domains = kzalloc(size, GFP_KERNEL);
1683
1684         if (iommu->domains) {
1685                 size = 256 * sizeof(struct dmar_domain *);
1686                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1687         }
1688
1689         if (!iommu->domains || !iommu->domains[0]) {
1690                 pr_err("%s: Allocating domain array failed\n",
1691                        iommu->name);
1692                 kfree(iommu->domain_ids);
1693                 kfree(iommu->domains);
1694                 iommu->domain_ids = NULL;
1695                 iommu->domains    = NULL;
1696                 return -ENOMEM;
1697         }
1698
1699
1700
1701         /*
1702          * If Caching mode is set, then invalid translations are tagged
1703          * with domain-id 0, hence we need to pre-allocate it. We also
1704          * use domain-id 0 as a marker for non-allocated domain-id, so
1705          * make sure it is not used for a real domain.
1706          */
1707         set_bit(0, iommu->domain_ids);
1708
1709         return 0;
1710 }
1711
1712 static void disable_dmar_iommu(struct intel_iommu *iommu)
1713 {
1714         struct device_domain_info *info, *tmp;
1715         unsigned long flags;
1716
1717         if (!iommu->domains || !iommu->domain_ids)
1718                 return;
1719
1720 again:
1721         spin_lock_irqsave(&device_domain_lock, flags);
1722         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1723                 struct dmar_domain *domain;
1724
1725                 if (info->iommu != iommu)
1726                         continue;
1727
1728                 if (!info->dev || !info->domain)
1729                         continue;
1730
1731                 domain = info->domain;
1732
1733                 __dmar_remove_one_dev_info(info);
1734
1735                 if (!domain_type_is_vm_or_si(domain)) {
1736                         /*
1737                          * The domain_exit() function can't be called under
1738                          * device_domain_lock, as it takes this lock itself.
1739                          * So release the lock here and re-run the loop
1740                          * afterwards.
1741                          */
1742                         spin_unlock_irqrestore(&device_domain_lock, flags);
1743                         domain_exit(domain);
1744                         goto again;
1745                 }
1746         }
1747         spin_unlock_irqrestore(&device_domain_lock, flags);
1748
1749         if (iommu->gcmd & DMA_GCMD_TE)
1750                 iommu_disable_translation(iommu);
1751 }
1752
1753 static void free_dmar_iommu(struct intel_iommu *iommu)
1754 {
1755         if ((iommu->domains) && (iommu->domain_ids)) {
1756                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1757                 int i;
1758
1759                 for (i = 0; i < elems; i++)
1760                         kfree(iommu->domains[i]);
1761                 kfree(iommu->domains);
1762                 kfree(iommu->domain_ids);
1763                 iommu->domains = NULL;
1764                 iommu->domain_ids = NULL;
1765         }
1766
1767         g_iommus[iommu->seq_id] = NULL;
1768
1769         /* free context mapping */
1770         free_context_table(iommu);
1771
1772 #ifdef CONFIG_INTEL_IOMMU_SVM
1773         if (pasid_enabled(iommu)) {
1774                 if (ecap_prs(iommu->ecap))
1775                         intel_svm_finish_prq(iommu);
1776                 intel_svm_free_pasid_tables(iommu);
1777         }
1778 #endif
1779 }
1780
1781 static struct dmar_domain *alloc_domain(int flags)
1782 {
1783         struct dmar_domain *domain;
1784
1785         domain = alloc_domain_mem();
1786         if (!domain)
1787                 return NULL;
1788
1789         memset(domain, 0, sizeof(*domain));
1790         domain->nid = -1;
1791         domain->flags = flags;
1792         domain->has_iotlb_device = false;
1793         INIT_LIST_HEAD(&domain->devices);
1794
1795         return domain;
1796 }
1797
1798 /* Must be called with iommu->lock */
1799 static int domain_attach_iommu(struct dmar_domain *domain,
1800                                struct intel_iommu *iommu)
1801 {
1802         unsigned long ndomains;
1803         int num;
1804
1805         assert_spin_locked(&device_domain_lock);
1806         assert_spin_locked(&iommu->lock);
1807
1808         domain->iommu_refcnt[iommu->seq_id] += 1;
1809         domain->iommu_count += 1;
1810         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1811                 ndomains = cap_ndoms(iommu->cap);
1812                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1813
1814                 if (num >= ndomains) {
1815                         pr_err("%s: No free domain ids\n", iommu->name);
1816                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1817                         domain->iommu_count -= 1;
1818                         return -ENOSPC;
1819                 }
1820
1821                 set_bit(num, iommu->domain_ids);
1822                 set_iommu_domain(iommu, num, domain);
1823
1824                 domain->iommu_did[iommu->seq_id] = num;
1825                 domain->nid                      = iommu->node;
1826
1827                 domain_update_iommu_cap(domain);
1828         }
1829
1830         return 0;
1831 }
1832
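     /*
      * Counterpart of domain_attach_iommu(); both device_domain_lock and
      * iommu->lock must be held. Drops the domain's reference on this IOMMU
      * and releases its domain-id once the last reference is gone. Returns
      * the domain's remaining attach count across all IOMMUs.
      */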
1833 static int domain_detach_iommu(struct dmar_domain *domain,
1834                                struct intel_iommu *iommu)
1835 {
1836         int num, count = INT_MAX;
1837
1838         assert_spin_locked(&device_domain_lock);
1839         assert_spin_locked(&iommu->lock);
1840
1841         domain->iommu_refcnt[iommu->seq_id] -= 1;
1842         count = --domain->iommu_count;
1843         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1844                 num = domain->iommu_did[iommu->seq_id];
1845                 clear_bit(num, iommu->domain_ids);
1846                 set_iommu_domain(iommu, num, NULL);
1847
1848                 domain_update_iommu_cap(domain);
1849                 domain->iommu_did[iommu->seq_id] = 0;
1850         }
1851
1852         return count;
1853 }
1854
1855 static struct iova_domain reserved_iova_list;
1856 static struct lock_class_key reserved_rbtree_key;
1857
1858 static int dmar_init_reserved_ranges(void)
1859 {
1860         struct pci_dev *pdev = NULL;
1861         struct iova *iova;
1862         int i;
1863
1864         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1865
1866         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1867                 &reserved_rbtree_key);
1868
1869         /* IOAPIC ranges shouldn't be accessed by DMA */
1870         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1871                 IOVA_PFN(IOAPIC_RANGE_END));
1872         if (!iova) {
1873                 pr_err("Reserve IOAPIC range failed\n");
1874                 return -ENODEV;
1875         }
1876
1877         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1878         for_each_pci_dev(pdev) {
1879                 struct resource *r;
1880
1881                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1882                         r = &pdev->resource[i];
1883                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1884                                 continue;
1885                         iova = reserve_iova(&reserved_iova_list,
1886                                             IOVA_PFN(r->start),
1887                                             IOVA_PFN(r->end));
1888                         if (!iova) {
1889                                 pr_err("Reserve iova failed\n");
1890                                 return -ENODEV;
1891                         }
1892                 }
1893         }
1894         return 0;
1895 }
1896
1897 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1898 {
1899         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1900 }
1901
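     /*
      * Round the guest address width up to an "adjusted" width of the form
      * 12 + 9 * n (a 12-bit page offset plus whole 9-bit page-table levels),
      * capped at 64 bits. E.g. 48 stays 48, while 50 becomes 57.
      */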
1902 static inline int guestwidth_to_adjustwidth(int gaw)
1903 {
1904         int agaw;
1905         int r = (gaw - 12) % 9;
1906
1907         if (r == 0)
1908                 agaw = gaw;
1909         else
1910                 agaw = gaw + 9 - r;
1911         if (agaw > 64)
1912                 agaw = 64;
1913         return agaw;
1914 }
1915
1916 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1917                        int guest_width)
1918 {
1919         int adjust_width, agaw;
1920         unsigned long sagaw;
1921         int err;
1922
1923         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1924
1925         err = init_iova_flush_queue(&domain->iovad,
1926                                     iommu_flush_iova, iova_entry_free);
1927         if (err)
1928                 return err;
1929
1930         domain_reserve_special_ranges(domain);
1931
1932         /* calculate AGAW */
1933         if (guest_width > cap_mgaw(iommu->cap))
1934                 guest_width = cap_mgaw(iommu->cap);
1935         domain->gaw = guest_width;
1936         adjust_width = guestwidth_to_adjustwidth(guest_width);
1937         agaw = width_to_agaw(adjust_width);
1938         sagaw = cap_sagaw(iommu->cap);
1939         if (!test_bit(agaw, &sagaw)) {
1940                 /* hardware doesn't support it, choose a bigger one */
1941                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1942                 agaw = find_next_bit(&sagaw, 5, agaw);
1943                 if (agaw >= 5)
1944                         return -ENODEV;
1945         }
1946         domain->agaw = agaw;
1947
1948         if (ecap_coherent(iommu->ecap))
1949                 domain->iommu_coherency = 1;
1950         else
1951                 domain->iommu_coherency = 0;
1952
1953         if (ecap_sc_support(iommu->ecap))
1954                 domain->iommu_snooping = 1;
1955         else
1956                 domain->iommu_snooping = 0;
1957
1958         if (intel_iommu_superpage)
1959                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1960         else
1961                 domain->iommu_superpage = 0;
1962
1963         domain->nid = iommu->node;
1964
1965         /* always allocate the top pgd */
1966         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1967         if (!domain->pgd)
1968                 return -ENOMEM;
1969         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1970         return 0;
1971 }
1972
1973 static void domain_exit(struct dmar_domain *domain)
1974 {
1975         struct page *freelist = NULL;
1976
1977         /* Domain 0 is reserved, so don't process it */
1978         if (!domain)
1979                 return;
1980
1981         /* Remove associated devices and clear attached or cached domains */
1982         rcu_read_lock();
1983         domain_remove_dev_info(domain);
1984         rcu_read_unlock();
1985
1986         /* destroy iovas */
1987         put_iova_domain(&domain->iovad);
1988
1989         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1990
1991         dma_free_pagelist(freelist);
1992
1993         free_domain_mem(domain);
1994 }
1995
1996 static int domain_context_mapping_one(struct dmar_domain *domain,
1997                                       struct intel_iommu *iommu,
1998                                       u8 bus, u8 devfn)
1999 {
2000         u16 did = domain->iommu_did[iommu->seq_id];
2001         int translation = CONTEXT_TT_MULTI_LEVEL;
2002         struct device_domain_info *info = NULL;
2003         struct context_entry *context;
2004         unsigned long flags;
2005         struct dma_pte *pgd;
2006         int ret, agaw;
2007
2008         WARN_ON(did == 0);
2009
2010         if (hw_pass_through && domain_type_is_si(domain))
2011                 translation = CONTEXT_TT_PASS_THROUGH;
2012
2013         pr_debug("Set context mapping for %02x:%02x.%d\n",
2014                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2015
2016         BUG_ON(!domain->pgd);
2017
2018         spin_lock_irqsave(&device_domain_lock, flags);
2019         spin_lock(&iommu->lock);
2020
2021         ret = -ENOMEM;
2022         context = iommu_context_addr(iommu, bus, devfn, 1);
2023         if (!context)
2024                 goto out_unlock;
2025
2026         ret = 0;
2027         if (context_present(context))
2028                 goto out_unlock;
2029
2030         /*
2031          * For kdump cases, old valid entries may be cached due to the
2032          * in-flight DMA and copied pgtable, but there is no unmapping
2033          * behaviour for them, thus we need an explicit cache flush for
2034          * the newly-mapped device. For kdump, at this point, the device
2035          * is supposed to have finished its reset at driver probe stage, so no
2036          * in-flight DMA will exist, and we don't need to worry about it
2037          * hereafter.
2038          */
2039         if (context_copied(context)) {
2040                 u16 did_old = context_domain_id(context);
2041
2042                 if (did_old < cap_ndoms(iommu->cap)) {
2043                         iommu->flush.flush_context(iommu, did_old,
2044                                                    (((u16)bus) << 8) | devfn,
2045                                                    DMA_CCMD_MASK_NOBIT,
2046                                                    DMA_CCMD_DEVICE_INVL);
2047                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2048                                                  DMA_TLB_DSI_FLUSH);
2049                 }
2050         }
2051
2052         pgd = domain->pgd;
2053
2054         context_clear_entry(context);
2055         context_set_domain_id(context, did);
2056
2057         /*
2058          * Skip the top levels of the page tables for an iommu whose agaw is
2059          * smaller than the default. Unnecessary for PT mode.
2060          */
2061         if (translation != CONTEXT_TT_PASS_THROUGH) {
2062                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2063                         ret = -ENOMEM;
2064                         pgd = phys_to_virt(dma_pte_addr(pgd));
2065                         if (!dma_pte_present(pgd))
2066                                 goto out_unlock;
2067                 }
2068
2069                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2070                 if (info && info->ats_supported)
2071                         translation = CONTEXT_TT_DEV_IOTLB;
2072                 else
2073                         translation = CONTEXT_TT_MULTI_LEVEL;
2074
2075                 context_set_address_root(context, virt_to_phys(pgd));
2076                 context_set_address_width(context, iommu->agaw);
2077         } else {
2078                 /*
2079                  * In pass through mode, AW must be programmed to
2080                  * indicate the largest AGAW value supported by
2081                  * hardware. And ASR is ignored by hardware.
2082                  */
2083                 context_set_address_width(context, iommu->msagaw);
2084         }
2085
2086         context_set_translation_type(context, translation);
2087         context_set_fault_enable(context);
2088         context_set_present(context);
2089         domain_flush_cache(domain, context, sizeof(*context));
2090
2091         /*
2092          * It's a non-present to present mapping. If hardware doesn't cache
2093          * non-present entries we only need to flush the write-buffer. If it
2094          * _does_ cache non-present entries, then it does so in the special
2095          * domain #0, which we have to flush:
2096          */
2097         if (cap_caching_mode(iommu->cap)) {
2098                 iommu->flush.flush_context(iommu, 0,
2099                                            (((u16)bus) << 8) | devfn,
2100                                            DMA_CCMD_MASK_NOBIT,
2101                                            DMA_CCMD_DEVICE_INVL);
2102                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2103         } else {
2104                 iommu_flush_write_buffer(iommu);
2105         }
2106         iommu_enable_dev_iotlb(info);
2107
2108         ret = 0;
2109
2110 out_unlock:
2111         spin_unlock(&iommu->lock);
2112         spin_unlock_irqrestore(&device_domain_lock, flags);
2113
2114         return ret;
2115 }
2116
2117 struct domain_context_mapping_data {
2118         struct dmar_domain *domain;
2119         struct intel_iommu *iommu;
2120 };
2121
2122 static int domain_context_mapping_cb(struct pci_dev *pdev,
2123                                      u16 alias, void *opaque)
2124 {
2125         struct domain_context_mapping_data *data = opaque;
2126
2127         return domain_context_mapping_one(data->domain, data->iommu,
2128                                           PCI_BUS_NUM(alias), alias & 0xff);
2129 }
2130
2131 static int
2132 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2133 {
2134         struct intel_iommu *iommu;
2135         u8 bus, devfn;
2136         struct domain_context_mapping_data data;
2137
2138         iommu = device_to_iommu(dev, &bus, &devfn);
2139         if (!iommu)
2140                 return -ENODEV;
2141
2142         if (!dev_is_pci(dev))
2143                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2144
2145         data.domain = domain;
2146         data.iommu = iommu;
2147
2148         return pci_for_each_dma_alias(to_pci_dev(dev),
2149                                       &domain_context_mapping_cb, &data);
2150 }
2151
2152 static int domain_context_mapped_cb(struct pci_dev *pdev,
2153                                     u16 alias, void *opaque)
2154 {
2155         struct intel_iommu *iommu = opaque;
2156
2157         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2158 }
2159
2160 static int domain_context_mapped(struct device *dev)
2161 {
2162         struct intel_iommu *iommu;
2163         u8 bus, devfn;
2164
2165         iommu = device_to_iommu(dev, &bus, &devfn);
2166         if (!iommu)
2167                 return -ENODEV;
2168
2169         if (!dev_is_pci(dev))
2170                 return device_context_mapped(iommu, bus, devfn);
2171
2172         return !pci_for_each_dma_alias(to_pci_dev(dev),
2173                                        domain_context_mapped_cb, iommu);
2174 }
2175
2176 /* Returns a number of VTD pages, but aligned to MM page size */
2177 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2178                                             size_t size)
2179 {
2180         host_addr &= ~PAGE_MASK;
2181         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2182 }
2183
2184 /* Return largest possible superpage level for a given mapping */
2185 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186                                           unsigned long iov_pfn,
2187                                           unsigned long phy_pfn,
2188                                           unsigned long pages)
2189 {
2190         int support, level = 1;
2191         unsigned long pfnmerge;
2192
2193         support = domain->iommu_superpage;
2194
2195         /* To use a large page, the virtual *and* physical addresses
2196            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197            of them will mean we have to use smaller pages. So just
2198            merge them and check both at once. */
2199         pfnmerge = iov_pfn | phy_pfn;
2200
2201         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202                 pages >>= VTD_STRIDE_SHIFT;
2203                 if (!pages)
2204                         break;
2205                 pfnmerge >>= VTD_STRIDE_SHIFT;
2206                 level++;
2207                 support--;
2208         }
2209         return level;
2210 }
2211
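     /*
      * Install PTEs for @nr_pages pages of @domain starting at @iov_pfn. The
      * source is either a scatterlist (@sg) or, when @sg is NULL, a physically
      * contiguous range starting at @phys_pfn. Superpage PTEs are used where
      * the alignment and remaining length allow; an already-present PTE is
      * never overwritten and triggers a warning instead.
      */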
2212 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213                             struct scatterlist *sg, unsigned long phys_pfn,
2214                             unsigned long nr_pages, int prot)
2215 {
2216         struct dma_pte *first_pte = NULL, *pte = NULL;
2217         phys_addr_t uninitialized_var(pteval);
2218         unsigned long sg_res = 0;
2219         unsigned int largepage_lvl = 0;
2220         unsigned long lvl_pages = 0;
2221
2222         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2223
2224         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2225                 return -EINVAL;
2226
2227         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2228
2229         if (!sg) {
2230                 sg_res = nr_pages;
2231                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2232         }
2233
2234         while (nr_pages > 0) {
2235                 uint64_t tmp;
2236
2237                 if (!sg_res) {
2238                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2239
2240                         sg_res = aligned_nrpages(sg->offset, sg->length);
2241                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242                         sg->dma_length = sg->length;
2243                         pteval = (sg_phys(sg) - pgoff) | prot;
2244                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2245                 }
2246
2247                 if (!pte) {
2248                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2249
2250                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2251                         if (!pte)
2252                                 return -ENOMEM;
2253                         /* It is a large page */
2254                         if (largepage_lvl > 1) {
2255                                 unsigned long nr_superpages, end_pfn;
2256
2257                                 pteval |= DMA_PTE_LARGE_PAGE;
2258                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2259
2260                                 nr_superpages = sg_res / lvl_pages;
2261                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2262
2263                                 /*
2264                                  * Ensure that old small page tables are
2265                                  * removed to make room for superpage(s).
2266                                  * We're adding new large pages, so make sure
2267                                  * we don't remove their parent tables.
2268                                  */
2269                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2270                                                        largepage_lvl + 1);
2271                         } else {
2272                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2273                         }
2274
2275                 }
2276                 /* We don't need a lock here; nobody else
2277                  * touches the iova range.
2278                  */
2279                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2280                 if (tmp) {
2281                         static int dumps = 5;
2282                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283                                 iov_pfn, tmp, (unsigned long long)pteval);
2284                         if (dumps) {
2285                                 dumps--;
2286                                 debug_dma_dump_mappings(NULL);
2287                         }
2288                         WARN_ON(1);
2289                 }
2290
2291                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2292
2293                 BUG_ON(nr_pages < lvl_pages);
2294                 BUG_ON(sg_res < lvl_pages);
2295
2296                 nr_pages -= lvl_pages;
2297                 iov_pfn += lvl_pages;
2298                 phys_pfn += lvl_pages;
2299                 pteval += lvl_pages * VTD_PAGE_SIZE;
2300                 sg_res -= lvl_pages;
2301
2302                 /* If the next PTE would be the first in a new page, then we
2303                    need to flush the cache on the entries we've just written.
2304                    And then we'll need to recalculate 'pte', so clear it and
2305                    let it get set again in the if (!pte) block above.
2306
2307                    If we're done (!nr_pages) we need to flush the cache too.
2308
2309                    Also if we've been setting superpages, we may need to
2310                    recalculate 'pte' and switch back to smaller pages for the
2311                    end of the mapping, if the trailing size is not enough to
2312                    use another superpage (i.e. sg_res < lvl_pages). */
2313                 pte++;
2314                 if (!nr_pages || first_pte_in_page(pte) ||
2315                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316                         domain_flush_cache(domain, first_pte,
2317                                            (void *)pte - (void *)first_pte);
2318                         pte = NULL;
2319                 }
2320
2321                 if (!sg_res && nr_pages)
2322                         sg = sg_next(sg);
2323         }
2324         return 0;
2325 }
2326
2327 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328                          struct scatterlist *sg, unsigned long phys_pfn,
2329                          unsigned long nr_pages, int prot)
2330 {
2331         int ret;
2332         struct intel_iommu *iommu;
2333
2334         /* Do the real mapping first */
2335         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2336         if (ret)
2337                 return ret;
2338
2339         /* Notify about the new mapping */
2340         if (domain_type_is_vm(domain)) {
2341                 /* VM-typed domains can have more than one IOMMU */
2342                 int iommu_id;
2343                 for_each_domain_iommu(iommu_id, domain) {
2344                         iommu = g_iommus[iommu_id];
2345                         __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2346                 }
2347         } else {
2348                 /* General domains only have one IOMMU */
2349                 iommu = domain_get_iommu(domain);
2350                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2351         }
2352
2353         return 0;
2354 }
2355
2356 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2357                                     struct scatterlist *sg, unsigned long nr_pages,
2358                                     int prot)
2359 {
2360         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2361 }
2362
2363 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2364                                      unsigned long phys_pfn, unsigned long nr_pages,
2365                                      int prot)
2366 {
2367         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2368 }
2369
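     /*
      * Clear the context entry for @bus/@devfn on @iommu and invalidate the
      * context-cache entry and the IOTLB entries of the domain-id it used to
      * point at.
      */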
2370 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2371 {
2372         unsigned long flags;
2373         struct context_entry *context;
2374         u16 did_old;
2375
2376         if (!iommu)
2377                 return;
2378
2379         spin_lock_irqsave(&iommu->lock, flags);
2380         context = iommu_context_addr(iommu, bus, devfn, 0);
2381         if (!context) {
2382                 spin_unlock_irqrestore(&iommu->lock, flags);
2383                 return;
2384         }
2385         did_old = context_domain_id(context);
2386         context_clear_entry(context);
2387         __iommu_flush_cache(iommu, context, sizeof(*context));
2388         spin_unlock_irqrestore(&iommu->lock, flags);
2389         iommu->flush.flush_context(iommu,
2390                                    did_old,
2391                                    (((u16)bus) << 8) | devfn,
2392                                    DMA_CCMD_MASK_NOBIT,
2393                                    DMA_CCMD_DEVICE_INVL);
2394         iommu->flush.flush_iotlb(iommu,
2395                                  did_old,
2396                                  0,
2397                                  0,
2398                                  DMA_TLB_DSI_FLUSH);
2399 }
2400
2401 static inline void unlink_domain_info(struct device_domain_info *info)
2402 {
2403         assert_spin_locked(&device_domain_lock);
2404         list_del(&info->link);
2405         list_del(&info->global);
2406         if (info->dev)
2407                 info->dev->archdata.iommu = NULL;
2408 }
2409
2410 static void domain_remove_dev_info(struct dmar_domain *domain)
2411 {
2412         struct device_domain_info *info, *tmp;
2413         unsigned long flags;
2414
2415         spin_lock_irqsave(&device_domain_lock, flags);
2416         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2417                 __dmar_remove_one_dev_info(info);
2418         spin_unlock_irqrestore(&device_domain_lock, flags);
2419 }
2420
2421 /*
2422  * find_domain
2423  * Note: we use struct device->archdata.iommu to store the info
2424  */
2425 static struct dmar_domain *find_domain(struct device *dev)
2426 {
2427         struct device_domain_info *info;
2428
2429         /* No lock here, assumes no domain exit in normal case */
2430         info = dev->archdata.iommu;
2431         if (likely(info))
2432                 return info->domain;
2433         return NULL;
2434 }
2435
2436 static inline struct device_domain_info *
2437 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2438 {
2439         struct device_domain_info *info;
2440
2441         list_for_each_entry(info, &device_domain_list, global)
2442                 if (info->iommu->segment == segment && info->bus == bus &&
2443                     info->devfn == devfn)
2444                         return info;
2445
2446         return NULL;
2447 }
2448
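     /*
      * Bind the device identified by @iommu/@bus/@devfn to @domain: allocate
      * and fill its device_domain_info, probe ATS/PASID/PRI support for PCI
      * devices, attach the domain to the IOMMU and program the context entry.
      * If the device (or its bus/devfn) already has a domain, that existing
      * domain is returned instead and the caller must free the one it passed
      * in; NULL is returned on failure.
      */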
2449 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2450                                                     int bus, int devfn,
2451                                                     struct device *dev,
2452                                                     struct dmar_domain *domain)
2453 {
2454         struct dmar_domain *found = NULL;
2455         struct device_domain_info *info;
2456         unsigned long flags;
2457         int ret;
2458
2459         info = alloc_devinfo_mem();
2460         if (!info)
2461                 return NULL;
2462
2463         info->bus = bus;
2464         info->devfn = devfn;
2465         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2466         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2467         info->ats_qdep = 0;
2468         info->dev = dev;
2469         info->domain = domain;
2470         info->iommu = iommu;
2471
2472         if (dev && dev_is_pci(dev)) {
2473                 struct pci_dev *pdev = to_pci_dev(info->dev);
2474
2475                 if (!pci_ats_disabled() &&
2476                     ecap_dev_iotlb_support(iommu->ecap) &&
2477                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2478                     dmar_find_matched_atsr_unit(pdev))
2479                         info->ats_supported = 1;
2480
2481                 if (ecs_enabled(iommu)) {
2482                         if (pasid_enabled(iommu)) {
2483                                 int features = pci_pasid_features(pdev);
2484                                 if (features >= 0)
2485                                         info->pasid_supported = features | 1;
2486                         }
2487
2488                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2489                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2490                                 info->pri_supported = 1;
2491                 }
2492         }
2493
2494         spin_lock_irqsave(&device_domain_lock, flags);
2495         if (dev)
2496                 found = find_domain(dev);
2497
2498         if (!found) {
2499                 struct device_domain_info *info2;
2500                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2501                 if (info2) {
2502                         found      = info2->domain;
2503                         info2->dev = dev;
2504                 }
2505         }
2506
2507         if (found) {
2508                 spin_unlock_irqrestore(&device_domain_lock, flags);
2509                 free_devinfo_mem(info);
2510                 /* Caller must free the original domain */
2511                 return found;
2512         }
2513
2514         spin_lock(&iommu->lock);
2515         ret = domain_attach_iommu(domain, iommu);
2516         spin_unlock(&iommu->lock);
2517
2518         if (ret) {
2519                 spin_unlock_irqrestore(&device_domain_lock, flags);
2520                 free_devinfo_mem(info);
2521                 return NULL;
2522         }
2523
2524         list_add(&info->link, &domain->devices);
2525         list_add(&info->global, &device_domain_list);
2526         if (dev)
2527                 dev->archdata.iommu = info;
2528         spin_unlock_irqrestore(&device_domain_lock, flags);
2529
2530         if (dev && domain_context_mapping(domain, dev)) {
2531                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2532                 dmar_remove_one_dev_info(domain, dev);
2533                 return NULL;
2534         }
2535
2536         return domain;
2537 }
2538
2539 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2540 {
2541         *(u16 *)opaque = alias;
2542         return 0;
2543 }
2544
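     /*
      * Find the domain already used by @dev's DMA alias (the last alias
      * reported by pci_for_each_dma_alias()), or allocate and initialize a
      * fresh domain with the requested guest address width.
      */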
2545 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2546 {
2547         struct device_domain_info *info = NULL;
2548         struct dmar_domain *domain = NULL;
2549         struct intel_iommu *iommu;
2550         u16 dma_alias;
2551         unsigned long flags;
2552         u8 bus, devfn;
2553
2554         iommu = device_to_iommu(dev, &bus, &devfn);
2555         if (!iommu)
2556                 return NULL;
2557
2558         if (dev_is_pci(dev)) {
2559                 struct pci_dev *pdev = to_pci_dev(dev);
2560
2561                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2562
2563                 spin_lock_irqsave(&device_domain_lock, flags);
2564                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2565                                                       PCI_BUS_NUM(dma_alias),
2566                                                       dma_alias & 0xff);
2567                 if (info) {
2568                         iommu = info->iommu;
2569                         domain = info->domain;
2570                 }
2571                 spin_unlock_irqrestore(&device_domain_lock, flags);
2572
2573                 /* DMA alias already has a domain, use it */
2574                 if (info)
2575                         goto out;
2576         }
2577
2578         /* Allocate and initialize new domain for the device */
2579         domain = alloc_domain(0);
2580         if (!domain)
2581                 return NULL;
2582         if (domain_init(domain, iommu, gaw)) {
2583                 domain_exit(domain);
2584                 return NULL;
2585         }
2586
2587 out:
2588
2589         return domain;
2590 }
2591
2592 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2593                                               struct dmar_domain *domain)
2594 {
2595         struct intel_iommu *iommu;
2596         struct dmar_domain *tmp;
2597         u16 req_id, dma_alias;
2598         u8 bus, devfn;
2599
2600         iommu = device_to_iommu(dev, &bus, &devfn);
2601         if (!iommu)
2602                 return NULL;
2603
2604         req_id = ((u16)bus << 8) | devfn;
2605
2606         if (dev_is_pci(dev)) {
2607                 struct pci_dev *pdev = to_pci_dev(dev);
2608
2609                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2610
2611                 /* register PCI DMA alias device */
2612                 if (req_id != dma_alias) {
2613                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2614                                         dma_alias & 0xff, NULL, domain);
2615
2616                         if (!tmp || tmp != domain)
2617                                 return tmp;
2618                 }
2619         }
2620
2621         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2622         if (!tmp || tmp != domain)
2623                 return tmp;
2624
2625         return domain;
2626 }
2627
2628 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2629 {
2630         struct dmar_domain *domain, *tmp;
2631
2632         domain = find_domain(dev);
2633         if (domain)
2634                 goto out;
2635
2636         domain = find_or_alloc_domain(dev, gaw);
2637         if (!domain)
2638                 goto out;
2639
2640         tmp = set_domain_for_dev(dev, domain);
2641         if (!tmp || domain != tmp) {
2642                 domain_exit(domain);
2643                 domain = tmp;
2644         }
2645
2646 out:
2647
2648         return domain;
2649 }
2650
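     /*
      * Reserve the IOVA range [start, end] in @domain and install a 1:1
      * (physical == virtual) read/write mapping for it, clearing any PTEs
      * that already cover the range.
      */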
2651 static int iommu_domain_identity_map(struct dmar_domain *domain,
2652                                      unsigned long long start,
2653                                      unsigned long long end)
2654 {
2655         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2656         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2657
2658         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2659                           dma_to_mm_pfn(last_vpfn))) {
2660                 pr_err("Reserving iova failed\n");
2661                 return -ENOMEM;
2662         }
2663
2664         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2665         /*
2666          * RMRR range might have overlap with physical memory range,
2667          * clear it first
2668          */
2669         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2670
2671         return __domain_mapping(domain, first_vpfn, NULL,
2672                                 first_vpfn, last_vpfn - first_vpfn + 1,
2673                                 DMA_PTE_READ|DMA_PTE_WRITE);
2674 }
2675
2676 static int domain_prepare_identity_map(struct device *dev,
2677                                        struct dmar_domain *domain,
2678                                        unsigned long long start,
2679                                        unsigned long long end)
2680 {
2681         /* For _hardware_ passthrough, don't bother. But for software
2682            passthrough, we do it anyway -- it may indicate a memory
2683            range which is reserved in E820 and so didn't get set
2684            up to start with in si_domain */
2685         if (domain == si_domain && hw_pass_through) {
2686                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2687                         dev_name(dev), start, end);
2688                 return 0;
2689         }
2690
2691         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2692                 dev_name(dev), start, end);
2693
2694         if (end < start) {
2695                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2696                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2697                         dmi_get_system_info(DMI_BIOS_VENDOR),
2698                         dmi_get_system_info(DMI_BIOS_VERSION),
2699                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2700                 return -EIO;
2701         }
2702
2703         if (end >> agaw_to_width(domain->agaw)) {
2704                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2705                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2706                      agaw_to_width(domain->agaw),
2707                      dmi_get_system_info(DMI_BIOS_VENDOR),
2708                      dmi_get_system_info(DMI_BIOS_VERSION),
2709                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2710                 return -EIO;
2711         }
2712
2713         return iommu_domain_identity_map(domain, start, end);
2714 }
2715
2716 static int iommu_prepare_identity_map(struct device *dev,
2717                                       unsigned long long start,
2718                                       unsigned long long end)
2719 {
2720         struct dmar_domain *domain;
2721         int ret;
2722
2723         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2724         if (!domain)
2725                 return -ENOMEM;
2726
2727         ret = domain_prepare_identity_map(dev, domain, start, end);
2728         if (ret)
2729                 domain_exit(domain);
2730
2731         return ret;
2732 }
2733
2734 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2735                                          struct device *dev)
2736 {
2737         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2738                 return 0;
2739         return iommu_prepare_identity_map(dev, rmrr->base_address,
2740                                           rmrr->end_address);
2741 }
2742
2743 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2744 static inline void iommu_prepare_isa(void)
2745 {
2746         struct pci_dev *pdev;
2747         int ret;
2748
2749         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2750         if (!pdev)
2751                 return;
2752
2753         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2754         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2755
2756         if (ret)
2757                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2758
2759         pci_dev_put(pdev);
2760 }
2761 #else
2762 static inline void iommu_prepare_isa(void)
2763 {
2764         return;
2765 }
2766 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2767
2768 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2769
2770 static int __init si_domain_init(int hw)
2771 {
2772         int nid, ret = 0;
2773
2774         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2775         if (!si_domain)
2776                 return -EFAULT;
2777
2778         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2779                 domain_exit(si_domain);
2780                 return -EFAULT;
2781         }
2782
2783         pr_debug("Identity mapping domain allocated\n");
2784
2785         if (hw)
2786                 return 0;
2787
2788         for_each_online_node(nid) {
2789                 unsigned long start_pfn, end_pfn;
2790                 int i;
2791
2792                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2793                         ret = iommu_domain_identity_map(si_domain,
2794                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2795                         if (ret)
2796                                 return ret;
2797                 }
2798         }
2799
2800         return 0;
2801 }
2802
2803 static int identity_mapping(struct device *dev)
2804 {
2805         struct device_domain_info *info;
2806
2807         if (likely(!iommu_identity_mapping))
2808                 return 0;
2809
2810         info = dev->archdata.iommu;
2811         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2812                 return (info->domain == si_domain);
2813
2814         return 0;
2815 }
2816
2817 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2818 {
2819         struct dmar_domain *ndomain;
2820         struct intel_iommu *iommu;
2821         u8 bus, devfn;
2822
2823         iommu = device_to_iommu(dev, &bus, &devfn);
2824         if (!iommu)
2825                 return -ENODEV;
2826
2827         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2828         if (ndomain != domain)
2829                 return -EBUSY;
2830
2831         return 0;
2832 }
2833
2834 static bool device_has_rmrr(struct device *dev)
2835 {
2836         struct dmar_rmrr_unit *rmrr;
2837         struct device *tmp;
2838         int i;
2839
2840         rcu_read_lock();
2841         for_each_rmrr_units(rmrr) {
2842                 /*
2843                  * Return TRUE if this RMRR contains the device that
2844                  * is passed in.
2845                  */
2846                 for_each_active_dev_scope(rmrr->devices,
2847                                           rmrr->devices_cnt, i, tmp)
2848                         if (tmp == dev) {
2849                                 rcu_read_unlock();
2850                                 return true;
2851                         }
2852         }
2853         rcu_read_unlock();
2854         return false;
2855 }
2856
2857 /*
2858  * There are a couple cases where we need to restrict the functionality of
2859  * devices associated with RMRRs.  The first is when evaluating a device for
2860  * identity mapping because problems exist when devices are moved in and out
2861  * of domains and their respective RMRR information is lost.  This means that
2862  * a device with associated RMRRs will never be in a "passthrough" domain.
2863  * The second is use of the device through the IOMMU API.  This interface
2864  * expects to have full control of the IOVA space for the device.  We cannot
2865  * satisfy both the requirement that RMRR access is maintained and have an
2866  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2867  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2868  * We therefore prevent devices associated with an RMRR from participating in
2869  * the IOMMU API, which eliminates them from device assignment.
2870  *
2871  * In both cases we assume that PCI USB devices with RMRRs have them largely
2872  * for historical reasons and that the RMRR space is not actively used post
2873  * boot.  This exclusion may change if vendors begin to abuse it.
2874  *
2875  * The same exception is made for graphics devices, with the requirement that
2876  * any use of the RMRR regions will be torn down before assigning the device
2877  * to a guest.
2878  */
2879 static bool device_is_rmrr_locked(struct device *dev)
2880 {
2881         if (!device_has_rmrr(dev))
2882                 return false;
2883
2884         if (dev_is_pci(dev)) {
2885                 struct pci_dev *pdev = to_pci_dev(dev);
2886
2887                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2888                         return false;
2889         }
2890
2891         return true;
2892 }
2893
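     /*
      * Decide whether @dev should live in the static identity (1:1) map:
      * RMRR-locked devices never do; Azalia and graphics devices may be
      * forced in by the identity-mapping flags; conventional-PCI devices
      * behind bridges are excluded; and after boot (@startup == 0) the
      * device's DMA mask must cover all of system memory
      * (dma_get_required_mask()).
      */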
2894 static int iommu_should_identity_map(struct device *dev, int startup)
2895 {
2896
2897         if (dev_is_pci(dev)) {
2898                 struct pci_dev *pdev = to_pci_dev(dev);
2899
2900                 if (device_is_rmrr_locked(dev))
2901                         return 0;
2902
2903                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2904                         return 1;
2905
2906                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2907                         return 1;
2908
2909                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2910                         return 0;
2911
2912                 /*
2913                  * We want to start off with all devices in the 1:1 domain, and
2914                  * take them out later if we find they can't access all of memory.
2915                  *
2916                  * However, we can't do this for PCI devices behind bridges,
2917                  * because all PCI devices behind the same bridge will end up
2918                  * with the same source-id on their transactions.
2919                  *
2920                  * Practically speaking, we can't change things around for these
2921                  * devices at run-time, because we can't be sure there'll be no
2922                  * DMA transactions in flight for any of their siblings.
2923                  *
2924                  * So PCI devices (unless they're on the root bus) as well as
2925                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2926                  * the 1:1 domain, just in _case_ one of their siblings turns out
2927                  * not to be able to map all of memory.
2928                  */
2929                 if (!pci_is_pcie(pdev)) {
2930                         if (!pci_is_root_bus(pdev->bus))
2931                                 return 0;
2932                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2933                                 return 0;
2934                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2935                         return 0;
2936         } else {
2937                 if (device_has_rmrr(dev))
2938                         return 0;
2939         }
2940
2941         /*
2942          * At boot time, we don't yet know if devices will be 64-bit capable.
2943          * Assume that they will — if they turn out not to be, then we can
2944          * take them out of the 1:1 domain later.
2945          */
2946         if (!startup) {
2947                 /*
2948                  * If the device's dma_mask is less than the system's memory
2949                  * size then this is not a candidate for identity mapping.
2950                  */
2951                 u64 dma_mask = *dev->dma_mask;
2952
2953                 if (dev->coherent_dma_mask &&
2954                     dev->coherent_dma_mask < dma_mask)
2955                         dma_mask = dev->coherent_dma_mask;
2956
2957                 return dma_mask >= dma_get_required_mask(dev);
2958         }
2959
2960         return 1;
2961 }
2962
2963 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2964 {
2965         int ret;
2966
2967         if (!iommu_should_identity_map(dev, 1))
2968                 return 0;
2969
2970         ret = domain_add_dev_info(si_domain, dev);
2971         if (!ret)
2972                 pr_info("%s identity mapping for device %s\n",
2973                         hw ? "Hardware" : "Software", dev_name(dev));
2974         else if (ret == -ENODEV)
2975                 /* device not associated with an iommu */
2976                 ret = 0;
2977
2978         return ret;
2979 }
2980
2981
2982 static int __init iommu_prepare_static_identity_mapping(int hw)
2983 {
2984         struct pci_dev *pdev = NULL;
2985         struct dmar_drhd_unit *drhd;
2986         struct intel_iommu *iommu;
2987         struct device *dev;
2988         int i;
2989         int ret = 0;
2990
2991         for_each_pci_dev(pdev) {
2992                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2993                 if (ret)
2994                         return ret;
2995         }
2996
2997         for_each_active_iommu(iommu, drhd)
2998                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2999                         struct acpi_device_physical_node *pn;
3000                         struct acpi_device *adev;
3001
3002                         if (dev->bus != &acpi_bus_type)
3003                                 continue;
3004
3005                         adev = to_acpi_device(dev);
3006                         mutex_lock(&adev->physical_node_lock);
3007                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3008                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3009                                 if (ret)
3010                                         break;
3011                         }
3012                         mutex_unlock(&adev->physical_node_lock);
3013                         if (ret)
3014                                 return ret;
3015                 }
3016
3017         return 0;
3018 }
3019
3020 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3021 {
3022         /*
3023          * Start from a sane IOMMU hardware state.
3024          * If queued invalidation was already initialized by us (for
3025          * example, while enabling interrupt remapping) then things are
3026          * already rolling from a sane state.
3027          */
3028         if (!iommu->qi) {
3029                 /*
3030                  * Clear any previous faults.
3031                  */
3032                 dmar_fault(-1, iommu);
3033                 /*
3034                  * Disable queued invalidation if supported and already enabled
3035                  * before OS handover.
3036                  */
3037                 dmar_disable_qi(iommu);
3038         }
3039
3040         if (dmar_enable_qi(iommu)) {
3041                 /*
3042                  * Queued invalidation is not enabled, use register-based invalidation
3043                  */
3044                 iommu->flush.flush_context = __iommu_flush_context;
3045                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3046                 pr_info("%s: Using Register based invalidation\n",
3047                         iommu->name);
3048         } else {
3049                 iommu->flush.flush_context = qi_flush_context;
3050                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3051                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3052         }
3053 }
3054
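/*
 * Layout note on the copy below: with the extended (ECS) root/context format
 * each context entry is 256 bits wide, so one 4K context table only covers
 * 128 device functions.  Devfns 0x00-0x7f are reached through the lower
 * context-table pointer of the root entry and 0x80-0xff through the upper
 * one, which is why tbl_idx is doubled in the extended case and idx wraps
 * modulo 256.
 */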
3055 static int copy_context_table(struct intel_iommu *iommu,
3056                               struct root_entry *old_re,
3057                               struct context_entry **tbl,
3058                               int bus, bool ext)
3059 {
3060         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3061         struct context_entry *new_ce = NULL, ce;
3062         struct context_entry *old_ce = NULL;
3063         struct root_entry re;
3064         phys_addr_t old_ce_phys;
3065
3066         tbl_idx = ext ? bus * 2 : bus;
3067         memcpy(&re, old_re, sizeof(re));
3068
3069         for (devfn = 0; devfn < 256; devfn++) {
3070                 /* First calculate the correct index */
3071                 idx = (ext ? devfn * 2 : devfn) % 256;
3072
3073                 if (idx == 0) {
3074                         /* First save what we may have and clean up */
3075                         if (new_ce) {
3076                                 tbl[tbl_idx] = new_ce;
3077                                 __iommu_flush_cache(iommu, new_ce,
3078                                                     VTD_PAGE_SIZE);
3079                                 pos = 1;
3080                         }
3081
3082                         if (old_ce)
3083                                 iounmap(old_ce);
3084
3085                         ret = 0;
3086                         if (devfn < 0x80)
3087                                 old_ce_phys = root_entry_lctp(&re);
3088                         else
3089                                 old_ce_phys = root_entry_uctp(&re);
3090
3091                         if (!old_ce_phys) {
3092                                 if (ext && devfn == 0) {
3093                                         /* No LCTP, try UCTP */
3094                                         devfn = 0x7f;
3095                                         continue;
3096                                 } else {
3097                                         goto out;
3098                                 }
3099                         }
3100
3101                         ret = -ENOMEM;
3102                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3103                                         MEMREMAP_WB);
3104                         if (!old_ce)
3105                                 goto out;
3106
3107                         new_ce = alloc_pgtable_page(iommu->node);
3108                         if (!new_ce)
3109                                 goto out_unmap;
3110
3111                         ret = 0;
3112                 }
3113
3114                 /* Now copy the context entry */
3115                 memcpy(&ce, old_ce + idx, sizeof(ce));
3116
3117                 if (!__context_present(&ce))
3118                         continue;
3119
3120                 did = context_domain_id(&ce);
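                /*
                 * Reserve the domain ID the old kernel used for this entry so
                 * that domains allocated by this kernel cannot collide with
                 * translations that are still live.
                 */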
3121                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3122                         set_bit(did, iommu->domain_ids);
3123
3124                 /*
3125                  * We need a marker for copied context entries. This
3126                  * marker needs to work for the old format as well as
3127                  * for extended context entries.
3128                  *
3129                  * Bit 67 of the context entry is used. In the old
3130                  * format this bit is available to software, in the
3131                  * extended format it is the PGE bit, but PGE is ignored
3132                  * by HW if PASIDs are disabled (and thus still
3133                  * available).
3134                  *
3135                  * So disable PASIDs first and then mark the entry
3136                  * copied. This means that we don't copy PASID
3137                  * translations from the old kernel, but this is fine as
3138                  * faults there are not fatal.
3139                  */
3140                 context_clear_pasid_enable(&ce);
3141                 context_set_copied(&ce);
3142
3143                 new_ce[idx] = ce;
3144         }
3145
3146         tbl[tbl_idx + pos] = new_ce;
3147
3148         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3149
3150 out_unmap:
3151         memunmap(old_ce);
3152
3153 out:
3154         return ret;
3155 }
3156
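/*
 * Called on kdump handover when the previous kernel left translation enabled:
 * re-map the old root/context tables and mirror them into this kernel's
 * root_entry table so ongoing DMA keeps working until drivers re-initialize
 * their devices.
 */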
3157 static int copy_translation_tables(struct intel_iommu *iommu)
3158 {
3159         struct context_entry **ctxt_tbls;
3160         struct root_entry *old_rt;
3161         phys_addr_t old_rt_phys;
3162         int ctxt_table_entries;
3163         unsigned long flags;
3164         u64 rtaddr_reg;
3165         int bus, ret;
3166         bool new_ext, ext;
3167
3168         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3169         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3170         new_ext    = !!ecap_ecs(iommu->ecap);
3171
3172         /*
3173          * The RTT bit can only be changed when translation is disabled,
3174          * but disabling translation means to open a window for data
3175          * corruption. So bail out and don't copy anything if we would
3176          * have to change the bit.
3177          */
3178         if (new_ext != ext)
3179                 return -EINVAL;
3180
3181         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3182         if (!old_rt_phys)
3183                 return -EINVAL;
3184
3185         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3186         if (!old_rt)
3187                 return -ENOMEM;
3188
3189         /* This is too big for the stack - allocate it from slab */
3190         ctxt_table_entries = ext ? 512 : 256;
3191         ret = -ENOMEM;
3192         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3193         if (!ctxt_tbls)
3194                 goto out_unmap;
3195
3196         for (bus = 0; bus < 256; bus++) {
3197                 ret = copy_context_table(iommu, &old_rt[bus],
3198                                          ctxt_tbls, bus, ext);
3199                 if (ret) {
3200                         pr_err("%s: Failed to copy context table for bus %d\n",
3201                                 iommu->name, bus);
3202                         continue;
3203                 }
3204         }
3205
3206         spin_lock_irqsave(&iommu->lock, flags);
3207
3208         /* Context tables are copied, now write them to the root_entry table */
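        /*
         * Bit 0 of each pointer is the present bit.  In the extended layout,
         * 'lo' points at the context table for devfn 0x00-0x7f and 'hi' at
         * the one for devfn 0x80-0xff.
         */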
3209         for (bus = 0; bus < 256; bus++) {
3210                 int idx = ext ? bus * 2 : bus;
3211                 u64 val;
3212
3213                 if (ctxt_tbls[idx]) {
3214                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3215                         iommu->root_entry[bus].lo = val;
3216                 }
3217
3218                 if (!ext || !ctxt_tbls[idx + 1])
3219                         continue;
3220
3221                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3222                 iommu->root_entry[bus].hi = val;
3223         }
3224
3225         spin_unlock_irqrestore(&iommu->lock, flags);
3226
3227         kfree(ctxt_tbls);
3228
3229         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3230
3231         ret = 0;
3232
3233 out_unmap:
3234         memunmap(old_rt);
3235
3236         return ret;
3237 }
3238
3239 static int __init init_dmars(void)
3240 {
3241         struct dmar_drhd_unit *drhd;
3242         struct dmar_rmrr_unit *rmrr;
3243         bool copied_tables = false;
3244         struct device *dev;
3245         struct intel_iommu *iommu;
3246         int i, ret;
3247
3248         /*
3249          * for each drhd
3250          *    allocate root
3251          *    initialize and program root entry to not present
3252          * endfor
3253          */
3254         for_each_drhd_unit(drhd) {
3255                 /*
3256                  * Lock not needed as this is only incremented in the
3257                  * single-threaded kernel __init code path; all other
3258                  * accesses are read-only.
3259                  */
3260                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3261                         g_num_of_iommus++;
3262                         continue;
3263                 }
3264                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3265         }
3266
3267         /* Preallocate enough resources for IOMMU hot-addition */
3268         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3269                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3270
3271         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3272                         GFP_KERNEL);
3273         if (!g_iommus) {
3274                 pr_err("Allocating global iommu array failed\n");
3275                 ret = -ENOMEM;
3276                 goto error;
3277         }
3278
3279         for_each_active_iommu(iommu, drhd) {
3280                 g_iommus[iommu->seq_id] = iommu;
3281
3282                 intel_iommu_init_qi(iommu);
3283
3284                 ret = iommu_init_domains(iommu);
3285                 if (ret)
3286                         goto free_iommu;
3287
3288                 init_translation_status(iommu);
3289
3290                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3291                         iommu_disable_translation(iommu);
3292                         clear_translation_pre_enabled(iommu);
3293                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3294                                 iommu->name);
3295                 }
3296
3297                 /*
3298                  * TBD:
3299                  * we could share the same root & context tables
3300                  * among all IOMMUs. Need to split it later.
3301                  */
3302                 ret = iommu_alloc_root_entry(iommu);
3303                 if (ret)
3304                         goto free_iommu;
3305
3306                 if (translation_pre_enabled(iommu)) {
3307                         pr_info("Translation already enabled - trying to copy translation structures\n");
3308
3309                         ret = copy_translation_tables(iommu);
3310                         if (ret) {
3311                                 /*
3312                                  * We found the IOMMU with translation
3313                                  * enabled - but failed to copy over the
3314                                  * old root-entry table. Try to proceed
3315                                  * by disabling translation now and
3316                                  * allocating a clean root-entry table.
3317                                  * This might cause DMAR faults, but
3318                                  * probably the dump will still succeed.
3319                                  */
3320                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3321                                        iommu->name);
3322                                 iommu_disable_translation(iommu);
3323                                 clear_translation_pre_enabled(iommu);
3324                         } else {
3325                                 pr_info("Copied translation tables from previous kernel for %s\n",
3326                                         iommu->name);
3327                                 copied_tables = true;
3328                         }
3329                 }
3330
3331                 if (!ecap_pass_through(iommu->ecap))
3332                         hw_pass_through = 0;
3333 #ifdef CONFIG_INTEL_IOMMU_SVM
3334                 if (pasid_enabled(iommu))
3335                         intel_svm_alloc_pasid_tables(iommu);
3336 #endif
3337         }
3338
3339         /*
3340          * Now that qi is enabled on all iommus, set the root entry and flush
3341          * caches. This is required on some Intel X58 chipsets, otherwise the
3342          * flush_context function will loop forever and the boot hangs.
3343          */
3344         for_each_active_iommu(iommu, drhd) {
3345                 iommu_flush_write_buffer(iommu);
3346                 iommu_set_root_entry(iommu);
3347                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3348                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3349         }
3350
3351         if (iommu_pass_through)
3352                 iommu_identity_mapping |= IDENTMAP_ALL;
3353
3354 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3355         iommu_identity_mapping |= IDENTMAP_GFX;
3356 #endif
3357
3358         check_tylersburg_isoch();
3359
3360         if (iommu_identity_mapping) {
3361                 ret = si_domain_init(hw_pass_through);
3362                 if (ret)
3363                         goto free_iommu;
3364         }
3365
3366
3367         /*
3368          * If we copied translations from a previous kernel in the kdump
3369          * case, we can not assign the devices to domains now, as that
3370          * would eliminate the old mappings. So skip this part and defer
3371          * the assignment to device driver initialization time.
3372          */
3373         if (copied_tables)
3374                 goto domains_done;
3375
3376         /*
3377          * If pass-through is not set or not enabled, set up context entries
3378          * for identity mappings for RMRR, GFX and ISA, and possibly fall back
3379          * to static identity mapping if iommu_identity_mapping is set.
3380          */
3381         if (iommu_identity_mapping) {
3382                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3383                 if (ret) {
3384                         pr_crit("Failed to setup IOMMU pass-through\n");
3385                         goto free_iommu;
3386                 }
3387         }
3388         /*
3389          * For each rmrr
3390          *   for each dev attached to rmrr
3391          *   do
3392          *     locate drhd for dev, alloc domain for dev
3393          *     allocate free domain
3394          *     allocate page table entries for rmrr
3395          *     if context not allocated for bus
3396          *           allocate and init context
3397          *           set present in root table for this bus
3398          *     init context with domain, translation etc
3399          *    endfor
3400          * endfor
3401          */
3402         pr_info("Setting RMRR:\n");
3403         for_each_rmrr_units(rmrr) {
3404                 /* Some BIOSes list non-existent devices in the DMAR table. */
3405                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3406                                           i, dev) {
3407                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3408                         if (ret)
3409                                 pr_err("Mapping reserved region failed\n");
3410                 }
3411         }
3412
3413         iommu_prepare_isa();
3414
3415 domains_done:
3416
3417         /*
3418          * for each drhd
3419          *   enable fault log
3420          *   global invalidate context cache
3421          *   global invalidate iotlb
3422          *   enable translation
3423          */
3424         for_each_iommu(iommu, drhd) {
3425                 if (drhd->ignored) {
3426                         /*
3427                          * we always have to disable PMRs or DMA may fail on
3428                          * this device
3429                          */
3430                         if (force_on)
3431                                 iommu_disable_protect_mem_regions(iommu);
3432                         continue;
3433                 }
3434
3435                 iommu_flush_write_buffer(iommu);
3436
3437 #ifdef CONFIG_INTEL_IOMMU_SVM
3438                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3439                         ret = intel_svm_enable_prq(iommu);
3440                         if (ret)
3441                                 goto free_iommu;
3442                 }
3443 #endif
3444                 ret = dmar_set_interrupt(iommu);
3445                 if (ret)
3446                         goto free_iommu;
3447
3448                 if (!translation_pre_enabled(iommu))
3449                         iommu_enable_translation(iommu);
3450
3451                 iommu_disable_protect_mem_regions(iommu);
3452         }
3453
3454         return 0;
3455
3456 free_iommu:
3457         for_each_active_iommu(iommu, drhd) {
3458                 disable_dmar_iommu(iommu);
3459                 free_dmar_iommu(iommu);
3460         }
3461
3462         kfree(g_iommus);
3463
3464 error:
3465         return ret;
3466 }
3467
3468 /* This takes a number of _MM_ pages, not VTD pages */
3469 static unsigned long intel_alloc_iova(struct device *dev,
3470                                      struct dmar_domain *domain,
3471                                      unsigned long nrpages, uint64_t dma_mask)
3472 {
3473         unsigned long iova_pfn = 0;
3474
3475         /* Restrict dma_mask to the width that the iommu can handle */
3476         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3477         /* Ensure we reserve the whole size-aligned region */
3478         nrpages = __roundup_pow_of_two(nrpages);
3479
3480         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3481                 /*
3482                  * First try to allocate an io virtual address in
3483                  * DMA_BIT_MASK(32) and if that fails then try allocating
3484                  * from higher range
3485                  */
3486                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3487                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3488                 if (iova_pfn)
3489                         return iova_pfn;
3490         }
3491         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3492                                    IOVA_PFN(dma_mask), true);
3493         if (unlikely(!iova_pfn)) {
3494                 pr_err("Allocating %ld-page iova for %s failed\n",
3495                        nrpages, dev_name(dev));
3496                 return 0;
3497         }
3498
3499         return iova_pfn;
3500 }
3501
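/*
 * Find the DMA domain already attached to @dev, or allocate one and pre-map
 * any RMRR regions whose device scope includes this device.
 */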
3502 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3503 {
3504         struct dmar_domain *domain, *tmp;
3505         struct dmar_rmrr_unit *rmrr;
3506         struct device *i_dev;
3507         int i, ret;
3508
3509         domain = find_domain(dev);
3510         if (domain)
3511                 goto out;
3512
3513         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3514         if (!domain)
3515                 goto out;
3516
3517         /* We have a new domain - setup possible RMRRs for the device */
3518         rcu_read_lock();
3519         for_each_rmrr_units(rmrr) {
3520                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3521                                           i, i_dev) {
3522                         if (i_dev != dev)
3523                                 continue;
3524
3525                         ret = domain_prepare_identity_map(dev, domain,
3526                                                           rmrr->base_address,
3527                                                           rmrr->end_address);
3528                         if (ret)
3529                                 dev_err(dev, "Mapping reserved region failed\n");
3530                 }
3531         }
3532         rcu_read_unlock();
3533
3534         tmp = set_domain_for_dev(dev, domain);
3535         if (!tmp || domain != tmp) {
3536                 domain_exit(domain);
3537                 domain = tmp;
3538         }
3539
3540 out:
3541
3542         if (!domain)
3543                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3544
3545
3546         return domain;
3547 }
3548
3549 /* Check if the device needs to go through the non-identity map and unmap process. */
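/*
 * Returns 1 when DMA for this device bypasses the IOMMU (dummy device or an
 * identity mapping that is kept), 0 when it must use the remapping path.  As
 * a side effect the device may be moved into or out of si_domain depending
 * on whether it can still be identity mapped.
 */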
3550 static int iommu_no_mapping(struct device *dev)
3551 {
3552         int found;
3553
3554         if (iommu_dummy(dev))
3555                 return 1;
3556
3557         if (!iommu_identity_mapping)
3558                 return 0;
3559
3560         found = identity_mapping(dev);
3561         if (found) {
3562                 if (iommu_should_identity_map(dev, 0))
3563                         return 1;
3564                 else {
3565                         /*
3566                          * A 32-bit DMA device is removed from si_domain and falls
3567                          * back to non-identity mapping.
3568                          */
3569                         dmar_remove_one_dev_info(si_domain, dev);
3570                         pr_info("32bit %s uses non-identity mapping\n",
3571                                 dev_name(dev));
3572                         return 0;
3573                 }
3574         } else {
3575                 /*
3576                          * When a 64-bit DMA device is detached from a VM, the device
3577                          * is put back into si_domain for identity mapping.
3578                  */
3579                 if (iommu_should_identity_map(dev, 0)) {
3580                         int ret;
3581                         ret = domain_add_dev_info(si_domain, dev);
3582                         if (!ret) {
3583                                 pr_info("64bit %s uses identity mapping\n",
3584                                         dev_name(dev));
3585                                 return 1;
3586                         }
3587                 }
3588         }
3589
3590         return 0;
3591 }
3592
3593 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3594                                      size_t size, int dir, u64 dma_mask)
3595 {
3596         struct dmar_domain *domain;
3597         phys_addr_t start_paddr;
3598         unsigned long iova_pfn;
3599         int prot = 0;
3600         int ret;
3601         struct intel_iommu *iommu;
3602         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3603
3604         BUG_ON(dir == DMA_NONE);
3605
3606         if (iommu_no_mapping(dev))
3607                 return paddr;
3608
3609         domain = get_valid_domain_for_dev(dev);
3610         if (!domain)
3611                 return 0;
3612
3613         iommu = domain_get_iommu(domain);
3614         size = aligned_nrpages(paddr, size);
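        /* From here on 'size' is a count of VT-d pages, not bytes. */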
3615
3616         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3617         if (!iova_pfn)
3618                 goto error;
3619
3620         /*
3621          * Check if DMAR supports zero-length reads on write-only
3622          * mappings.
3623          */
3624         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3625                         !cap_zlr(iommu->cap))
3626                 prot |= DMA_PTE_READ;
3627         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3628                 prot |= DMA_PTE_WRITE;
3629         /*
3630          * The range paddr .. paddr + size might cover a partial page, so we
3631          * should map the whole page.  Note: if two parts of one page are
3632          * separately mapped, we might have two guest addresses mapping to
3633          * the same host paddr, but this is not a big problem.
3634          */
3635         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3636                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3637         if (ret)
3638                 goto error;
3639
3640         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3641         start_paddr += paddr & ~PAGE_MASK;
3642         return start_paddr;
3643
3644 error:
3645         if (iova_pfn)
3646                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3647         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3648                 dev_name(dev), size, (unsigned long long)paddr, dir);
3649         return 0;
3650 }
3651
3652 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3653                                  unsigned long offset, size_t size,
3654                                  enum dma_data_direction dir,
3655                                  unsigned long attrs)
3656 {
3657         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3658                                   dir, *dev->dma_mask);
3659 }
3660
3661 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3662 {
3663         struct dmar_domain *domain;
3664         unsigned long start_pfn, last_pfn;
3665         unsigned long nrpages;
3666         unsigned long iova_pfn;
3667         struct intel_iommu *iommu;
3668         struct page *freelist;
3669
3670         if (iommu_no_mapping(dev))
3671                 return;
3672
3673         domain = find_domain(dev);
3674         BUG_ON(!domain);
3675
3676         iommu = domain_get_iommu(domain);
3677
3678         iova_pfn = IOVA_PFN(dev_addr);
3679
3680         nrpages = aligned_nrpages(dev_addr, size);
3681         start_pfn = mm_to_dma_pfn(iova_pfn);
3682         last_pfn = start_pfn + nrpages - 1;
3683
3684         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3685                  dev_name(dev), start_pfn, last_pfn);
3686
3687         freelist = domain_unmap(domain, start_pfn, last_pfn);
3688
3689         if (intel_iommu_strict) {
3690                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3691                                       nrpages, !freelist, 0);
3692                 /* free iova */
3693                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3694                 dma_free_pagelist(freelist);
3695         } else {
3696                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3697                            (unsigned long)freelist);
3698                 /*
3699                  * Queue up the release of the unmap to save the roughly 1/6th
3700                  * of the CPU time used up by the iotlb flush operation...
3701                  */
3702         }
3703 }
3704
3705 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3706                              size_t size, enum dma_data_direction dir,
3707                              unsigned long attrs)
3708 {
3709         intel_unmap(dev, dev_addr, size);
3710 }
3711
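/*
 * Coherent allocations come from the generic direct-mapping allocator and are
 * then mapped through the IOMMU with __intel_map_single(), unless
 * iommu_no_mapping() says the device bypasses translation.
 */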
3712 static void *intel_alloc_coherent(struct device *dev, size_t size,
3713                                   dma_addr_t *dma_handle, gfp_t flags,
3714                                   unsigned long attrs)
3715 {
3716         void *vaddr;
3717
3718         vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3719         if (iommu_no_mapping(dev) || !vaddr)
3720                 return vaddr;
3721
3722         *dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3723                         PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3724                         dev->coherent_dma_mask);
3725         if (!*dma_handle)
3726                 goto out_free_pages;
3727         return vaddr;
3728
3729 out_free_pages:
3730         dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3731         return NULL;
3732 }
3733
3734 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3735                                 dma_addr_t dma_handle, unsigned long attrs)
3736 {
3737         if (!iommu_no_mapping(dev))
3738                 intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3739         dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3740 }
3741
3742 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3743                            int nelems, enum dma_data_direction dir,
3744                            unsigned long attrs)
3745 {
3746         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3747         unsigned long nrpages = 0;
3748         struct scatterlist *sg;
3749         int i;
3750
3751         for_each_sg(sglist, sg, nelems, i) {
3752                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3753         }
3754
3755         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3756 }
3757
3758 static int intel_nontranslate_map_sg(struct device *hddev,
3759         struct scatterlist *sglist, int nelems, int dir)
3760 {
3761         int i;
3762         struct scatterlist *sg;
3763
3764         for_each_sg(sglist, sg, nelems, i) {
3765                 BUG_ON(!sg_page(sg));
3766                 sg->dma_address = sg_phys(sg);
3767                 sg->dma_length = sg->length;
3768         }
3769         return nelems;
3770 }
3771
3772 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3773                         enum dma_data_direction dir, unsigned long attrs)
3774 {
3775         int i;
3776         struct dmar_domain *domain;
3777         size_t size = 0;
3778         int prot = 0;
3779         unsigned long iova_pfn;
3780         int ret;
3781         struct scatterlist *sg;
3782         unsigned long start_vpfn;
3783         struct intel_iommu *iommu;
3784
3785         BUG_ON(dir == DMA_NONE);
3786         if (iommu_no_mapping(dev))
3787                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3788
3789         domain = get_valid_domain_for_dev(dev);
3790         if (!domain)
3791                 return 0;
3792
3793         iommu = domain_get_iommu(domain);
3794
3795         for_each_sg(sglist, sg, nelems, i)
3796                 size += aligned_nrpages(sg->offset, sg->length);
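        /*
         * 'size' is the total number of VT-d pages needed to map the whole
         * scatterlist into one contiguous IOVA range.
         */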
3797
3798         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3799                                 *dev->dma_mask);
3800         if (!iova_pfn) {
3801                 sglist->dma_length = 0;
3802                 return 0;
3803         }
3804
3805         /*
3806          * Check if DMAR supports zero-length reads on write-only
3807          * mappings.
3808          */
3809         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3810                         !cap_zlr(iommu->cap))
3811                 prot |= DMA_PTE_READ;
3812         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3813                 prot |= DMA_PTE_WRITE;
3814
3815         start_vpfn = mm_to_dma_pfn(iova_pfn);
3816
3817         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3818         if (unlikely(ret)) {
3819                 dma_pte_free_pagetable(domain, start_vpfn,
3820                                        start_vpfn + size - 1,
3821                                        agaw_to_level(domain->agaw) + 1);
3822                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3823                 return 0;
3824         }
3825
3826         return nelems;
3827 }
3828
3829 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3830 {
3831         return !dma_addr;
3832 }
3833
3834 const struct dma_map_ops intel_dma_ops = {
3835         .alloc = intel_alloc_coherent,
3836         .free = intel_free_coherent,
3837         .map_sg = intel_map_sg,
3838         .unmap_sg = intel_unmap_sg,
3839         .map_page = intel_map_page,
3840         .unmap_page = intel_unmap_page,
3841         .mapping_error = intel_mapping_error,
3842 #ifdef CONFIG_X86
3843         .dma_supported = dma_direct_supported,
3844 #endif
3845 };
3846
3847 static inline int iommu_domain_cache_init(void)
3848 {
3849         int ret = 0;
3850
3851         iommu_domain_cache = kmem_cache_create("iommu_domain",
3852                                          sizeof(struct dmar_domain),
3853                                          0,
3854                                          SLAB_HWCACHE_ALIGN,
3856                                          NULL);
3857         if (!iommu_domain_cache) {
3858                 pr_err("Couldn't create iommu_domain cache\n");
3859                 ret = -ENOMEM;
3860         }
3861
3862         return ret;
3863 }
3864
3865 static inline int iommu_devinfo_cache_init(void)
3866 {
3867         int ret = 0;
3868
3869         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3870                                          sizeof(struct device_domain_info),
3871                                          0,
3872                                          SLAB_HWCACHE_ALIGN,
3873                                          NULL);
3874         if (!iommu_devinfo_cache) {
3875                 pr_err("Couldn't create devinfo cache\n");
3876                 ret = -ENOMEM;
3877         }
3878
3879         return ret;
3880 }
3881
3882 static int __init iommu_init_mempool(void)
3883 {
3884         int ret;
3885         ret = iova_cache_get();
3886         if (ret)
3887                 return ret;
3888
3889         ret = iommu_domain_cache_init();
3890         if (ret)
3891                 goto domain_error;
3892
3893         ret = iommu_devinfo_cache_init();
3894         if (!ret)
3895                 return ret;
3896
3897         kmem_cache_destroy(iommu_domain_cache);
3898 domain_error:
3899         iova_cache_put();
3900
3901         return -ENOMEM;
3902 }
3903
3904 static void __init iommu_exit_mempool(void)
3905 {
3906         kmem_cache_destroy(iommu_devinfo_cache);
3907         kmem_cache_destroy(iommu_domain_cache);
3908         iova_cache_put();
3909 }
3910
3911 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3912 {
3913         struct dmar_drhd_unit *drhd;
3914         u32 vtbar;
3915         int rc;
3916
3917         /* We know that this device on this chipset has its own IOMMU.
3918          * If we find it under a different IOMMU, then the BIOS is lying
3919          * to us. Hope that the IOMMU for this device is actually
3920          * disabled, and it needs no translation...
3921          */
3922         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3923         if (rc) {
3924                 /* "can't" happen */
3925                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3926                 return;
3927         }
3928         vtbar &= 0xffff0000;
3929
3930         /* We know that this IOMMU should be at offset 0xa000 from vtbar */
3931         drhd = dmar_find_matched_drhd_unit(pdev);
3932         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3933                             TAINT_FIRMWARE_WORKAROUND,
3934                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3935                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3936 }
3937 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3938
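/*
 * Mark DRHD units that can be ignored: units whose device scope turns out to
 * be empty, and (when dmar_map_gfx is clear) units that cover only graphics
 * devices.  Devices behind an ignored unit get the dummy domain so DMA for
 * them bypasses translation entirely.
 */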
3939 static void __init init_no_remapping_devices(void)
3940 {
3941         struct dmar_drhd_unit *drhd;
3942         struct device *dev;
3943         int i;
3944
3945         for_each_drhd_unit(drhd) {
3946                 if (!drhd->include_all) {
3947                         for_each_active_dev_scope(drhd->devices,
3948                                                   drhd->devices_cnt, i, dev)
3949                                 break;
3950                         /* ignore DMAR unit if no devices exist */
3951                         if (i == drhd->devices_cnt)
3952                                 drhd->ignored = 1;
3953                 }
3954         }
3955
3956         for_each_active_drhd_unit(drhd) {
3957                 if (drhd->include_all)
3958                         continue;
3959
3960                 for_each_active_dev_scope(drhd->devices,
3961                                           drhd->devices_cnt, i, dev)
3962                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3963                                 break;
3964                 if (i < drhd->devices_cnt)
3965                         continue;
3966
3967                 /* This IOMMU has *only* gfx devices. Either bypass it or
3968                    set the gfx_mapped flag, as appropriate */
3969                 if (dmar_map_gfx) {
3970                         intel_iommu_gfx_mapped = 1;
3971                 } else {
3972                         drhd->ignored = 1;
3973                         for_each_active_dev_scope(drhd->devices,
3974                                                   drhd->devices_cnt, i, dev)
3975                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3976                 }
3977         }
3978 }
3979
3980 #ifdef CONFIG_SUSPEND
3981 static int init_iommu_hw(void)
3982 {
3983         struct dmar_drhd_unit *drhd;
3984         struct intel_iommu *iommu = NULL;
3985
3986         for_each_active_iommu(iommu, drhd)
3987                 if (iommu->qi)
3988                         dmar_reenable_qi(iommu);
3989
3990         for_each_iommu(iommu, drhd) {
3991                 if (drhd->ignored) {
3992                         /*
3993                          * we always have to disable PMRs or DMA may fail on
3994                          * this device
3995                          */
3996                         if (force_on)
3997                                 iommu_disable_protect_mem_regions(iommu);
3998                         continue;
3999                 }
4000
4001                 iommu_flush_write_buffer(iommu);
4002
4003                 iommu_set_root_entry(iommu);
4004
4005                 iommu->flush.flush_context(iommu, 0, 0, 0,
4006                                            DMA_CCMD_GLOBAL_INVL);
4007                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4008                 iommu_enable_translation(iommu);
4009                 iommu_disable_protect_mem_regions(iommu);
4010         }
4011
4012         return 0;
4013 }
4014
4015 static void iommu_flush_all(void)
4016 {
4017         struct dmar_drhd_unit *drhd;
4018         struct intel_iommu *iommu;
4019
4020         for_each_active_iommu(iommu, drhd) {
4021                 iommu->flush.flush_context(iommu, 0, 0, 0,
4022                                            DMA_CCMD_GLOBAL_INVL);
4023                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4024                                          DMA_TLB_GLOBAL_FLUSH);
4025         }
4026 }
4027
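/*
 * On suspend, save the fault-event control/data/address registers of every
 * active IOMMU and disable translation; iommu_resume() re-programs the
 * hardware via init_iommu_hw() and then restores these registers.
 */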
4028 static int iommu_suspend(void)
4029 {
4030         struct dmar_drhd_unit *drhd;
4031         struct intel_iommu *iommu = NULL;
4032         unsigned long flag;
4033
4034         for_each_active_iommu(iommu, drhd) {
4035                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4036                                                  GFP_ATOMIC);
4037                 if (!iommu->iommu_state)
4038                         goto nomem;
4039         }
4040
4041         iommu_flush_all();
4042
4043         for_each_active_iommu(iommu, drhd) {
4044                 iommu_disable_translation(iommu);
4045
4046                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4047
4048                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4049                         readl(iommu->reg + DMAR_FECTL_REG);
4050                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4051                         readl(iommu->reg + DMAR_FEDATA_REG);
4052                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4053                         readl(iommu->reg + DMAR_FEADDR_REG);
4054                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4055                         readl(iommu->reg + DMAR_FEUADDR_REG);
4056
4057                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4058         }
4059         return 0;
4060
4061 nomem:
4062         for_each_active_iommu(iommu, drhd)
4063                 kfree(iommu->iommu_state);
4064
4065         return -ENOMEM;
4066 }
4067
4068 static void iommu_resume(void)
4069 {
4070         struct dmar_drhd_unit *drhd;
4071         struct intel_iommu *iommu = NULL;
4072         unsigned long flag;
4073
4074         if (init_iommu_hw()) {
4075                 if (force_on)
4076                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4077                 else
4078                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4079                 return;
4080         }
4081
4082         for_each_active_iommu(iommu, drhd) {
4083
4084                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4085
4086                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4087                         iommu->reg + DMAR_FECTL_REG);
4088                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4089                         iommu->reg + DMAR_FEDATA_REG);
4090                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4091                         iommu->reg + DMAR_FEADDR_REG);
4092                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4093                         iommu->reg + DMAR_FEUADDR_REG);
4094
4095                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4096         }
4097
4098         for_each_active_iommu(iommu, drhd)
4099                 kfree(iommu->iommu_state);
4100 }
4101
4102 static struct syscore_ops iommu_syscore_ops = {
4103         .resume         = iommu_resume,
4104         .suspend        = iommu_suspend,
4105 };
4106
4107 static void __init init_iommu_pm_ops(void)
4108 {
4109         register_syscore_ops(&iommu_syscore_ops);
4110 }
4111
4112 #else
4113 static inline void init_iommu_pm_ops(void) {}
4114 #endif  /* CONFIG_SUSPEND */
4115
4116
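/*
 * Parse one ACPI DMAR RMRR structure: record its address range and device
 * scope, and register a direct-mapped reserved region so the range stays
 * identity-mapped for the affected devices.
 */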
4117 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4118 {
4119         struct acpi_dmar_reserved_memory *rmrr;
4120         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4121         struct dmar_rmrr_unit *rmrru;
4122         size_t length;
4123
4124         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4125         if (!rmrru)
4126                 goto out;
4127
4128         rmrru->hdr = header;
4129         rmrr = (struct acpi_dmar_reserved_memory *)header;
4130         rmrru->base_address = rmrr->base_address;
4131         rmrru->end_address = rmrr->end_address;
4132
4133         length = rmrr->end_address - rmrr->base_address + 1;
4134         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4135                                               IOMMU_RESV_DIRECT);
4136         if (!rmrru->resv)
4137                 goto free_rmrru;
4138
4139         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4140                                 ((void *)rmrr) + rmrr->header.length,
4141                                 &rmrru->devices_cnt);
4142         if (rmrru->devices_cnt && rmrru->devices == NULL)
4143                 goto free_all;
4144
4145         list_add(&rmrru->list, &dmar_rmrr_units);
4146
4147         return 0;
4148 free_all:
4149         kfree(rmrru->resv);
4150 free_rmrru:
4151         kfree(rmrru);
4152 out:
4153         return -ENOMEM;
4154 }
4155
4156 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4157 {
4158         struct dmar_atsr_unit *atsru;
4159         struct acpi_dmar_atsr *tmp;
4160
4161         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4162                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4163                 if (atsr->segment != tmp->segment)
4164                         continue;
4165                 if (atsr->header.length != tmp->header.length)
4166                         continue;
4167                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4168                         return atsru;
4169         }
4170
4171         return NULL;
4172 }
4173
4174 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4175 {
4176         struct acpi_dmar_atsr *atsr;
4177         struct dmar_atsr_unit *atsru;
4178
4179         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4180                 return 0;
4181
4182         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4183         atsru = dmar_find_atsr(atsr);
4184         if (atsru)
4185                 return 0;
4186
4187         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4188         if (!atsru)
4189                 return -ENOMEM;
4190
4191         /*
4192          * If memory is allocated from slab by ACPI _DSM method, we need to
4193          * copy the memory content because the memory buffer will be freed
4194          * on return.
4195          */
4196         atsru->hdr = (void *)(atsru + 1);
4197         memcpy(atsru->hdr, hdr, hdr->length);
4198         atsru->include_all = atsr->flags & 0x1;
4199         if (!atsru->include_all) {
4200                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4201                                 (void *)atsr + atsr->header.length,
4202                                 &atsru->devices_cnt);
4203                 if (atsru->devices_cnt && atsru->devices == NULL) {
4204                         kfree(atsru);
4205                         return -ENOMEM;
4206                 }
4207         }
4208
4209         list_add_rcu(&atsru->list, &dmar_atsr_units);
4210
4211         return 0;
4212 }
4213
4214 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4215 {
4216         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4217         kfree(atsru);
4218 }
4219
4220 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4221 {
4222         struct acpi_dmar_atsr *atsr;
4223         struct dmar_atsr_unit *atsru;
4224
4225         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4226         atsru = dmar_find_atsr(atsr);
4227         if (atsru) {
4228                 list_del_rcu(&atsru->list);
4229                 synchronize_rcu();
4230                 intel_iommu_free_atsr(atsru);
4231         }
4232
4233         return 0;
4234 }
4235
4236 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4237 {
4238         int i;
4239         struct device *dev;
4240         struct acpi_dmar_atsr *atsr;
4241         struct dmar_atsr_unit *atsru;
4242
4243         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4244         atsru = dmar_find_atsr(atsr);
4245         if (!atsru)
4246                 return 0;
4247
4248         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4249                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4250                                           i, dev)
4251                         return -EBUSY;
4252         }
4253
4254         return 0;
4255 }
4256
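/*
 * Bring up a hot-added DMAR unit: reject it if it lacks capabilities
 * (pass-through, snooping, superpage sizes) that the running configuration
 * already relies on, then initialize it the same way boot-time units are.
 */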
4257 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4258 {
4259         int sp, ret = 0;
4260         struct intel_iommu *iommu = dmaru->iommu;
4261
4262         if (g_iommus[iommu->seq_id])
4263                 return 0;
4264
4265         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4266                 pr_warn("%s: Doesn't support hardware pass through.\n",
4267                         iommu->name);
4268                 return -ENXIO;
4269         }
4270         if (!ecap_sc_support(iommu->ecap) &&
4271             domain_update_iommu_snooping(iommu)) {
4272                 pr_warn("%s: Doesn't support snooping.\n",
4273                         iommu->name);
4274                 return -ENXIO;
4275         }
4276         sp = domain_update_iommu_superpage(iommu) - 1;
4277         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4278                 pr_warn("%s: Doesn't support large page.\n",
4279                         iommu->name);
4280                 return -ENXIO;
4281         }
4282
4283         /*
4284          * Disable translation if already enabled prior to OS handover.
4285          */
4286         if (iommu->gcmd & DMA_GCMD_TE)
4287                 iommu_disable_translation(iommu);
4288
4289         g_iommus[iommu->seq_id] = iommu;
4290         ret = iommu_init_domains(iommu);
4291         if (ret == 0)
4292                 ret = iommu_alloc_root_entry(iommu);
4293         if (ret)
4294                 goto out;
4295
4296 #ifdef CONFIG_INTEL_IOMMU_SVM
4297         if (pasid_enabled(iommu))
4298                 intel_svm_alloc_pasid_tables(iommu);
4299 #endif
4300
4301         if (dmaru->ignored) {
4302                 /*
4303                  * we always have to disable PMRs or DMA may fail on this device
4304                  */
4305                 if (force_on)
4306                         iommu_disable_protect_mem_regions(iommu);
4307                 return 0;
4308         }
4309
4310         intel_iommu_init_qi(iommu);
4311         iommu_flush_write_buffer(iommu);
4312
4313 #ifdef CONFIG_INTEL_IOMMU_SVM
4314         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4315                 ret = intel_svm_enable_prq(iommu);
4316                 if (ret)
4317                         goto disable_iommu;
4318         }
4319 #endif
4320         ret = dmar_set_interrupt(iommu);
4321         if (ret)
4322                 goto disable_iommu;
4323
4324         iommu_set_root_entry(iommu);
4325         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4326         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4327         iommu_enable_translation(iommu);
4328
4329         iommu_disable_protect_mem_regions(iommu);
4330         return 0;
4331
4332 disable_iommu:
4333         disable_dmar_iommu(iommu);
4334 out:
4335         free_dmar_iommu(iommu);
4336         return ret;
4337 }
4338
4339 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4340 {
4341         int ret = 0;
4342         struct intel_iommu *iommu = dmaru->iommu;
4343
4344         if (!intel_iommu_enabled)
4345                 return 0;
4346         if (iommu == NULL)
4347                 return -EINVAL;
4348
4349         if (insert) {
4350                 ret = intel_iommu_add(dmaru);
4351         } else {
4352                 disable_dmar_iommu(iommu);
4353                 free_dmar_iommu(iommu);
4354         }
4355
4356         return ret;
4357 }
4358
4359 static void intel_iommu_free_dmars(void)
4360 {
4361         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4362         struct dmar_atsr_unit *atsru, *atsr_n;
4363
4364         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4365                 list_del(&rmrru->list);
4366                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4367                 kfree(rmrru->resv);
4368                 kfree(rmrru);
4369         }
4370
4371         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4372                 list_del(&atsru->list);
4373                 intel_iommu_free_atsr(atsru);
4374         }
4375 }
4376
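/*
 * Decide whether ATS may be used for a device: walk up to its root port and
 * check whether an ATSR entry (or an include-all ATSR) in the device's PCI
 * segment covers that port.
 */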
4377 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4378 {
4379         int i, ret = 1;
4380         struct pci_bus *bus;
4381         struct pci_dev *bridge = NULL;
4382         struct device *tmp;
4383         struct acpi_dmar_atsr *atsr;
4384         struct dmar_atsr_unit *atsru;
4385
4386         dev = pci_physfn(dev);
4387         for (bus = dev->bus; bus; bus = bus->parent) {
4388                 bridge = bus->self;
4389                 /* If it's an integrated device, allow ATS */
4390                 if (!bridge)
4391                         return 1;
4392                 /* Connected via non-PCIe: no ATS */
4393                 if (!pci_is_pcie(bridge) ||
4394                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4395                         return 0;
4396                 /* If we found the root port, look it up in the ATSR */
4397                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4398                         break;
4399         }
4400
4401         rcu_read_lock();
4402         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4403                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4404                 if (atsr->segment != pci_domain_nr(dev->bus))
4405                         continue;
4406
4407                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4408                         if (tmp == &bridge->dev)
4409                                 goto out;
4410
4411                 if (atsru->include_all)
4412                         goto out;
4413         }
4414         ret = 0;
4415 out:
4416         rcu_read_unlock();
4417
4418         return ret;
4419 }
4420
4421 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4422 {
4423         int ret = 0;
4424         struct dmar_rmrr_unit *rmrru;
4425         struct dmar_atsr_unit *atsru;
4426         struct acpi_dmar_atsr *atsr;
4427         struct acpi_dmar_reserved_memory *rmrr;
4428
4429         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4430                 return 0;
4431
4432         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4433                 rmrr = container_of(rmrru->hdr,
4434                                     struct acpi_dmar_reserved_memory, header);
4435                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4436                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4437                                 ((void *)rmrr) + rmrr->header.length,
4438                                 rmrr->segment, rmrru->devices,
4439                                 rmrru->devices_cnt);
4440                         if (ret < 0)
4441                                 return ret;
4442                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4443                         dmar_remove_dev_scope(info, rmrr->segment,
4444                                 rmrru->devices, rmrru->devices_cnt);
4445                 }
4446         }
4447
4448         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4449                 if (atsru->include_all)
4450                         continue;
4451
4452                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4453                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4454                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4455                                         (void *)atsr + atsr->header.length,
4456                                         atsr->segment, atsru->devices,
4457                                         atsru->devices_cnt);
4458                         if (ret > 0)
4459                                 break;
4460                         else if (ret < 0)
4461                                 return ret;
4462                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4463                         if (dmar_remove_dev_scope(info, atsr->segment,
4464                                         atsru->devices, atsru->devices_cnt))
4465                                 break;
4466                 }
4467         }
4468
4469         return 0;
4470 }
4471
4472 /*
4473  * Here we only respond to a device being unbound from its driver.
4474  *
4475  * An added device is not attached to its DMAR domain here yet. That will
4476  * happen when the device is mapped to an iova.
4477  */
4478 static int device_notifier(struct notifier_block *nb,
4479                                   unsigned long action, void *data)
4480 {
4481         struct device *dev = data;
4482         struct dmar_domain *domain;
4483
4484         if (iommu_dummy(dev))
4485                 return 0;
4486
4487         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4488                 return 0;
4489
4490         domain = find_domain(dev);
4491         if (!domain)
4492                 return 0;
4493
4494         dmar_remove_one_dev_info(domain, dev);
4495         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4496                 domain_exit(domain);
4497
4498         return 0;
4499 }
4500
4501 static struct notifier_block device_nb = {
4502         .notifier_call = device_notifier,
4503 };
4504
4505 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4506                                        unsigned long val, void *v)
4507 {
4508         struct memory_notify *mhp = v;
4509         unsigned long long start, end;
4510         unsigned long start_vpfn, last_vpfn;
4511
4512         switch (val) {
4513         case MEM_GOING_ONLINE:
4514                 start = mhp->start_pfn << PAGE_SHIFT;
4515                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4516                 if (iommu_domain_identity_map(si_domain, start, end)) {
4517                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4518                                 start, end);
4519                         return NOTIFY_BAD;
4520                 }
4521                 break;
4522
4523         case MEM_OFFLINE:
4524         case MEM_CANCEL_ONLINE:
4525                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4526                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4527                 while (start_vpfn <= last_vpfn) {
4528                         struct iova *iova;
4529                         struct dmar_drhd_unit *drhd;
4530                         struct intel_iommu *iommu;
4531                         struct page *freelist;
4532
4533                         iova = find_iova(&si_domain->iovad, start_vpfn);
4534                         if (iova == NULL) {
4535                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4536                                          start_vpfn);
4537                                 break;
4538                         }
4539
4540                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4541                                                      start_vpfn, last_vpfn);
4542                         if (iova == NULL) {
4543                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4544                                         start_vpfn, last_vpfn);
4545                                 return NOTIFY_BAD;
4546                         }
4547
4548                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4549                                                iova->pfn_hi);
4550
4551                         rcu_read_lock();
4552                         for_each_active_iommu(iommu, drhd)
4553                                 iommu_flush_iotlb_psi(iommu, si_domain,
4554                                         iova->pfn_lo, iova_size(iova),
4555                                         !freelist, 0);
4556                         rcu_read_unlock();
4557                         dma_free_pagelist(freelist);
4558
4559                         start_vpfn = iova->pfn_hi + 1;
4560                         free_iova_mem(iova);
4561                 }
4562                 break;
4563         }
4564
4565         return NOTIFY_OK;
4566 }
4567
4568 static struct notifier_block intel_iommu_memory_nb = {
4569         .notifier_call = intel_iommu_memory_notifier,
4570         .priority = 0
4571 };
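/*
 * Illustrative note (editor's sketch, not part of the driver): this notifier
 * is driven by the core memory-hotplug code.  On a typical system it can be
 * exercised simply by offlining and re-onlining a memory block from
 * userspace, e.g.:
 *
 *	# echo offline > /sys/devices/system/memory/memory32/state
 *	# echo online  > /sys/devices/system/memory/memory32/state
 *
 * (the block number is arbitrary here).  MEM_GOING_ONLINE extends the
 * si_domain identity map; MEM_OFFLINE/MEM_CANCEL_ONLINE tear it down again.
 */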
4572
4573 static void free_all_cpu_cached_iovas(unsigned int cpu)
4574 {
4575         int i;
4576
4577         for (i = 0; i < g_num_of_iommus; i++) {
4578                 struct intel_iommu *iommu = g_iommus[i];
4579                 struct dmar_domain *domain;
4580                 int did;
4581
4582                 if (!iommu)
4583                         continue;
4584
4585                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4586                         domain = get_iommu_domain(iommu, (u16)did);
4587
4588                         if (!domain)
4589                                 continue;
4590                         free_cpu_cached_iovas(cpu, &domain->iovad);
4591                 }
4592         }
4593 }
4594
4595 static int intel_iommu_cpu_dead(unsigned int cpu)
4596 {
4597         free_all_cpu_cached_iovas(cpu);
4598         return 0;
4599 }
4600
4601 static void intel_disable_iommus(void)
4602 {
4603         struct intel_iommu *iommu = NULL;
4604         struct dmar_drhd_unit *drhd;
4605
4606         for_each_iommu(iommu, drhd)
4607                 iommu_disable_translation(iommu);
4608 }
4609
4610 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4611 {
4612         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4613
4614         return container_of(iommu_dev, struct intel_iommu, iommu);
4615 }
4616
4617 static ssize_t intel_iommu_show_version(struct device *dev,
4618                                         struct device_attribute *attr,
4619                                         char *buf)
4620 {
4621         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4622         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4623         return sprintf(buf, "%d:%d\n",
4624                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4625 }
4626 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4627
4628 static ssize_t intel_iommu_show_address(struct device *dev,
4629                                         struct device_attribute *attr,
4630                                         char *buf)
4631 {
4632         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4633         return sprintf(buf, "%llx\n", iommu->reg_phys);
4634 }
4635 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4636
4637 static ssize_t intel_iommu_show_cap(struct device *dev,
4638                                     struct device_attribute *attr,
4639                                     char *buf)
4640 {
4641         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4642         return sprintf(buf, "%llx\n", iommu->cap);
4643 }
4644 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4645
4646 static ssize_t intel_iommu_show_ecap(struct device *dev,
4647                                     struct device_attribute *attr,
4648                                     char *buf)
4649 {
4650         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4651         return sprintf(buf, "%llx\n", iommu->ecap);
4652 }
4653 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4654
4655 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4656                                       struct device_attribute *attr,
4657                                       char *buf)
4658 {
4659         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4660         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4661 }
4662 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4663
4664 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4665                                            struct device_attribute *attr,
4666                                            char *buf)
4667 {
4668         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4669         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4670                                                   cap_ndoms(iommu->cap)));
4671 }
4672 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4673
4674 static struct attribute *intel_iommu_attrs[] = {
4675         &dev_attr_version.attr,
4676         &dev_attr_address.attr,
4677         &dev_attr_cap.attr,
4678         &dev_attr_ecap.attr,
4679         &dev_attr_domains_supported.attr,
4680         &dev_attr_domains_used.attr,
4681         NULL,
4682 };
4683
4684 static struct attribute_group intel_iommu_group = {
4685         .name = "intel-iommu",
4686         .attrs = intel_iommu_attrs,
4687 };
4688
4689 const struct attribute_group *intel_iommu_groups[] = {
4690         &intel_iommu_group,
4691         NULL,
4692 };
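/*
 * Illustrative note (editor's sketch, not part of the driver): the group
 * above is registered per IOMMU via iommu_device_sysfs_add() in
 * intel_iommu_init() below, so on a typical system the attributes show up
 * roughly as:
 *
 *	/sys/class/iommu/dmar0/intel-iommu/version
 *	/sys/class/iommu/dmar0/intel-iommu/address
 *	/sys/class/iommu/dmar0/intel-iommu/cap
 *	/sys/class/iommu/dmar0/intel-iommu/ecap
 *	/sys/class/iommu/dmar0/intel-iommu/domains_supported
 *	/sys/class/iommu/dmar0/intel-iommu/domains_used
 *
 * "dmar0" is just the name of the first unit; one directory exists per
 * DRHD/IOMMU.
 */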
4693
4694 int __init intel_iommu_init(void)
4695 {
4696         int ret = -ENODEV;
4697         struct dmar_drhd_unit *drhd;
4698         struct intel_iommu *iommu;
4699
4700         /* VT-d is required for a TXT/tboot launch, so enforce that */
4701         force_on = tboot_force_iommu();
4702
4703         if (iommu_init_mempool()) {
4704                 if (force_on)
4705                         panic("tboot: Failed to initialize iommu memory\n");
4706                 return -ENOMEM;
4707         }
4708
4709         down_write(&dmar_global_lock);
4710         if (dmar_table_init()) {
4711                 if (force_on)
4712                         panic("tboot: Failed to initialize DMAR table\n");
4713                 goto out_free_dmar;
4714         }
4715
4716         if (dmar_dev_scope_init() < 0) {
4717                 if (force_on)
4718                         panic("tboot: Failed to initialize DMAR device scope\n");
4719                 goto out_free_dmar;
4720         }
4721
4722         up_write(&dmar_global_lock);
4723
4724         /*
4725          * The bus notifier takes dmar_global_lock itself, so lockdep would
4726          * complain if we registered it while still holding the lock.
4727          */
4728         dmar_register_bus_notifier();
4729
4730         down_write(&dmar_global_lock);
4731
4732         if (no_iommu || dmar_disabled) {
4733                 /*
4734                  * We exit the function here to ensure the IOMMU's remapping
4735                  * and mempool aren't set up, which means the IOMMU's PMRs
4736                  * won't be disabled via the call to init_dmars(). So disable
4737                  * them explicitly here. The PMRs were set up by tboot prior
4738                  * to calling SENTER, but the kernel is expected to reset/tear
4739                  * down the PMRs.
4740                  */
4741                 if (intel_iommu_tboot_noforce) {
4742                         for_each_iommu(iommu, drhd)
4743                                 iommu_disable_protect_mem_regions(iommu);
4744                 }
4745
4746                 /*
4747                  * Make sure the IOMMUs are switched off, even when we
4748                  * boot into a kexec kernel and the previous kernel left
4749                  * them enabled
4750                  */
4751                 intel_disable_iommus();
4752                 goto out_free_dmar;
4753         }
4754
4755         if (list_empty(&dmar_rmrr_units))
4756                 pr_info("No RMRR found\n");
4757
4758         if (list_empty(&dmar_atsr_units))
4759                 pr_info("No ATSR found\n");
4760
4761         if (dmar_init_reserved_ranges()) {
4762                 if (force_on)
4763                         panic("tboot: Failed to reserve iommu ranges\n");
4764                 goto out_free_reserved_range;
4765         }
4766
4767         init_no_remapping_devices();
4768
4769         ret = init_dmars();
4770         if (ret) {
4771                 if (force_on)
4772                         panic("tboot: Failed to initialize DMARs\n");
4773                 pr_err("Initialization failed\n");
4774                 goto out_free_reserved_range;
4775         }
4776         up_write(&dmar_global_lock);
4777         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4778
4779 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4780         swiotlb = 0;
4781 #endif
4782         dma_ops = &intel_dma_ops;
4783
4784         init_iommu_pm_ops();
4785
4786         for_each_active_iommu(iommu, drhd) {
4787                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4788                                        intel_iommu_groups,
4789                                        "%s", iommu->name);
4790                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4791                 iommu_device_register(&iommu->iommu);
4792         }
4793
4794         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4795         bus_register_notifier(&pci_bus_type, &device_nb);
4796         if (si_domain && !hw_pass_through)
4797                 register_memory_notifier(&intel_iommu_memory_nb);
4798         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4799                           intel_iommu_cpu_dead);
4800         intel_iommu_enabled = 1;
4801
4802         return 0;
4803
4804 out_free_reserved_range:
4805         put_iova_domain(&reserved_iova_list);
4806 out_free_dmar:
4807         intel_iommu_free_dmars();
4808         up_write(&dmar_global_lock);
4809         iommu_exit_mempool();
4810         return ret;
4811 }
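/*
 * Illustrative note (editor's sketch, not part of the driver): the
 * dmar_disabled and intel_iommu_tboot_noforce flags tested above are
 * normally set from the "intel_iommu=" kernel command line option parsed
 * earlier in this file, while no_iommu typically comes from the arch-level
 * "iommu=off" option, e.g.:
 *
 *	intel_iommu=on              enable DMA remapping
 *	intel_iommu=off             set dmar_disabled and skip initialization
 *	intel_iommu=tboot_noforce   do not force the IOMMU on for a tboot launch
 *
 * See Documentation/admin-guide/kernel-parameters.txt for the full list.
 */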
4812
4813 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4814 {
4815         struct intel_iommu *iommu = opaque;
4816
4817         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4818         return 0;
4819 }
4820
4821 /*
4822  * NB - intel-iommu lacks any sort of reference counting for the users of
4823  * dependent devices.  If multiple endpoints have intersecting dependent
4824  * devices, unbinding the driver from any one of them will possibly leave
4825  * the others unable to operate.
4826  */
4827 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4828 {
4829         if (!iommu || !dev || !dev_is_pci(dev))
4830                 return;
4831
4832         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4833 }
4834
4835 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4836 {
4837         struct intel_iommu *iommu;
4838         unsigned long flags;
4839
4840         assert_spin_locked(&device_domain_lock);
4841
4842         if (WARN_ON(!info))
4843                 return;
4844
4845         iommu = info->iommu;
4846
4847         if (info->dev) {
4848                 iommu_disable_dev_iotlb(info);
4849                 domain_context_clear(iommu, info->dev);
4850         }
4851
4852         unlink_domain_info(info);
4853
4854         spin_lock_irqsave(&iommu->lock, flags);
4855         domain_detach_iommu(info->domain, iommu);
4856         spin_unlock_irqrestore(&iommu->lock, flags);
4857
4858         free_devinfo_mem(info);
4859 }
4860
4861 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4862                                      struct device *dev)
4863 {
4864         struct device_domain_info *info;
4865         unsigned long flags;
4866
4867         spin_lock_irqsave(&device_domain_lock, flags);
4868         info = dev->archdata.iommu;
4869         __dmar_remove_one_dev_info(info);
4870         spin_unlock_irqrestore(&device_domain_lock, flags);
4871 }
4872
4873 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4874 {
4875         int adjust_width;
4876
4877         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4878         domain_reserve_special_ranges(domain);
4879
4880         /* calculate AGAW */
4881         domain->gaw = guest_width;
4882         adjust_width = guestwidth_to_adjustwidth(guest_width);
4883         domain->agaw = width_to_agaw(adjust_width);
4884
4885         domain->iommu_coherency = 0;
4886         domain->iommu_snooping = 0;
4887         domain->iommu_superpage = 0;
4888         domain->max_addr = 0;
4889
4890         /* always allocate the top pgd */
4891         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4892         if (!domain->pgd)
4893                 return -ENOMEM;
4894         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4895         return 0;
4896 }
4897
4898 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4899 {
4900         struct dmar_domain *dmar_domain;
4901         struct iommu_domain *domain;
4902
4903         if (type != IOMMU_DOMAIN_UNMANAGED)
4904                 return NULL;
4905
4906         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4907         if (!dmar_domain) {
4908                 pr_err("Can't allocate dmar_domain\n");
4909                 return NULL;
4910         }
4911         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4912                 pr_err("Domain initialization failed\n");
4913                 domain_exit(dmar_domain);
4914                 return NULL;
4915         }
4916         domain_update_iommu_cap(dmar_domain);
4917
4918         domain = &dmar_domain->domain;
4919         domain->geometry.aperture_start = 0;
4920         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4921         domain->geometry.force_aperture = true;
4922
4923         return domain;
4924 }
4925
4926 static void intel_iommu_domain_free(struct iommu_domain *domain)
4927 {
4928         domain_exit(to_dmar_domain(domain));
4929 }
4930
4931 static int intel_iommu_attach_device(struct iommu_domain *domain,
4932                                      struct device *dev)
4933 {
4934         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4935         struct intel_iommu *iommu;
4936         int addr_width;
4937         u8 bus, devfn;
4938
4939         if (device_is_rmrr_locked(dev)) {
4940                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4941                 return -EPERM;
4942         }
4943
4944         /* normally dev is not mapped */
4945         if (unlikely(domain_context_mapped(dev))) {
4946                 struct dmar_domain *old_domain;
4947
4948                 old_domain = find_domain(dev);
4949                 if (old_domain) {
4950                         rcu_read_lock();
4951                         dmar_remove_one_dev_info(old_domain, dev);
4952                         rcu_read_unlock();
4953
4954                         if (!domain_type_is_vm_or_si(old_domain) &&
4955                              list_empty(&old_domain->devices))
4956                                 domain_exit(old_domain);
4957                 }
4958         }
4959
4960         iommu = device_to_iommu(dev, &bus, &devfn);
4961         if (!iommu)
4962                 return -ENODEV;
4963
4964         /* check if this iommu agaw is sufficient for max mapped address */
4965         addr_width = agaw_to_width(iommu->agaw);
4966         if (addr_width > cap_mgaw(iommu->cap))
4967                 addr_width = cap_mgaw(iommu->cap);
4968
4969         if (dmar_domain->max_addr > (1LL << addr_width)) {
4970                 pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
4971                        __func__, addr_width,
4972                        dmar_domain->max_addr);
4973                 return -EFAULT;
4974         }
4975         dmar_domain->gaw = addr_width;
4976
4977         /*
4978          * Knock out extra levels of page tables if necessary
4979          */
4980         while (iommu->agaw < dmar_domain->agaw) {
4981                 struct dma_pte *pte;
4982
4983                 pte = dmar_domain->pgd;
4984                 if (dma_pte_present(pte)) {
4985                         dmar_domain->pgd = (struct dma_pte *)
4986                                 phys_to_virt(dma_pte_addr(pte));
4987                         free_pgtable_page(pte);
4988                 }
4989                 dmar_domain->agaw--;
4990         }
4991
4992         return domain_add_dev_info(dmar_domain, dev);
4993 }
4994
4995 static void intel_iommu_detach_device(struct iommu_domain *domain,
4996                                       struct device *dev)
4997 {
4998         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
4999 }
5000
5001 static int intel_iommu_map(struct iommu_domain *domain,
5002                            unsigned long iova, phys_addr_t hpa,
5003                            size_t size, int iommu_prot)
5004 {
5005         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5006         u64 max_addr;
5007         int prot = 0;
5008         int ret;
5009
5010         if (iommu_prot & IOMMU_READ)
5011                 prot |= DMA_PTE_READ;
5012         if (iommu_prot & IOMMU_WRITE)
5013                 prot |= DMA_PTE_WRITE;
5014         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5015                 prot |= DMA_PTE_SNP;
5016
5017         max_addr = iova + size;
5018         if (dmar_domain->max_addr < max_addr) {
5019                 u64 end;
5020
5021                 /* check if minimum agaw is sufficient for mapped address */
5022                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5023                 if (end < max_addr) {
5024                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5025                                __func__, dmar_domain->gaw,
5026                                max_addr);
5027                         return -EFAULT;
5028                 }
5029                 dmar_domain->max_addr = max_addr;
5030         }
5031         /* Round size up to the next multiple of PAGE_SIZE if it, together
5032            with the low bits of hpa, would take us onto the next page */
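        /* Worked example (illustrative numbers, assuming 4KiB VT-d pages):
           hpa = 0x1000800 with size = 0x1000 straddles a page boundary, so
           aligned_nrpages() below returns 2 pages rather than 1. */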
5033         size = aligned_nrpages(hpa, size);
5034         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5035                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5036         return ret;
5037 }
5038
5039 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5040                                 unsigned long iova, size_t size)
5041 {
5042         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5043         struct page *freelist = NULL;
5044         unsigned long start_pfn, last_pfn;
5045         unsigned int npages;
5046         int iommu_id, level = 0;
5047
5048         /* Cope with horrid API which requires us to unmap more than the
5049            size argument if it happens to be a large-page mapping. */
5050         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5051
5052         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5053                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
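        /* Worked example (illustrative, assuming 4KiB VT-d pages): if the
           IOVA is covered by a 2MiB superpage, pfn_to_dma_pte() reports
           level 2 and the request is widened here to
           VTD_PAGE_SIZE << 9 == 2MiB, whatever 'size' the caller passed. */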
5054
5055         start_pfn = iova >> VTD_PAGE_SHIFT;
5056         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5057
5058         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5059
5060         npages = last_pfn - start_pfn + 1;
5061
5062         for_each_domain_iommu(iommu_id, dmar_domain)
5063                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5064                                       start_pfn, npages, !freelist, 0);
5065
5066         dma_free_pagelist(freelist);
5067
5068         if (dmar_domain->max_addr == iova + size)
5069                 dmar_domain->max_addr = iova;
5070
5071         return size;
5072 }
5073
5074 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5075                                             dma_addr_t iova)
5076 {
5077         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5078         struct dma_pte *pte;
5079         int level = 0;
5080         u64 phys = 0;
5081
5082         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5083         if (pte)
5084                 phys = dma_pte_addr(pte);
5085
5086         return phys;
5087 }
5088
5089 static bool intel_iommu_capable(enum iommu_cap cap)
5090 {
5091         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5092                 return domain_update_iommu_snooping(NULL) == 1;
5093         if (cap == IOMMU_CAP_INTR_REMAP)
5094                 return irq_remapping_enabled == 1;
5095
5096         return false;
5097 }
5098
5099 static int intel_iommu_add_device(struct device *dev)
5100 {
5101         struct intel_iommu *iommu;
5102         struct iommu_group *group;
5103         u8 bus, devfn;
5104
5105         iommu = device_to_iommu(dev, &bus, &devfn);
5106         if (!iommu)
5107                 return -ENODEV;
5108
5109         iommu_device_link(&iommu->iommu, dev);
5110
5111         group = iommu_group_get_for_dev(dev);
5112
5113         if (IS_ERR(group))
5114                 return PTR_ERR(group);
5115
5116         iommu_group_put(group);
5117         return 0;
5118 }
5119
5120 static void intel_iommu_remove_device(struct device *dev)
5121 {
5122         struct intel_iommu *iommu;
5123         u8 bus, devfn;
5124
5125         iommu = device_to_iommu(dev, &bus, &devfn);
5126         if (!iommu)
5127                 return;
5128
5129         iommu_group_remove_device(dev);
5130
5131         iommu_device_unlink(&iommu->iommu, dev);
5132 }
5133
5134 static void intel_iommu_get_resv_regions(struct device *device,
5135                                          struct list_head *head)
5136 {
5137         struct iommu_resv_region *reg;
5138         struct dmar_rmrr_unit *rmrr;
5139         struct device *i_dev;
5140         int i;
5141
5142         rcu_read_lock();
5143         for_each_rmrr_units(rmrr) {
5144                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5145                                           i, i_dev) {
5146                         if (i_dev != device)
5147                                 continue;
5148
5149                         list_add_tail(&rmrr->resv->list, head);
5150                 }
5151         }
5152         rcu_read_unlock();
5153
5154         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5155                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5156                                       0, IOMMU_RESV_MSI);
5157         if (!reg)
5158                 return;
5159         list_add_tail(&reg->list, head);
5160 }
5161
5162 static void intel_iommu_put_resv_regions(struct device *dev,
5163                                          struct list_head *head)
5164 {
5165         struct iommu_resv_region *entry, *next;
5166
5167         list_for_each_entry_safe(entry, next, head, list) {
5168                 if (entry->type == IOMMU_RESV_RESERVED)
5169                         kfree(entry);
5170         }
5171 }
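/*
 * Illustrative note (editor's sketch, not part of the driver): these two
 * callbacks are normally reached through the generic helpers in
 * drivers/iommu/iommu.c, e.g.:
 *
 *	LIST_HEAD(resv);
 *
 *	iommu_get_resv_regions(dev, &resv);
 *	// walk the reserved regions, e.g. to punch holes in an IOVA space
 *	iommu_put_resv_regions(dev, &resv);
 *
 * The same information is exported to userspace via
 * /sys/kernel/iommu_groups/<N>/reserved_regions on kernels of this vintage.
 */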
5172
5173 #ifdef CONFIG_INTEL_IOMMU_SVM
5174 #define MAX_NR_PASID_BITS (20)
5175 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5176 {
5177         /*
5178          * Convert ecap_pss to the extended context entry pts encoding; also
5179          * respect the soft pasid_max value set by the iommu.
5180          * - number of PASID bits = ecap_pss + 1
5181          * - number of PASID table entries = 2^(pts + 5)
5182          * Therefore, pts = ecap_pss - 4
5183          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5184          */
5185         if (ecap_pss(iommu->ecap) < 5)
5186                 return 0;
5187
5188         /* pasid_max is encoded as the actual number of entries, not as bits */
5189         return find_first_bit((unsigned long *)&iommu->pasid_max,
5190                         MAX_NR_PASID_BITS) - 5;
5191 }
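/*
 * Worked example (illustrative numbers): with iommu->pasid_max == 0x100000,
 * i.e. 2^20 PASID-table entries, find_first_bit() returns 20 and the
 * function yields pts = 15, which matches the KBL example in the comment
 * above (ecap_pss = 0x13 -> 20 PASID bits -> 2^20 entries -> pts = 15).
 */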
5192
5193 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5194 {
5195         struct device_domain_info *info;
5196         struct context_entry *context;
5197         struct dmar_domain *domain;
5198         unsigned long flags;
5199         u64 ctx_lo;
5200         int ret;
5201
5202         domain = get_valid_domain_for_dev(sdev->dev);
5203         if (!domain)
5204                 return -EINVAL;
5205
5206         spin_lock_irqsave(&device_domain_lock, flags);
5207         spin_lock(&iommu->lock);
5208
5209         ret = -EINVAL;
5210         info = sdev->dev->archdata.iommu;
5211         if (!info || !info->pasid_supported)
5212                 goto out;
5213
5214         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5215         if (WARN_ON(!context))
5216                 goto out;
5217
5218         ctx_lo = context[0].lo;
5219
5220         sdev->did = domain->iommu_did[iommu->seq_id];
5221         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5222
5223         if (!(ctx_lo & CONTEXT_PASIDE)) {
5224                 if (iommu->pasid_state_table)
5225                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5226                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5227                         intel_iommu_get_pts(iommu);
5228
5229                 wmb();
5230                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5231                  * extended to permit requests-with-PASID if the PASIDE bit
5232                  * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
5233                  * however, the PASIDE bit is ignored and requests-with-PASID
5234                  * are unconditionally blocked, which makes less sense.
5235                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5236                  * "guest mode" translation types depending on whether ATS
5237                  * is available or not. Annoyingly, we can't use the new
5238                  * modes *unless* PASIDE is set. */
5239                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5240                         ctx_lo &= ~CONTEXT_TT_MASK;
5241                         if (info->ats_supported)
5242                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5243                         else
5244                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5245                 }
5246                 ctx_lo |= CONTEXT_PASIDE;
5247                 if (iommu->pasid_state_table)
5248                         ctx_lo |= CONTEXT_DINVE;
5249                 if (info->pri_supported)
5250                         ctx_lo |= CONTEXT_PRS;
5251                 context[0].lo = ctx_lo;
5252                 wmb();
5253                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5254                                            DMA_CCMD_MASK_NOBIT,
5255                                            DMA_CCMD_DEVICE_INVL);
5256         }
5257
5258         /* Enable PASID support in the device, if it wasn't already */
5259         if (!info->pasid_enabled)
5260                 iommu_enable_dev_iotlb(info);
5261
5262         if (info->ats_enabled) {
5263                 sdev->dev_iotlb = 1;
5264                 sdev->qdep = info->ats_qdep;
5265                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5266                         sdev->qdep = 0;
5267         }
5268         ret = 0;
5269
5270  out:
5271         spin_unlock(&iommu->lock);
5272         spin_unlock_irqrestore(&device_domain_lock, flags);
5273
5274         return ret;
5275 }
5276
5277 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5278 {
5279         struct intel_iommu *iommu;
5280         u8 bus, devfn;
5281
5282         if (iommu_dummy(dev)) {
5283                 dev_warn(dev,
5284                          "No IOMMU translation for device; cannot enable SVM\n");
5285                 return NULL;
5286         }
5287
5288         iommu = device_to_iommu(dev, &bus, &devfn);
5289         if (!iommu) {
5290                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5291                 return NULL;
5292         }
5293
5294         if (!iommu->pasid_table) {
5295                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5296                 return NULL;
5297         }
5298
5299         return iommu;
5300 }
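/*
 * Illustrative note (editor's sketch; prototypes assumed from
 * <linux/intel-svm.h> of this era, not restated in this file): a device
 * driver wanting shared virtual memory would typically do something like:
 *
 *	int pasid, ret;
 *
 *	ret = intel_svm_bind_mm(dev, &pasid, 0, NULL);
 *	if (!ret) {
 *		... issue DMA tagged with 'pasid' ...
 *		intel_svm_unbind_mm(dev, pasid);
 *	}
 *
 * The bind path ends up in intel_iommu_enable_pasid() above to set up the
 * extended context entry for the device.
 */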
5301 #endif /* CONFIG_INTEL_IOMMU_SVM */
5302
5303 const struct iommu_ops intel_iommu_ops = {
5304         .capable                = intel_iommu_capable,
5305         .domain_alloc           = intel_iommu_domain_alloc,
5306         .domain_free            = intel_iommu_domain_free,
5307         .attach_dev             = intel_iommu_attach_device,
5308         .detach_dev             = intel_iommu_detach_device,
5309         .map                    = intel_iommu_map,
5310         .unmap                  = intel_iommu_unmap,
5311         .map_sg                 = default_iommu_map_sg,
5312         .iova_to_phys           = intel_iommu_iova_to_phys,
5313         .add_device             = intel_iommu_add_device,
5314         .remove_device          = intel_iommu_remove_device,
5315         .get_resv_regions       = intel_iommu_get_resv_regions,
5316         .put_resv_regions       = intel_iommu_put_resv_regions,
5317         .device_group           = pci_device_group,
5318         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5319 };
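/*
 * Illustrative note (editor's sketch, not part of the driver): the ops above
 * are exercised through the generic IOMMU API; a kernel user such as VFIO
 * would reach them roughly like this:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(&pci_bus_type);
 *
 *	if (dom && !iommu_attach_device(dom, &pdev->dev)) {
 *		iommu_map(dom, iova, phys, SZ_4K, IOMMU_READ | IOMMU_WRITE);
 *		...
 *		iommu_unmap(dom, iova, SZ_4K);
 *		iommu_detach_device(dom, &pdev->dev);
 *	}
 *	if (dom)
 *		iommu_domain_free(dom);
 *
 * iommu_domain_alloc() lands in intel_iommu_domain_alloc() with
 * IOMMU_DOMAIN_UNMANAGED, iommu_map() in intel_iommu_map(), and so on.
 */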
5320
5321 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5322 {
5323         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5324         pr_info("Disabling IOMMU for graphics on this chipset\n");
5325         dmar_map_gfx = 0;
5326 }
5327
5328 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5329 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5330 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5331 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5332 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5333 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5334 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5335
5336 static void quirk_iommu_rwbf(struct pci_dev *dev)
5337 {
5338         /*
5339          * Mobile 4 Series Chipset neglects to set RWBF capability,
5340          * but needs it. Same seems to hold for the desktop versions.
5341          */
5342         pr_info("Forcing write-buffer flush capability\n");
5343         rwbf_quirk = 1;
5344 }
5345
5346 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5347 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5348 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5349 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5350 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5351 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5352 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5353
5354 #define GGC 0x52
5355 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5356 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5357 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5358 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5359 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5360 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5361 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5362 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5363
5364 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5365 {
5366         unsigned short ggc;
5367
5368         if (pci_read_config_word(dev, GGC, &ggc))
5369                 return;
5370
5371         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5372                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5373                 dmar_map_gfx = 0;
5374         } else if (dmar_map_gfx) {
5375                 /* we have to ensure the gfx device is idle before we flush */
5376                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5377                 intel_iommu_strict = 1;
5378         }
5379 }
5380 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5381 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5382 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5383 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5384
5385 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5386    ISOCH DMAR unit for the Azalia sound device, but not give it any
5387    TLB entries, which causes it to deadlock. Check for that.  We do
5388    this in a function called from init_dmars(), instead of in a PCI
5389    quirk, because we don't want to print the obnoxious "BIOS broken"
5390    message if VT-d is actually disabled.
5391 */
5392 static void __init check_tylersburg_isoch(void)
5393 {
5394         struct pci_dev *pdev;
5395         uint32_t vtisochctrl;
5396
5397         /* If there's no Azalia in the system anyway, forget it. */
5398         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5399         if (!pdev)
5400                 return;
5401         pci_dev_put(pdev);
5402
5403         /* System Management Registers. Might be hidden, in which case
5404            we can't do the sanity check. But that's OK, because the
5405            known-broken BIOSes _don't_ actually hide it, so far. */
5406         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5407         if (!pdev)
5408                 return;
5409
5410         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5411                 pci_dev_put(pdev);
5412                 return;
5413         }
5414
5415         pci_dev_put(pdev);
5416
5417         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5418         if (vtisochctrl & 1)
5419                 return;
5420
5421         /* Drop all bits other than the number of TLB entries */
5422         vtisochctrl &= 0x1c;
5423
5424         /* If we have the recommended number of TLB entries (16), fine. */
5425         if (vtisochctrl == 0x10)
5426                 return;
5427
5428         /* Zero TLB entries? You get to ride the short bus to school. */
5429         if (!vtisochctrl) {
5430                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5431                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5432                      dmi_get_system_info(DMI_BIOS_VENDOR),
5433                      dmi_get_system_info(DMI_BIOS_VERSION),
5434                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5435                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5436                 return;
5437         }
5438
5439         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5440                vtisochctrl);
5441 }