]> asedeno.scripts.mit.edu Git - linux.git/blob - drivers/iommu/intel-iommu.c
497ef94c5a8c1e9ad3a0dc9b8296c5daa46445e2
[linux.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/dma-direct.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/cacheflush.h>
53 #include <asm/iommu.h>
54
55 #include "irq_remapping.h"
56
57 #define ROOT_SIZE               VTD_PAGE_SIZE
58 #define CONTEXT_SIZE            VTD_PAGE_SIZE
59
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
64
65 #define IOAPIC_RANGE_START      (0xfee00000)
66 #define IOAPIC_RANGE_END        (0xfeefffff)
67 #define IOVA_START_ADDR         (0x1000)
68
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
70
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
73
74 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
76
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
80                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
82
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN          (1)
85
86 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
87
88 /* page table handling */
89 #define LEVEL_STRIDE            (9)
90 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
91
92 /*
93  * This bitmap is used to advertise the page sizes our hardware support
94  * to the IOMMU core, which will then use this information to split
95  * physically contiguous memory regions it is mapping into page sizes
96  * that we support.
97  *
98  * Traditionally the IOMMU core just handed us the mappings directly,
99  * after making sure the size is an order of a 4KiB page and that the
100  * mapping has natural alignment.
101  *
102  * To retain this behavior, we currently advertise that we support
103  * all page sizes that are an order of 4KiB.
104  *
105  * If at some point we'd like to utilize the IOMMU core's new behavior,
106  * we could change this to advertise the real page sizes we support.
107  */
108 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
109
110 static inline int agaw_to_level(int agaw)
111 {
112         return agaw + 2;
113 }
114
115 static inline int agaw_to_width(int agaw)
116 {
117         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
118 }
119
120 static inline int width_to_agaw(int width)
121 {
122         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
123 }
124
125 static inline unsigned int level_to_offset_bits(int level)
126 {
127         return (level - 1) * LEVEL_STRIDE;
128 }
129
130 static inline int pfn_level_offset(unsigned long pfn, int level)
131 {
132         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
133 }
134
135 static inline unsigned long level_mask(int level)
136 {
137         return -1UL << level_to_offset_bits(level);
138 }
139
140 static inline unsigned long level_size(int level)
141 {
142         return 1UL << level_to_offset_bits(level);
143 }
144
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
146 {
147         return (pfn + level_size(level) - 1) & level_mask(level);
148 }
149
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
151 {
152         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
153 }
154
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156    are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
158 {
159         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
160 }
161
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
163 {
164         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
165 }
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
167 {
168         return mm_to_dma_pfn(page_to_pfn(pg));
169 }
170 static inline unsigned long virt_to_dma_pfn(void *p)
171 {
172         return page_to_dma_pfn(virt_to_page(p));
173 }
174
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
177
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
180
181 /*
182  * set to 1 to panic kernel if can't successfully enable VT-d
183  * (used when kernel is launched w/ TXT)
184  */
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
187
188 /*
189  * 0: Present
190  * 1-11: Reserved
191  * 12-63: Context Ptr (12 - (haw-1))
192  * 64-127: Reserved
193  */
194 struct root_entry {
195         u64     lo;
196         u64     hi;
197 };
198 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
199
200 /*
201  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
202  * if marked present.
203  */
204 static phys_addr_t root_entry_lctp(struct root_entry *re)
205 {
206         if (!(re->lo & 1))
207                 return 0;
208
209         return re->lo & VTD_PAGE_MASK;
210 }
211
212 /*
213  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
214  * if marked present.
215  */
216 static phys_addr_t root_entry_uctp(struct root_entry *re)
217 {
218         if (!(re->hi & 1))
219                 return 0;
220
221         return re->hi & VTD_PAGE_MASK;
222 }
223 /*
224  * low 64 bits:
225  * 0: present
226  * 1: fault processing disable
227  * 2-3: translation type
228  * 12-63: address space root
229  * high 64 bits:
230  * 0-2: address width
231  * 3-6: aval
232  * 8-23: domain id
233  */
234 struct context_entry {
235         u64 lo;
236         u64 hi;
237 };
238
239 static inline void context_clear_pasid_enable(struct context_entry *context)
240 {
241         context->lo &= ~(1ULL << 11);
242 }
243
244 static inline bool context_pasid_enabled(struct context_entry *context)
245 {
246         return !!(context->lo & (1ULL << 11));
247 }
248
249 static inline void context_set_copied(struct context_entry *context)
250 {
251         context->hi |= (1ull << 3);
252 }
253
254 static inline bool context_copied(struct context_entry *context)
255 {
256         return !!(context->hi & (1ULL << 3));
257 }
258
259 static inline bool __context_present(struct context_entry *context)
260 {
261         return (context->lo & 1);
262 }
263
264 static inline bool context_present(struct context_entry *context)
265 {
266         return context_pasid_enabled(context) ?
267              __context_present(context) :
268              __context_present(context) && !context_copied(context);
269 }
270
271 static inline void context_set_present(struct context_entry *context)
272 {
273         context->lo |= 1;
274 }
275
276 static inline void context_set_fault_enable(struct context_entry *context)
277 {
278         context->lo &= (((u64)-1) << 2) | 1;
279 }
280
281 static inline void context_set_translation_type(struct context_entry *context,
282                                                 unsigned long value)
283 {
284         context->lo &= (((u64)-1) << 4) | 3;
285         context->lo |= (value & 3) << 2;
286 }
287
288 static inline void context_set_address_root(struct context_entry *context,
289                                             unsigned long value)
290 {
291         context->lo &= ~VTD_PAGE_MASK;
292         context->lo |= value & VTD_PAGE_MASK;
293 }
294
295 static inline void context_set_address_width(struct context_entry *context,
296                                              unsigned long value)
297 {
298         context->hi |= value & 7;
299 }
300
301 static inline void context_set_domain_id(struct context_entry *context,
302                                          unsigned long value)
303 {
304         context->hi |= (value & ((1 << 16) - 1)) << 8;
305 }
306
307 static inline int context_domain_id(struct context_entry *c)
308 {
309         return((c->hi >> 8) & 0xffff);
310 }
311
312 static inline void context_clear_entry(struct context_entry *context)
313 {
314         context->lo = 0;
315         context->hi = 0;
316 }
317
318 /*
319  * 0: readable
320  * 1: writable
321  * 2-6: reserved
322  * 7: super page
323  * 8-10: available
324  * 11: snoop behavior
325  * 12-63: Host physcial address
326  */
327 struct dma_pte {
328         u64 val;
329 };
330
331 static inline void dma_clear_pte(struct dma_pte *pte)
332 {
333         pte->val = 0;
334 }
335
336 static inline u64 dma_pte_addr(struct dma_pte *pte)
337 {
338 #ifdef CONFIG_64BIT
339         return pte->val & VTD_PAGE_MASK;
340 #else
341         /* Must have a full atomic 64-bit read */
342         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
343 #endif
344 }
345
346 static inline bool dma_pte_present(struct dma_pte *pte)
347 {
348         return (pte->val & 3) != 0;
349 }
350
351 static inline bool dma_pte_superpage(struct dma_pte *pte)
352 {
353         return (pte->val & DMA_PTE_LARGE_PAGE);
354 }
355
356 static inline int first_pte_in_page(struct dma_pte *pte)
357 {
358         return !((unsigned long)pte & ~VTD_PAGE_MASK);
359 }
360
361 /*
362  * This domain is a statically identity mapping domain.
363  *      1. This domain creats a static 1:1 mapping to all usable memory.
364  *      2. It maps to each iommu if successful.
365  *      3. Each iommu mapps to this domain if successful.
366  */
367 static struct dmar_domain *si_domain;
368 static int hw_pass_through = 1;
369
370 /*
371  * Domain represents a virtual machine, more than one devices
372  * across iommus may be owned in one domain, e.g. kvm guest.
373  */
374 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
375
376 /* si_domain contains mulitple devices */
377 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
378
379 #define for_each_domain_iommu(idx, domain)                      \
380         for (idx = 0; idx < g_num_of_iommus; idx++)             \
381                 if (domain->iommu_refcnt[idx])
382
383 struct dmar_domain {
384         int     nid;                    /* node id */
385
386         unsigned        iommu_refcnt[DMAR_UNITS_SUPPORTED];
387                                         /* Refcount of devices per iommu */
388
389
390         u16             iommu_did[DMAR_UNITS_SUPPORTED];
391                                         /* Domain ids per IOMMU. Use u16 since
392                                          * domain ids are 16 bit wide according
393                                          * to VT-d spec, section 9.3 */
394
395         bool has_iotlb_device;
396         struct list_head devices;       /* all devices' list */
397         struct iova_domain iovad;       /* iova's that belong to this domain */
398
399         struct dma_pte  *pgd;           /* virtual address */
400         int             gaw;            /* max guest address width */
401
402         /* adjusted guest address width, 0 is level 2 30-bit */
403         int             agaw;
404
405         int             flags;          /* flags to find out type of domain */
406
407         int             iommu_coherency;/* indicate coherency of iommu access */
408         int             iommu_snooping; /* indicate snooping control feature*/
409         int             iommu_count;    /* reference count of iommu */
410         int             iommu_superpage;/* Level of superpages supported:
411                                            0 == 4KiB (no superpages), 1 == 2MiB,
412                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
413         u64             max_addr;       /* maximum mapped address */
414
415         struct iommu_domain domain;     /* generic domain data structure for
416                                            iommu core */
417 };
418
419 /* PCI domain-device relationship */
420 struct device_domain_info {
421         struct list_head link;  /* link to domain siblings */
422         struct list_head global; /* link to global list */
423         u8 bus;                 /* PCI bus number */
424         u8 devfn;               /* PCI devfn number */
425         u16 pfsid;              /* SRIOV physical function source ID */
426         u8 pasid_supported:3;
427         u8 pasid_enabled:1;
428         u8 pri_supported:1;
429         u8 pri_enabled:1;
430         u8 ats_supported:1;
431         u8 ats_enabled:1;
432         u8 ats_qdep;
433         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
434         struct intel_iommu *iommu; /* IOMMU used by this device */
435         struct dmar_domain *domain; /* pointer to domain */
436 };
437
438 struct dmar_rmrr_unit {
439         struct list_head list;          /* list of rmrr units   */
440         struct acpi_dmar_header *hdr;   /* ACPI header          */
441         u64     base_address;           /* reserved base address*/
442         u64     end_address;            /* reserved end address */
443         struct dmar_dev_scope *devices; /* target devices */
444         int     devices_cnt;            /* target device count */
445         struct iommu_resv_region *resv; /* reserved region handle */
446 };
447
448 struct dmar_atsr_unit {
449         struct list_head list;          /* list of ATSR units */
450         struct acpi_dmar_header *hdr;   /* ACPI header */
451         struct dmar_dev_scope *devices; /* target devices */
452         int devices_cnt;                /* target device count */
453         u8 include_all:1;               /* include all ports */
454 };
455
456 static LIST_HEAD(dmar_atsr_units);
457 static LIST_HEAD(dmar_rmrr_units);
458
459 #define for_each_rmrr_units(rmrr) \
460         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
461
462 /* bitmap for indexing intel_iommus */
463 static int g_num_of_iommus;
464
465 static void domain_exit(struct dmar_domain *domain);
466 static void domain_remove_dev_info(struct dmar_domain *domain);
467 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
468                                      struct device *dev);
469 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
470 static void domain_context_clear(struct intel_iommu *iommu,
471                                  struct device *dev);
472 static int domain_detach_iommu(struct dmar_domain *domain,
473                                struct intel_iommu *iommu);
474
475 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
476 int dmar_disabled = 0;
477 #else
478 int dmar_disabled = 1;
479 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
480
481 int intel_iommu_enabled = 0;
482 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
483
484 static int dmar_map_gfx = 1;
485 static int dmar_forcedac;
486 static int intel_iommu_strict;
487 static int intel_iommu_superpage = 1;
488 static int intel_iommu_ecs = 1;
489 static int iommu_identity_mapping;
490
491 #define IDENTMAP_ALL            1
492 #define IDENTMAP_GFX            2
493 #define IDENTMAP_AZALIA         4
494
495 #define ecs_enabled(iommu)      (intel_iommu_ecs && ecap_ecs(iommu->ecap))
496 #define pasid_enabled(iommu)    (ecs_enabled(iommu) && ecap_pasid(iommu->ecap))
497
498 int intel_iommu_gfx_mapped;
499 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
500
501 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
502 static DEFINE_SPINLOCK(device_domain_lock);
503 static LIST_HEAD(device_domain_list);
504
505 const struct iommu_ops intel_iommu_ops;
506
507 static bool translation_pre_enabled(struct intel_iommu *iommu)
508 {
509         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
510 }
511
512 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
513 {
514         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
515 }
516
517 static void init_translation_status(struct intel_iommu *iommu)
518 {
519         u32 gsts;
520
521         gsts = readl(iommu->reg + DMAR_GSTS_REG);
522         if (gsts & DMA_GSTS_TES)
523                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
524 }
525
526 /* Convert generic 'struct iommu_domain to private struct dmar_domain */
527 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
528 {
529         return container_of(dom, struct dmar_domain, domain);
530 }
531
532 static int __init intel_iommu_setup(char *str)
533 {
534         if (!str)
535                 return -EINVAL;
536         while (*str) {
537                 if (!strncmp(str, "on", 2)) {
538                         dmar_disabled = 0;
539                         pr_info("IOMMU enabled\n");
540                 } else if (!strncmp(str, "off", 3)) {
541                         dmar_disabled = 1;
542                         pr_info("IOMMU disabled\n");
543                 } else if (!strncmp(str, "igfx_off", 8)) {
544                         dmar_map_gfx = 0;
545                         pr_info("Disable GFX device mapping\n");
546                 } else if (!strncmp(str, "forcedac", 8)) {
547                         pr_info("Forcing DAC for PCI devices\n");
548                         dmar_forcedac = 1;
549                 } else if (!strncmp(str, "strict", 6)) {
550                         pr_info("Disable batched IOTLB flush\n");
551                         intel_iommu_strict = 1;
552                 } else if (!strncmp(str, "sp_off", 6)) {
553                         pr_info("Disable supported super page\n");
554                         intel_iommu_superpage = 0;
555                 } else if (!strncmp(str, "ecs_off", 7)) {
556                         printk(KERN_INFO
557                                 "Intel-IOMMU: disable extended context table support\n");
558                         intel_iommu_ecs = 0;
559                 } else if (!strncmp(str, "tboot_noforce", 13)) {
560                         printk(KERN_INFO
561                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
562                         intel_iommu_tboot_noforce = 1;
563                 }
564
565                 str += strcspn(str, ",");
566                 while (*str == ',')
567                         str++;
568         }
569         return 0;
570 }
571 __setup("intel_iommu=", intel_iommu_setup);
572
573 static struct kmem_cache *iommu_domain_cache;
574 static struct kmem_cache *iommu_devinfo_cache;
575
576 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
577 {
578         struct dmar_domain **domains;
579         int idx = did >> 8;
580
581         domains = iommu->domains[idx];
582         if (!domains)
583                 return NULL;
584
585         return domains[did & 0xff];
586 }
587
588 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
589                              struct dmar_domain *domain)
590 {
591         struct dmar_domain **domains;
592         int idx = did >> 8;
593
594         if (!iommu->domains[idx]) {
595                 size_t size = 256 * sizeof(struct dmar_domain *);
596                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
597         }
598
599         domains = iommu->domains[idx];
600         if (WARN_ON(!domains))
601                 return;
602         else
603                 domains[did & 0xff] = domain;
604 }
605
606 static inline void *alloc_pgtable_page(int node)
607 {
608         struct page *page;
609         void *vaddr = NULL;
610
611         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
612         if (page)
613                 vaddr = page_address(page);
614         return vaddr;
615 }
616
617 static inline void free_pgtable_page(void *vaddr)
618 {
619         free_page((unsigned long)vaddr);
620 }
621
622 static inline void *alloc_domain_mem(void)
623 {
624         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
625 }
626
627 static void free_domain_mem(void *vaddr)
628 {
629         kmem_cache_free(iommu_domain_cache, vaddr);
630 }
631
632 static inline void * alloc_devinfo_mem(void)
633 {
634         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
635 }
636
637 static inline void free_devinfo_mem(void *vaddr)
638 {
639         kmem_cache_free(iommu_devinfo_cache, vaddr);
640 }
641
642 static inline int domain_type_is_vm(struct dmar_domain *domain)
643 {
644         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
645 }
646
647 static inline int domain_type_is_si(struct dmar_domain *domain)
648 {
649         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
650 }
651
652 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
653 {
654         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
655                                 DOMAIN_FLAG_STATIC_IDENTITY);
656 }
657
658 static inline int domain_pfn_supported(struct dmar_domain *domain,
659                                        unsigned long pfn)
660 {
661         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
662
663         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
664 }
665
666 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
667 {
668         unsigned long sagaw;
669         int agaw = -1;
670
671         sagaw = cap_sagaw(iommu->cap);
672         for (agaw = width_to_agaw(max_gaw);
673              agaw >= 0; agaw--) {
674                 if (test_bit(agaw, &sagaw))
675                         break;
676         }
677
678         return agaw;
679 }
680
681 /*
682  * Calculate max SAGAW for each iommu.
683  */
684 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
685 {
686         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
687 }
688
689 /*
690  * calculate agaw for each iommu.
691  * "SAGAW" may be different across iommus, use a default agaw, and
692  * get a supported less agaw for iommus that don't support the default agaw.
693  */
694 int iommu_calculate_agaw(struct intel_iommu *iommu)
695 {
696         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
697 }
698
699 /* This functionin only returns single iommu in a domain */
700 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
701 {
702         int iommu_id;
703
704         /* si_domain and vm domain should not get here. */
705         BUG_ON(domain_type_is_vm_or_si(domain));
706         for_each_domain_iommu(iommu_id, domain)
707                 break;
708
709         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
710                 return NULL;
711
712         return g_iommus[iommu_id];
713 }
714
715 static void domain_update_iommu_coherency(struct dmar_domain *domain)
716 {
717         struct dmar_drhd_unit *drhd;
718         struct intel_iommu *iommu;
719         bool found = false;
720         int i;
721
722         domain->iommu_coherency = 1;
723
724         for_each_domain_iommu(i, domain) {
725                 found = true;
726                 if (!ecap_coherent(g_iommus[i]->ecap)) {
727                         domain->iommu_coherency = 0;
728                         break;
729                 }
730         }
731         if (found)
732                 return;
733
734         /* No hardware attached; use lowest common denominator */
735         rcu_read_lock();
736         for_each_active_iommu(iommu, drhd) {
737                 if (!ecap_coherent(iommu->ecap)) {
738                         domain->iommu_coherency = 0;
739                         break;
740                 }
741         }
742         rcu_read_unlock();
743 }
744
745 static int domain_update_iommu_snooping(struct intel_iommu *skip)
746 {
747         struct dmar_drhd_unit *drhd;
748         struct intel_iommu *iommu;
749         int ret = 1;
750
751         rcu_read_lock();
752         for_each_active_iommu(iommu, drhd) {
753                 if (iommu != skip) {
754                         if (!ecap_sc_support(iommu->ecap)) {
755                                 ret = 0;
756                                 break;
757                         }
758                 }
759         }
760         rcu_read_unlock();
761
762         return ret;
763 }
764
765 static int domain_update_iommu_superpage(struct intel_iommu *skip)
766 {
767         struct dmar_drhd_unit *drhd;
768         struct intel_iommu *iommu;
769         int mask = 0xf;
770
771         if (!intel_iommu_superpage) {
772                 return 0;
773         }
774
775         /* set iommu_superpage to the smallest common denominator */
776         rcu_read_lock();
777         for_each_active_iommu(iommu, drhd) {
778                 if (iommu != skip) {
779                         mask &= cap_super_page_val(iommu->cap);
780                         if (!mask)
781                                 break;
782                 }
783         }
784         rcu_read_unlock();
785
786         return fls(mask);
787 }
788
789 /* Some capabilities may be different across iommus */
790 static void domain_update_iommu_cap(struct dmar_domain *domain)
791 {
792         domain_update_iommu_coherency(domain);
793         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
794         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
795 }
796
797 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
798                                                        u8 bus, u8 devfn, int alloc)
799 {
800         struct root_entry *root = &iommu->root_entry[bus];
801         struct context_entry *context;
802         u64 *entry;
803
804         entry = &root->lo;
805         if (ecs_enabled(iommu)) {
806                 if (devfn >= 0x80) {
807                         devfn -= 0x80;
808                         entry = &root->hi;
809                 }
810                 devfn *= 2;
811         }
812         if (*entry & 1)
813                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
814         else {
815                 unsigned long phy_addr;
816                 if (!alloc)
817                         return NULL;
818
819                 context = alloc_pgtable_page(iommu->node);
820                 if (!context)
821                         return NULL;
822
823                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
824                 phy_addr = virt_to_phys((void *)context);
825                 *entry = phy_addr | 1;
826                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
827         }
828         return &context[devfn];
829 }
830
831 static int iommu_dummy(struct device *dev)
832 {
833         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
834 }
835
836 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
837 {
838         struct dmar_drhd_unit *drhd = NULL;
839         struct intel_iommu *iommu;
840         struct device *tmp;
841         struct pci_dev *ptmp, *pdev = NULL;
842         u16 segment = 0;
843         int i;
844
845         if (iommu_dummy(dev))
846                 return NULL;
847
848         if (dev_is_pci(dev)) {
849                 struct pci_dev *pf_pdev;
850
851                 pdev = to_pci_dev(dev);
852
853 #ifdef CONFIG_X86
854                 /* VMD child devices currently cannot be handled individually */
855                 if (is_vmd(pdev->bus))
856                         return NULL;
857 #endif
858
859                 /* VFs aren't listed in scope tables; we need to look up
860                  * the PF instead to find the IOMMU. */
861                 pf_pdev = pci_physfn(pdev);
862                 dev = &pf_pdev->dev;
863                 segment = pci_domain_nr(pdev->bus);
864         } else if (has_acpi_companion(dev))
865                 dev = &ACPI_COMPANION(dev)->dev;
866
867         rcu_read_lock();
868         for_each_active_iommu(iommu, drhd) {
869                 if (pdev && segment != drhd->segment)
870                         continue;
871
872                 for_each_active_dev_scope(drhd->devices,
873                                           drhd->devices_cnt, i, tmp) {
874                         if (tmp == dev) {
875                                 /* For a VF use its original BDF# not that of the PF
876                                  * which we used for the IOMMU lookup. Strictly speaking
877                                  * we could do this for all PCI devices; we only need to
878                                  * get the BDF# from the scope table for ACPI matches. */
879                                 if (pdev && pdev->is_virtfn)
880                                         goto got_pdev;
881
882                                 *bus = drhd->devices[i].bus;
883                                 *devfn = drhd->devices[i].devfn;
884                                 goto out;
885                         }
886
887                         if (!pdev || !dev_is_pci(tmp))
888                                 continue;
889
890                         ptmp = to_pci_dev(tmp);
891                         if (ptmp->subordinate &&
892                             ptmp->subordinate->number <= pdev->bus->number &&
893                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
894                                 goto got_pdev;
895                 }
896
897                 if (pdev && drhd->include_all) {
898                 got_pdev:
899                         *bus = pdev->bus->number;
900                         *devfn = pdev->devfn;
901                         goto out;
902                 }
903         }
904         iommu = NULL;
905  out:
906         rcu_read_unlock();
907
908         return iommu;
909 }
910
911 static void domain_flush_cache(struct dmar_domain *domain,
912                                void *addr, int size)
913 {
914         if (!domain->iommu_coherency)
915                 clflush_cache_range(addr, size);
916 }
917
918 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
919 {
920         struct context_entry *context;
921         int ret = 0;
922         unsigned long flags;
923
924         spin_lock_irqsave(&iommu->lock, flags);
925         context = iommu_context_addr(iommu, bus, devfn, 0);
926         if (context)
927                 ret = context_present(context);
928         spin_unlock_irqrestore(&iommu->lock, flags);
929         return ret;
930 }
931
932 static void free_context_table(struct intel_iommu *iommu)
933 {
934         int i;
935         unsigned long flags;
936         struct context_entry *context;
937
938         spin_lock_irqsave(&iommu->lock, flags);
939         if (!iommu->root_entry) {
940                 goto out;
941         }
942         for (i = 0; i < ROOT_ENTRY_NR; i++) {
943                 context = iommu_context_addr(iommu, i, 0, 0);
944                 if (context)
945                         free_pgtable_page(context);
946
947                 if (!ecs_enabled(iommu))
948                         continue;
949
950                 context = iommu_context_addr(iommu, i, 0x80, 0);
951                 if (context)
952                         free_pgtable_page(context);
953
954         }
955         free_pgtable_page(iommu->root_entry);
956         iommu->root_entry = NULL;
957 out:
958         spin_unlock_irqrestore(&iommu->lock, flags);
959 }
960
961 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
962                                       unsigned long pfn, int *target_level)
963 {
964         struct dma_pte *parent, *pte = NULL;
965         int level = agaw_to_level(domain->agaw);
966         int offset;
967
968         BUG_ON(!domain->pgd);
969
970         if (!domain_pfn_supported(domain, pfn))
971                 /* Address beyond IOMMU's addressing capabilities. */
972                 return NULL;
973
974         parent = domain->pgd;
975
976         while (1) {
977                 void *tmp_page;
978
979                 offset = pfn_level_offset(pfn, level);
980                 pte = &parent[offset];
981                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
982                         break;
983                 if (level == *target_level)
984                         break;
985
986                 if (!dma_pte_present(pte)) {
987                         uint64_t pteval;
988
989                         tmp_page = alloc_pgtable_page(domain->nid);
990
991                         if (!tmp_page)
992                                 return NULL;
993
994                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
995                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
996                         if (cmpxchg64(&pte->val, 0ULL, pteval))
997                                 /* Someone else set it while we were thinking; use theirs. */
998                                 free_pgtable_page(tmp_page);
999                         else
1000                                 domain_flush_cache(domain, pte, sizeof(*pte));
1001                 }
1002                 if (level == 1)
1003                         break;
1004
1005                 parent = phys_to_virt(dma_pte_addr(pte));
1006                 level--;
1007         }
1008
1009         if (!*target_level)
1010                 *target_level = level;
1011
1012         return pte;
1013 }
1014
1015
1016 /* return address's pte at specific level */
1017 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
1018                                          unsigned long pfn,
1019                                          int level, int *large_page)
1020 {
1021         struct dma_pte *parent, *pte = NULL;
1022         int total = agaw_to_level(domain->agaw);
1023         int offset;
1024
1025         parent = domain->pgd;
1026         while (level <= total) {
1027                 offset = pfn_level_offset(pfn, total);
1028                 pte = &parent[offset];
1029                 if (level == total)
1030                         return pte;
1031
1032                 if (!dma_pte_present(pte)) {
1033                         *large_page = total;
1034                         break;
1035                 }
1036
1037                 if (dma_pte_superpage(pte)) {
1038                         *large_page = total;
1039                         return pte;
1040                 }
1041
1042                 parent = phys_to_virt(dma_pte_addr(pte));
1043                 total--;
1044         }
1045         return NULL;
1046 }
1047
1048 /* clear last level pte, a tlb flush should be followed */
1049 static void dma_pte_clear_range(struct dmar_domain *domain,
1050                                 unsigned long start_pfn,
1051                                 unsigned long last_pfn)
1052 {
1053         unsigned int large_page = 1;
1054         struct dma_pte *first_pte, *pte;
1055
1056         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1057         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1058         BUG_ON(start_pfn > last_pfn);
1059
1060         /* we don't need lock here; nobody else touches the iova range */
1061         do {
1062                 large_page = 1;
1063                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1064                 if (!pte) {
1065                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1066                         continue;
1067                 }
1068                 do {
1069                         dma_clear_pte(pte);
1070                         start_pfn += lvl_to_nr_pages(large_page);
1071                         pte++;
1072                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1073
1074                 domain_flush_cache(domain, first_pte,
1075                                    (void *)pte - (void *)first_pte);
1076
1077         } while (start_pfn && start_pfn <= last_pfn);
1078 }
1079
1080 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1081                                int retain_level, struct dma_pte *pte,
1082                                unsigned long pfn, unsigned long start_pfn,
1083                                unsigned long last_pfn)
1084 {
1085         pfn = max(start_pfn, pfn);
1086         pte = &pte[pfn_level_offset(pfn, level)];
1087
1088         do {
1089                 unsigned long level_pfn;
1090                 struct dma_pte *level_pte;
1091
1092                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1093                         goto next;
1094
1095                 level_pfn = pfn & level_mask(level);
1096                 level_pte = phys_to_virt(dma_pte_addr(pte));
1097
1098                 if (level > 2) {
1099                         dma_pte_free_level(domain, level - 1, retain_level,
1100                                            level_pte, level_pfn, start_pfn,
1101                                            last_pfn);
1102                 }
1103
1104                 /*
1105                  * Free the page table if we're below the level we want to
1106                  * retain and the range covers the entire table.
1107                  */
1108                 if (level < retain_level && !(start_pfn > level_pfn ||
1109                       last_pfn < level_pfn + level_size(level) - 1)) {
1110                         dma_clear_pte(pte);
1111                         domain_flush_cache(domain, pte, sizeof(*pte));
1112                         free_pgtable_page(level_pte);
1113                 }
1114 next:
1115                 pfn += level_size(level);
1116         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1117 }
1118
1119 /*
1120  * clear last level (leaf) ptes and free page table pages below the
1121  * level we wish to keep intact.
1122  */
1123 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1124                                    unsigned long start_pfn,
1125                                    unsigned long last_pfn,
1126                                    int retain_level)
1127 {
1128         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1129         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1130         BUG_ON(start_pfn > last_pfn);
1131
1132         dma_pte_clear_range(domain, start_pfn, last_pfn);
1133
1134         /* We don't need lock here; nobody else touches the iova range */
1135         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1136                            domain->pgd, 0, start_pfn, last_pfn);
1137
1138         /* free pgd */
1139         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1140                 free_pgtable_page(domain->pgd);
1141                 domain->pgd = NULL;
1142         }
1143 }
1144
1145 /* When a page at a given level is being unlinked from its parent, we don't
1146    need to *modify* it at all. All we need to do is make a list of all the
1147    pages which can be freed just as soon as we've flushed the IOTLB and we
1148    know the hardware page-walk will no longer touch them.
1149    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1150    be freed. */
1151 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1152                                             int level, struct dma_pte *pte,
1153                                             struct page *freelist)
1154 {
1155         struct page *pg;
1156
1157         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1158         pg->freelist = freelist;
1159         freelist = pg;
1160
1161         if (level == 1)
1162                 return freelist;
1163
1164         pte = page_address(pg);
1165         do {
1166                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1167                         freelist = dma_pte_list_pagetables(domain, level - 1,
1168                                                            pte, freelist);
1169                 pte++;
1170         } while (!first_pte_in_page(pte));
1171
1172         return freelist;
1173 }
1174
1175 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1176                                         struct dma_pte *pte, unsigned long pfn,
1177                                         unsigned long start_pfn,
1178                                         unsigned long last_pfn,
1179                                         struct page *freelist)
1180 {
1181         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1182
1183         pfn = max(start_pfn, pfn);
1184         pte = &pte[pfn_level_offset(pfn, level)];
1185
1186         do {
1187                 unsigned long level_pfn;
1188
1189                 if (!dma_pte_present(pte))
1190                         goto next;
1191
1192                 level_pfn = pfn & level_mask(level);
1193
1194                 /* If range covers entire pagetable, free it */
1195                 if (start_pfn <= level_pfn &&
1196                     last_pfn >= level_pfn + level_size(level) - 1) {
1197                         /* These suborbinate page tables are going away entirely. Don't
1198                            bother to clear them; we're just going to *free* them. */
1199                         if (level > 1 && !dma_pte_superpage(pte))
1200                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1201
1202                         dma_clear_pte(pte);
1203                         if (!first_pte)
1204                                 first_pte = pte;
1205                         last_pte = pte;
1206                 } else if (level > 1) {
1207                         /* Recurse down into a level that isn't *entirely* obsolete */
1208                         freelist = dma_pte_clear_level(domain, level - 1,
1209                                                        phys_to_virt(dma_pte_addr(pte)),
1210                                                        level_pfn, start_pfn, last_pfn,
1211                                                        freelist);
1212                 }
1213 next:
1214                 pfn += level_size(level);
1215         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1216
1217         if (first_pte)
1218                 domain_flush_cache(domain, first_pte,
1219                                    (void *)++last_pte - (void *)first_pte);
1220
1221         return freelist;
1222 }
1223
1224 /* We can't just free the pages because the IOMMU may still be walking
1225    the page tables, and may have cached the intermediate levels. The
1226    pages can only be freed after the IOTLB flush has been done. */
1227 static struct page *domain_unmap(struct dmar_domain *domain,
1228                                  unsigned long start_pfn,
1229                                  unsigned long last_pfn)
1230 {
1231         struct page *freelist = NULL;
1232
1233         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1234         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1235         BUG_ON(start_pfn > last_pfn);
1236
1237         /* we don't need lock here; nobody else touches the iova range */
1238         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1239                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1240
1241         /* free pgd */
1242         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1243                 struct page *pgd_page = virt_to_page(domain->pgd);
1244                 pgd_page->freelist = freelist;
1245                 freelist = pgd_page;
1246
1247                 domain->pgd = NULL;
1248         }
1249
1250         return freelist;
1251 }
1252
1253 static void dma_free_pagelist(struct page *freelist)
1254 {
1255         struct page *pg;
1256
1257         while ((pg = freelist)) {
1258                 freelist = pg->freelist;
1259                 free_pgtable_page(page_address(pg));
1260         }
1261 }
1262
1263 static void iova_entry_free(unsigned long data)
1264 {
1265         struct page *freelist = (struct page *)data;
1266
1267         dma_free_pagelist(freelist);
1268 }
1269
1270 /* iommu handling */
1271 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1272 {
1273         struct root_entry *root;
1274         unsigned long flags;
1275
1276         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1277         if (!root) {
1278                 pr_err("Allocating root entry for %s failed\n",
1279                         iommu->name);
1280                 return -ENOMEM;
1281         }
1282
1283         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1284
1285         spin_lock_irqsave(&iommu->lock, flags);
1286         iommu->root_entry = root;
1287         spin_unlock_irqrestore(&iommu->lock, flags);
1288
1289         return 0;
1290 }
1291
1292 static void iommu_set_root_entry(struct intel_iommu *iommu)
1293 {
1294         u64 addr;
1295         u32 sts;
1296         unsigned long flag;
1297
1298         addr = virt_to_phys(iommu->root_entry);
1299         if (ecs_enabled(iommu))
1300                 addr |= DMA_RTADDR_RTT;
1301
1302         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1303         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1304
1305         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1306
1307         /* Make sure hardware complete it */
1308         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1309                       readl, (sts & DMA_GSTS_RTPS), sts);
1310
1311         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1312 }
1313
1314 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1315 {
1316         u32 val;
1317         unsigned long flag;
1318
1319         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1320                 return;
1321
1322         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1323         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1324
1325         /* Make sure hardware complete it */
1326         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1327                       readl, (!(val & DMA_GSTS_WBFS)), val);
1328
1329         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1330 }
1331
1332 /* return value determine if we need a write buffer flush */
1333 static void __iommu_flush_context(struct intel_iommu *iommu,
1334                                   u16 did, u16 source_id, u8 function_mask,
1335                                   u64 type)
1336 {
1337         u64 val = 0;
1338         unsigned long flag;
1339
1340         switch (type) {
1341         case DMA_CCMD_GLOBAL_INVL:
1342                 val = DMA_CCMD_GLOBAL_INVL;
1343                 break;
1344         case DMA_CCMD_DOMAIN_INVL:
1345                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1346                 break;
1347         case DMA_CCMD_DEVICE_INVL:
1348                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1349                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1350                 break;
1351         default:
1352                 BUG();
1353         }
1354         val |= DMA_CCMD_ICC;
1355
1356         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1357         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1358
1359         /* Make sure hardware complete it */
1360         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1361                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1362
1363         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364 }
1365
1366 /* return value determine if we need a write buffer flush */
1367 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1368                                 u64 addr, unsigned int size_order, u64 type)
1369 {
1370         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1371         u64 val = 0, val_iva = 0;
1372         unsigned long flag;
1373
1374         switch (type) {
1375         case DMA_TLB_GLOBAL_FLUSH:
1376                 /* global flush doesn't need set IVA_REG */
1377                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1378                 break;
1379         case DMA_TLB_DSI_FLUSH:
1380                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1381                 break;
1382         case DMA_TLB_PSI_FLUSH:
1383                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1384                 /* IH bit is passed in as part of address */
1385                 val_iva = size_order | addr;
1386                 break;
1387         default:
1388                 BUG();
1389         }
1390         /* Note: set drain read/write */
1391 #if 0
1392         /*
1393          * This is probably to be super secure.. Looks like we can
1394          * ignore it without any impact.
1395          */
1396         if (cap_read_drain(iommu->cap))
1397                 val |= DMA_TLB_READ_DRAIN;
1398 #endif
1399         if (cap_write_drain(iommu->cap))
1400                 val |= DMA_TLB_WRITE_DRAIN;
1401
1402         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1403         /* Note: Only uses first TLB reg currently */
1404         if (val_iva)
1405                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1406         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1407
1408         /* Make sure hardware complete it */
1409         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1410                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1411
1412         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1413
1414         /* check IOTLB invalidation granularity */
1415         if (DMA_TLB_IAIG(val) == 0)
1416                 pr_err("Flush IOTLB failed\n");
1417         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1418                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1419                         (unsigned long long)DMA_TLB_IIRG(type),
1420                         (unsigned long long)DMA_TLB_IAIG(val));
1421 }
1422
1423 static struct device_domain_info *
1424 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1425                          u8 bus, u8 devfn)
1426 {
1427         struct device_domain_info *info;
1428
1429         assert_spin_locked(&device_domain_lock);
1430
1431         if (!iommu->qi)
1432                 return NULL;
1433
1434         list_for_each_entry(info, &domain->devices, link)
1435                 if (info->iommu == iommu && info->bus == bus &&
1436                     info->devfn == devfn) {
1437                         if (info->ats_supported && info->dev)
1438                                 return info;
1439                         break;
1440                 }
1441
1442         return NULL;
1443 }
1444
1445 static void domain_update_iotlb(struct dmar_domain *domain)
1446 {
1447         struct device_domain_info *info;
1448         bool has_iotlb_device = false;
1449
1450         assert_spin_locked(&device_domain_lock);
1451
1452         list_for_each_entry(info, &domain->devices, link) {
1453                 struct pci_dev *pdev;
1454
1455                 if (!info->dev || !dev_is_pci(info->dev))
1456                         continue;
1457
1458                 pdev = to_pci_dev(info->dev);
1459                 if (pdev->ats_enabled) {
1460                         has_iotlb_device = true;
1461                         break;
1462                 }
1463         }
1464
1465         domain->has_iotlb_device = has_iotlb_device;
1466 }
1467
1468 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1469 {
1470         struct pci_dev *pdev;
1471
1472         assert_spin_locked(&device_domain_lock);
1473
1474         if (!info || !dev_is_pci(info->dev))
1475                 return;
1476
1477         pdev = to_pci_dev(info->dev);
1478         /* For IOMMU that supports device IOTLB throttling (DIT), we assign
1479          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1480          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1481          * reserved, which should be set to 0.
1482          */
1483         if (!ecap_dit(info->iommu->ecap))
1484                 info->pfsid = 0;
1485         else {
1486                 struct pci_dev *pf_pdev;
1487
1488                 /* pdev will be returned if device is not a vf */
1489                 pf_pdev = pci_physfn(pdev);
1490                 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1491         }
1492
1493 #ifdef CONFIG_INTEL_IOMMU_SVM
1494         /* The PCIe spec, in its wisdom, declares that the behaviour of
1495            the device if you enable PASID support after ATS support is
1496            undefined. So always enable PASID support on devices which
1497            have it, even if we can't yet know if we're ever going to
1498            use it. */
1499         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1500                 info->pasid_enabled = 1;
1501
1502         if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1503                 info->pri_enabled = 1;
1504 #endif
1505         if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1506                 info->ats_enabled = 1;
1507                 domain_update_iotlb(info->domain);
1508                 info->ats_qdep = pci_ats_queue_depth(pdev);
1509         }
1510 }
1511
1512 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1513 {
1514         struct pci_dev *pdev;
1515
1516         assert_spin_locked(&device_domain_lock);
1517
1518         if (!dev_is_pci(info->dev))
1519                 return;
1520
1521         pdev = to_pci_dev(info->dev);
1522
1523         if (info->ats_enabled) {
1524                 pci_disable_ats(pdev);
1525                 info->ats_enabled = 0;
1526                 domain_update_iotlb(info->domain);
1527         }
1528 #ifdef CONFIG_INTEL_IOMMU_SVM
1529         if (info->pri_enabled) {
1530                 pci_disable_pri(pdev);
1531                 info->pri_enabled = 0;
1532         }
1533         if (info->pasid_enabled) {
1534                 pci_disable_pasid(pdev);
1535                 info->pasid_enabled = 0;
1536         }
1537 #endif
1538 }
1539
1540 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1541                                   u64 addr, unsigned mask)
1542 {
1543         u16 sid, qdep;
1544         unsigned long flags;
1545         struct device_domain_info *info;
1546
1547         if (!domain->has_iotlb_device)
1548                 return;
1549
1550         spin_lock_irqsave(&device_domain_lock, flags);
1551         list_for_each_entry(info, &domain->devices, link) {
1552                 if (!info->ats_enabled)
1553                         continue;
1554
1555                 sid = info->bus << 8 | info->devfn;
1556                 qdep = info->ats_qdep;
1557                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1558                                 qdep, addr, mask);
1559         }
1560         spin_unlock_irqrestore(&device_domain_lock, flags);
1561 }
1562
1563 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1564                                   struct dmar_domain *domain,
1565                                   unsigned long pfn, unsigned int pages,
1566                                   int ih, int map)
1567 {
1568         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1569         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1570         u16 did = domain->iommu_did[iommu->seq_id];
1571
1572         BUG_ON(pages == 0);
1573
1574         if (ih)
1575                 ih = 1 << 6;
1576         /*
1577          * Fallback to domain selective flush if no PSI support or the size is
1578          * too big.
1579          * PSI requires page size to be 2 ^ x, and the base address is naturally
1580          * aligned to the size
1581          */
1582         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1583                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1584                                                 DMA_TLB_DSI_FLUSH);
1585         else
1586                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1587                                                 DMA_TLB_PSI_FLUSH);
1588
1589         /*
1590          * In caching mode, changes of pages from non-present to present require
1591          * flush. However, device IOTLB doesn't need to be flushed in this case.
1592          */
1593         if (!cap_caching_mode(iommu->cap) || !map)
1594                 iommu_flush_dev_iotlb(domain, addr, mask);
1595 }
1596
1597 /* Notification for newly created mappings */
1598 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1599                                         struct dmar_domain *domain,
1600                                         unsigned long pfn, unsigned int pages)
1601 {
1602         /* It's a non-present to present mapping. Only flush if caching mode */
1603         if (cap_caching_mode(iommu->cap))
1604                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1605         else
1606                 iommu_flush_write_buffer(iommu);
1607 }
1608
1609 static void iommu_flush_iova(struct iova_domain *iovad)
1610 {
1611         struct dmar_domain *domain;
1612         int idx;
1613
1614         domain = container_of(iovad, struct dmar_domain, iovad);
1615
1616         for_each_domain_iommu(idx, domain) {
1617                 struct intel_iommu *iommu = g_iommus[idx];
1618                 u16 did = domain->iommu_did[iommu->seq_id];
1619
1620                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1621
1622                 if (!cap_caching_mode(iommu->cap))
1623                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1624                                               0, MAX_AGAW_PFN_WIDTH);
1625         }
1626 }
1627
1628 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1629 {
1630         u32 pmen;
1631         unsigned long flags;
1632
1633         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1634         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1635         pmen &= ~DMA_PMEN_EPM;
1636         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1637
1638         /* wait for the protected region status bit to clear */
1639         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1640                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1641
1642         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643 }
1644
1645 static void iommu_enable_translation(struct intel_iommu *iommu)
1646 {
1647         u32 sts;
1648         unsigned long flags;
1649
1650         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1651         iommu->gcmd |= DMA_GCMD_TE;
1652         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654         /* Make sure hardware complete it */
1655         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656                       readl, (sts & DMA_GSTS_TES), sts);
1657
1658         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1659 }
1660
1661 static void iommu_disable_translation(struct intel_iommu *iommu)
1662 {
1663         u32 sts;
1664         unsigned long flag;
1665
1666         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1667         iommu->gcmd &= ~DMA_GCMD_TE;
1668         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1669
1670         /* Make sure hardware complete it */
1671         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1672                       readl, (!(sts & DMA_GSTS_TES)), sts);
1673
1674         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1675 }
1676
1677
1678 static int iommu_init_domains(struct intel_iommu *iommu)
1679 {
1680         u32 ndomains, nlongs;
1681         size_t size;
1682
1683         ndomains = cap_ndoms(iommu->cap);
1684         pr_debug("%s: Number of Domains supported <%d>\n",
1685                  iommu->name, ndomains);
1686         nlongs = BITS_TO_LONGS(ndomains);
1687
1688         spin_lock_init(&iommu->lock);
1689
1690         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1691         if (!iommu->domain_ids) {
1692                 pr_err("%s: Allocating domain id array failed\n",
1693                        iommu->name);
1694                 return -ENOMEM;
1695         }
1696
1697         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1698         iommu->domains = kzalloc(size, GFP_KERNEL);
1699
1700         if (iommu->domains) {
1701                 size = 256 * sizeof(struct dmar_domain *);
1702                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1703         }
1704
1705         if (!iommu->domains || !iommu->domains[0]) {
1706                 pr_err("%s: Allocating domain array failed\n",
1707                        iommu->name);
1708                 kfree(iommu->domain_ids);
1709                 kfree(iommu->domains);
1710                 iommu->domain_ids = NULL;
1711                 iommu->domains    = NULL;
1712                 return -ENOMEM;
1713         }
1714
1715
1716
1717         /*
1718          * If Caching mode is set, then invalid translations are tagged
1719          * with domain-id 0, hence we need to pre-allocate it. We also
1720          * use domain-id 0 as a marker for non-allocated domain-id, so
1721          * make sure it is not used for a real domain.
1722          */
1723         set_bit(0, iommu->domain_ids);
1724
1725         return 0;
1726 }
1727
1728 static void disable_dmar_iommu(struct intel_iommu *iommu)
1729 {
1730         struct device_domain_info *info, *tmp;
1731         unsigned long flags;
1732
1733         if (!iommu->domains || !iommu->domain_ids)
1734                 return;
1735
1736 again:
1737         spin_lock_irqsave(&device_domain_lock, flags);
1738         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1739                 struct dmar_domain *domain;
1740
1741                 if (info->iommu != iommu)
1742                         continue;
1743
1744                 if (!info->dev || !info->domain)
1745                         continue;
1746
1747                 domain = info->domain;
1748
1749                 __dmar_remove_one_dev_info(info);
1750
1751                 if (!domain_type_is_vm_or_si(domain)) {
1752                         /*
1753                          * The domain_exit() function  can't be called under
1754                          * device_domain_lock, as it takes this lock itself.
1755                          * So release the lock here and re-run the loop
1756                          * afterwards.
1757                          */
1758                         spin_unlock_irqrestore(&device_domain_lock, flags);
1759                         domain_exit(domain);
1760                         goto again;
1761                 }
1762         }
1763         spin_unlock_irqrestore(&device_domain_lock, flags);
1764
1765         if (iommu->gcmd & DMA_GCMD_TE)
1766                 iommu_disable_translation(iommu);
1767 }
1768
1769 static void free_dmar_iommu(struct intel_iommu *iommu)
1770 {
1771         if ((iommu->domains) && (iommu->domain_ids)) {
1772                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1773                 int i;
1774
1775                 for (i = 0; i < elems; i++)
1776                         kfree(iommu->domains[i]);
1777                 kfree(iommu->domains);
1778                 kfree(iommu->domain_ids);
1779                 iommu->domains = NULL;
1780                 iommu->domain_ids = NULL;
1781         }
1782
1783         g_iommus[iommu->seq_id] = NULL;
1784
1785         /* free context mapping */
1786         free_context_table(iommu);
1787
1788 #ifdef CONFIG_INTEL_IOMMU_SVM
1789         if (pasid_enabled(iommu)) {
1790                 if (ecap_prs(iommu->ecap))
1791                         intel_svm_finish_prq(iommu);
1792                 intel_svm_free_pasid_tables(iommu);
1793         }
1794 #endif
1795 }
1796
1797 static struct dmar_domain *alloc_domain(int flags)
1798 {
1799         struct dmar_domain *domain;
1800
1801         domain = alloc_domain_mem();
1802         if (!domain)
1803                 return NULL;
1804
1805         memset(domain, 0, sizeof(*domain));
1806         domain->nid = -1;
1807         domain->flags = flags;
1808         domain->has_iotlb_device = false;
1809         INIT_LIST_HEAD(&domain->devices);
1810
1811         return domain;
1812 }
1813
1814 /* Must be called with iommu->lock */
1815 static int domain_attach_iommu(struct dmar_domain *domain,
1816                                struct intel_iommu *iommu)
1817 {
1818         unsigned long ndomains;
1819         int num;
1820
1821         assert_spin_locked(&device_domain_lock);
1822         assert_spin_locked(&iommu->lock);
1823
1824         domain->iommu_refcnt[iommu->seq_id] += 1;
1825         domain->iommu_count += 1;
1826         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1827                 ndomains = cap_ndoms(iommu->cap);
1828                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1829
1830                 if (num >= ndomains) {
1831                         pr_err("%s: No free domain ids\n", iommu->name);
1832                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1833                         domain->iommu_count -= 1;
1834                         return -ENOSPC;
1835                 }
1836
1837                 set_bit(num, iommu->domain_ids);
1838                 set_iommu_domain(iommu, num, domain);
1839
1840                 domain->iommu_did[iommu->seq_id] = num;
1841                 domain->nid                      = iommu->node;
1842
1843                 domain_update_iommu_cap(domain);
1844         }
1845
1846         return 0;
1847 }
1848
1849 static int domain_detach_iommu(struct dmar_domain *domain,
1850                                struct intel_iommu *iommu)
1851 {
1852         int num, count = INT_MAX;
1853
1854         assert_spin_locked(&device_domain_lock);
1855         assert_spin_locked(&iommu->lock);
1856
1857         domain->iommu_refcnt[iommu->seq_id] -= 1;
1858         count = --domain->iommu_count;
1859         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1860                 num = domain->iommu_did[iommu->seq_id];
1861                 clear_bit(num, iommu->domain_ids);
1862                 set_iommu_domain(iommu, num, NULL);
1863
1864                 domain_update_iommu_cap(domain);
1865                 domain->iommu_did[iommu->seq_id] = 0;
1866         }
1867
1868         return count;
1869 }
1870
1871 static struct iova_domain reserved_iova_list;
1872 static struct lock_class_key reserved_rbtree_key;
1873
1874 static int dmar_init_reserved_ranges(void)
1875 {
1876         struct pci_dev *pdev = NULL;
1877         struct iova *iova;
1878         int i;
1879
1880         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1881
1882         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1883                 &reserved_rbtree_key);
1884
1885         /* IOAPIC ranges shouldn't be accessed by DMA */
1886         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1887                 IOVA_PFN(IOAPIC_RANGE_END));
1888         if (!iova) {
1889                 pr_err("Reserve IOAPIC range failed\n");
1890                 return -ENODEV;
1891         }
1892
1893         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1894         for_each_pci_dev(pdev) {
1895                 struct resource *r;
1896
1897                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1898                         r = &pdev->resource[i];
1899                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1900                                 continue;
1901                         iova = reserve_iova(&reserved_iova_list,
1902                                             IOVA_PFN(r->start),
1903                                             IOVA_PFN(r->end));
1904                         if (!iova) {
1905                                 pr_err("Reserve iova failed\n");
1906                                 return -ENODEV;
1907                         }
1908                 }
1909         }
1910         return 0;
1911 }
1912
1913 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1914 {
1915         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1916 }
1917
1918 static inline int guestwidth_to_adjustwidth(int gaw)
1919 {
1920         int agaw;
1921         int r = (gaw - 12) % 9;
1922
1923         if (r == 0)
1924                 agaw = gaw;
1925         else
1926                 agaw = gaw + 9 - r;
1927         if (agaw > 64)
1928                 agaw = 64;
1929         return agaw;
1930 }
1931
1932 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1933                        int guest_width)
1934 {
1935         int adjust_width, agaw;
1936         unsigned long sagaw;
1937         int err;
1938
1939         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1940
1941         err = init_iova_flush_queue(&domain->iovad,
1942                                     iommu_flush_iova, iova_entry_free);
1943         if (err)
1944                 return err;
1945
1946         domain_reserve_special_ranges(domain);
1947
1948         /* calculate AGAW */
1949         if (guest_width > cap_mgaw(iommu->cap))
1950                 guest_width = cap_mgaw(iommu->cap);
1951         domain->gaw = guest_width;
1952         adjust_width = guestwidth_to_adjustwidth(guest_width);
1953         agaw = width_to_agaw(adjust_width);
1954         sagaw = cap_sagaw(iommu->cap);
1955         if (!test_bit(agaw, &sagaw)) {
1956                 /* hardware doesn't support it, choose a bigger one */
1957                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1958                 agaw = find_next_bit(&sagaw, 5, agaw);
1959                 if (agaw >= 5)
1960                         return -ENODEV;
1961         }
1962         domain->agaw = agaw;
1963
1964         if (ecap_coherent(iommu->ecap))
1965                 domain->iommu_coherency = 1;
1966         else
1967                 domain->iommu_coherency = 0;
1968
1969         if (ecap_sc_support(iommu->ecap))
1970                 domain->iommu_snooping = 1;
1971         else
1972                 domain->iommu_snooping = 0;
1973
1974         if (intel_iommu_superpage)
1975                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1976         else
1977                 domain->iommu_superpage = 0;
1978
1979         domain->nid = iommu->node;
1980
1981         /* always allocate the top pgd */
1982         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1983         if (!domain->pgd)
1984                 return -ENOMEM;
1985         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1986         return 0;
1987 }
1988
1989 static void domain_exit(struct dmar_domain *domain)
1990 {
1991         struct page *freelist = NULL;
1992
1993         /* Domain 0 is reserved, so dont process it */
1994         if (!domain)
1995                 return;
1996
1997         /* Remove associated devices and clear attached or cached domains */
1998         rcu_read_lock();
1999         domain_remove_dev_info(domain);
2000         rcu_read_unlock();
2001
2002         /* destroy iovas */
2003         put_iova_domain(&domain->iovad);
2004
2005         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006
2007         dma_free_pagelist(freelist);
2008
2009         free_domain_mem(domain);
2010 }
2011
2012 static int domain_context_mapping_one(struct dmar_domain *domain,
2013                                       struct intel_iommu *iommu,
2014                                       u8 bus, u8 devfn)
2015 {
2016         u16 did = domain->iommu_did[iommu->seq_id];
2017         int translation = CONTEXT_TT_MULTI_LEVEL;
2018         struct device_domain_info *info = NULL;
2019         struct context_entry *context;
2020         unsigned long flags;
2021         struct dma_pte *pgd;
2022         int ret, agaw;
2023
2024         WARN_ON(did == 0);
2025
2026         if (hw_pass_through && domain_type_is_si(domain))
2027                 translation = CONTEXT_TT_PASS_THROUGH;
2028
2029         pr_debug("Set context mapping for %02x:%02x.%d\n",
2030                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2031
2032         BUG_ON(!domain->pgd);
2033
2034         spin_lock_irqsave(&device_domain_lock, flags);
2035         spin_lock(&iommu->lock);
2036
2037         ret = -ENOMEM;
2038         context = iommu_context_addr(iommu, bus, devfn, 1);
2039         if (!context)
2040                 goto out_unlock;
2041
2042         ret = 0;
2043         if (context_present(context))
2044                 goto out_unlock;
2045
2046         /*
2047          * For kdump cases, old valid entries may be cached due to the
2048          * in-flight DMA and copied pgtable, but there is no unmapping
2049          * behaviour for them, thus we need an explicit cache flush for
2050          * the newly-mapped device. For kdump, at this point, the device
2051          * is supposed to finish reset at its driver probe stage, so no
2052          * in-flight DMA will exist, and we don't need to worry anymore
2053          * hereafter.
2054          */
2055         if (context_copied(context)) {
2056                 u16 did_old = context_domain_id(context);
2057
2058                 if (did_old < cap_ndoms(iommu->cap)) {
2059                         iommu->flush.flush_context(iommu, did_old,
2060                                                    (((u16)bus) << 8) | devfn,
2061                                                    DMA_CCMD_MASK_NOBIT,
2062                                                    DMA_CCMD_DEVICE_INVL);
2063                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2064                                                  DMA_TLB_DSI_FLUSH);
2065                 }
2066         }
2067
2068         pgd = domain->pgd;
2069
2070         context_clear_entry(context);
2071         context_set_domain_id(context, did);
2072
2073         /*
2074          * Skip top levels of page tables for iommu which has less agaw
2075          * than default.  Unnecessary for PT mode.
2076          */
2077         if (translation != CONTEXT_TT_PASS_THROUGH) {
2078                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2079                         ret = -ENOMEM;
2080                         pgd = phys_to_virt(dma_pte_addr(pgd));
2081                         if (!dma_pte_present(pgd))
2082                                 goto out_unlock;
2083                 }
2084
2085                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2086                 if (info && info->ats_supported)
2087                         translation = CONTEXT_TT_DEV_IOTLB;
2088                 else
2089                         translation = CONTEXT_TT_MULTI_LEVEL;
2090
2091                 context_set_address_root(context, virt_to_phys(pgd));
2092                 context_set_address_width(context, iommu->agaw);
2093         } else {
2094                 /*
2095                  * In pass through mode, AW must be programmed to
2096                  * indicate the largest AGAW value supported by
2097                  * hardware. And ASR is ignored by hardware.
2098                  */
2099                 context_set_address_width(context, iommu->msagaw);
2100         }
2101
2102         context_set_translation_type(context, translation);
2103         context_set_fault_enable(context);
2104         context_set_present(context);
2105         domain_flush_cache(domain, context, sizeof(*context));
2106
2107         /*
2108          * It's a non-present to present mapping. If hardware doesn't cache
2109          * non-present entry we only need to flush the write-buffer. If the
2110          * _does_ cache non-present entries, then it does so in the special
2111          * domain #0, which we have to flush:
2112          */
2113         if (cap_caching_mode(iommu->cap)) {
2114                 iommu->flush.flush_context(iommu, 0,
2115                                            (((u16)bus) << 8) | devfn,
2116                                            DMA_CCMD_MASK_NOBIT,
2117                                            DMA_CCMD_DEVICE_INVL);
2118                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2119         } else {
2120                 iommu_flush_write_buffer(iommu);
2121         }
2122         iommu_enable_dev_iotlb(info);
2123
2124         ret = 0;
2125
2126 out_unlock:
2127         spin_unlock(&iommu->lock);
2128         spin_unlock_irqrestore(&device_domain_lock, flags);
2129
2130         return ret;
2131 }
2132
2133 struct domain_context_mapping_data {
2134         struct dmar_domain *domain;
2135         struct intel_iommu *iommu;
2136 };
2137
2138 static int domain_context_mapping_cb(struct pci_dev *pdev,
2139                                      u16 alias, void *opaque)
2140 {
2141         struct domain_context_mapping_data *data = opaque;
2142
2143         return domain_context_mapping_one(data->domain, data->iommu,
2144                                           PCI_BUS_NUM(alias), alias & 0xff);
2145 }
2146
2147 static int
2148 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2149 {
2150         struct intel_iommu *iommu;
2151         u8 bus, devfn;
2152         struct domain_context_mapping_data data;
2153
2154         iommu = device_to_iommu(dev, &bus, &devfn);
2155         if (!iommu)
2156                 return -ENODEV;
2157
2158         if (!dev_is_pci(dev))
2159                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2160
2161         data.domain = domain;
2162         data.iommu = iommu;
2163
2164         return pci_for_each_dma_alias(to_pci_dev(dev),
2165                                       &domain_context_mapping_cb, &data);
2166 }
2167
2168 static int domain_context_mapped_cb(struct pci_dev *pdev,
2169                                     u16 alias, void *opaque)
2170 {
2171         struct intel_iommu *iommu = opaque;
2172
2173         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2174 }
2175
2176 static int domain_context_mapped(struct device *dev)
2177 {
2178         struct intel_iommu *iommu;
2179         u8 bus, devfn;
2180
2181         iommu = device_to_iommu(dev, &bus, &devfn);
2182         if (!iommu)
2183                 return -ENODEV;
2184
2185         if (!dev_is_pci(dev))
2186                 return device_context_mapped(iommu, bus, devfn);
2187
2188         return !pci_for_each_dma_alias(to_pci_dev(dev),
2189                                        domain_context_mapped_cb, iommu);
2190 }
2191
2192 /* Returns a number of VTD pages, but aligned to MM page size */
2193 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2194                                             size_t size)
2195 {
2196         host_addr &= ~PAGE_MASK;
2197         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2198 }
2199
2200 /* Return largest possible superpage level for a given mapping */
2201 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2202                                           unsigned long iov_pfn,
2203                                           unsigned long phy_pfn,
2204                                           unsigned long pages)
2205 {
2206         int support, level = 1;
2207         unsigned long pfnmerge;
2208
2209         support = domain->iommu_superpage;
2210
2211         /* To use a large page, the virtual *and* physical addresses
2212            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2213            of them will mean we have to use smaller pages. So just
2214            merge them and check both at once. */
2215         pfnmerge = iov_pfn | phy_pfn;
2216
2217         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2218                 pages >>= VTD_STRIDE_SHIFT;
2219                 if (!pages)
2220                         break;
2221                 pfnmerge >>= VTD_STRIDE_SHIFT;
2222                 level++;
2223                 support--;
2224         }
2225         return level;
2226 }
2227
2228 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2229                             struct scatterlist *sg, unsigned long phys_pfn,
2230                             unsigned long nr_pages, int prot)
2231 {
2232         struct dma_pte *first_pte = NULL, *pte = NULL;
2233         phys_addr_t uninitialized_var(pteval);
2234         unsigned long sg_res = 0;
2235         unsigned int largepage_lvl = 0;
2236         unsigned long lvl_pages = 0;
2237
2238         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2239
2240         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2241                 return -EINVAL;
2242
2243         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244
2245         if (!sg) {
2246                 sg_res = nr_pages;
2247                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2248         }
2249
2250         while (nr_pages > 0) {
2251                 uint64_t tmp;
2252
2253                 if (!sg_res) {
2254                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2255
2256                         sg_res = aligned_nrpages(sg->offset, sg->length);
2257                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2258                         sg->dma_length = sg->length;
2259                         pteval = (sg_phys(sg) - pgoff) | prot;
2260                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261                 }
2262
2263                 if (!pte) {
2264                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2265
2266                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2267                         if (!pte)
2268                                 return -ENOMEM;
2269                         /* It is large page*/
2270                         if (largepage_lvl > 1) {
2271                                 unsigned long nr_superpages, end_pfn;
2272
2273                                 pteval |= DMA_PTE_LARGE_PAGE;
2274                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2275
2276                                 nr_superpages = sg_res / lvl_pages;
2277                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2278
2279                                 /*
2280                                  * Ensure that old small page tables are
2281                                  * removed to make room for superpage(s).
2282                                  * We're adding new large pages, so make sure
2283                                  * we don't remove their parent tables.
2284                                  */
2285                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2286                                                        largepage_lvl + 1);
2287                         } else {
2288                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289                         }
2290
2291                 }
2292                 /* We don't need lock here, nobody else
2293                  * touches the iova range
2294                  */
2295                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2296                 if (tmp) {
2297                         static int dumps = 5;
2298                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2299                                 iov_pfn, tmp, (unsigned long long)pteval);
2300                         if (dumps) {
2301                                 dumps--;
2302                                 debug_dma_dump_mappings(NULL);
2303                         }
2304                         WARN_ON(1);
2305                 }
2306
2307                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2308
2309                 BUG_ON(nr_pages < lvl_pages);
2310                 BUG_ON(sg_res < lvl_pages);
2311
2312                 nr_pages -= lvl_pages;
2313                 iov_pfn += lvl_pages;
2314                 phys_pfn += lvl_pages;
2315                 pteval += lvl_pages * VTD_PAGE_SIZE;
2316                 sg_res -= lvl_pages;
2317
2318                 /* If the next PTE would be the first in a new page, then we
2319                    need to flush the cache on the entries we've just written.
2320                    And then we'll need to recalculate 'pte', so clear it and
2321                    let it get set again in the if (!pte) block above.
2322
2323                    If we're done (!nr_pages) we need to flush the cache too.
2324
2325                    Also if we've been setting superpages, we may need to
2326                    recalculate 'pte' and switch back to smaller pages for the
2327                    end of the mapping, if the trailing size is not enough to
2328                    use another superpage (i.e. sg_res < lvl_pages). */
2329                 pte++;
2330                 if (!nr_pages || first_pte_in_page(pte) ||
2331                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2332                         domain_flush_cache(domain, first_pte,
2333                                            (void *)pte - (void *)first_pte);
2334                         pte = NULL;
2335                 }
2336
2337                 if (!sg_res && nr_pages)
2338                         sg = sg_next(sg);
2339         }
2340         return 0;
2341 }
2342
2343 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2344                          struct scatterlist *sg, unsigned long phys_pfn,
2345                          unsigned long nr_pages, int prot)
2346 {
2347        int ret;
2348        struct intel_iommu *iommu;
2349
2350        /* Do the real mapping first */
2351        ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2352        if (ret)
2353                return ret;
2354
2355        /* Notify about the new mapping */
2356        if (domain_type_is_vm(domain)) {
2357                /* VM typed domains can have more than one IOMMUs */
2358                int iommu_id;
2359                for_each_domain_iommu(iommu_id, domain) {
2360                        iommu = g_iommus[iommu_id];
2361                        __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2362                }
2363        } else {
2364                /* General domains only have one IOMMU */
2365                iommu = domain_get_iommu(domain);
2366                __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2367        }
2368
2369        return 0;
2370 }
2371
2372 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2373                                     struct scatterlist *sg, unsigned long nr_pages,
2374                                     int prot)
2375 {
2376         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2377 }
2378
2379 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2380                                      unsigned long phys_pfn, unsigned long nr_pages,
2381                                      int prot)
2382 {
2383         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2384 }
2385
2386 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2387 {
2388         unsigned long flags;
2389         struct context_entry *context;
2390         u16 did_old;
2391
2392         if (!iommu)
2393                 return;
2394
2395         spin_lock_irqsave(&iommu->lock, flags);
2396         context = iommu_context_addr(iommu, bus, devfn, 0);
2397         if (!context) {
2398                 spin_unlock_irqrestore(&iommu->lock, flags);
2399                 return;
2400         }
2401         did_old = context_domain_id(context);
2402         context_clear_entry(context);
2403         __iommu_flush_cache(iommu, context, sizeof(*context));
2404         spin_unlock_irqrestore(&iommu->lock, flags);
2405         iommu->flush.flush_context(iommu,
2406                                    did_old,
2407                                    (((u16)bus) << 8) | devfn,
2408                                    DMA_CCMD_MASK_NOBIT,
2409                                    DMA_CCMD_DEVICE_INVL);
2410         iommu->flush.flush_iotlb(iommu,
2411                                  did_old,
2412                                  0,
2413                                  0,
2414                                  DMA_TLB_DSI_FLUSH);
2415 }
2416
2417 static inline void unlink_domain_info(struct device_domain_info *info)
2418 {
2419         assert_spin_locked(&device_domain_lock);
2420         list_del(&info->link);
2421         list_del(&info->global);
2422         if (info->dev)
2423                 info->dev->archdata.iommu = NULL;
2424 }
2425
2426 static void domain_remove_dev_info(struct dmar_domain *domain)
2427 {
2428         struct device_domain_info *info, *tmp;
2429         unsigned long flags;
2430
2431         spin_lock_irqsave(&device_domain_lock, flags);
2432         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2433                 __dmar_remove_one_dev_info(info);
2434         spin_unlock_irqrestore(&device_domain_lock, flags);
2435 }
2436
2437 /*
2438  * find_domain
2439  * Note: we use struct device->archdata.iommu stores the info
2440  */
2441 static struct dmar_domain *find_domain(struct device *dev)
2442 {
2443         struct device_domain_info *info;
2444
2445         /* No lock here, assumes no domain exit in normal case */
2446         info = dev->archdata.iommu;
2447         if (likely(info))
2448                 return info->domain;
2449         return NULL;
2450 }
2451
2452 static inline struct device_domain_info *
2453 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2454 {
2455         struct device_domain_info *info;
2456
2457         list_for_each_entry(info, &device_domain_list, global)
2458                 if (info->iommu->segment == segment && info->bus == bus &&
2459                     info->devfn == devfn)
2460                         return info;
2461
2462         return NULL;
2463 }
2464
2465 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2466                                                     int bus, int devfn,
2467                                                     struct device *dev,
2468                                                     struct dmar_domain *domain)
2469 {
2470         struct dmar_domain *found = NULL;
2471         struct device_domain_info *info;
2472         unsigned long flags;
2473         int ret;
2474
2475         info = alloc_devinfo_mem();
2476         if (!info)
2477                 return NULL;
2478
2479         info->bus = bus;
2480         info->devfn = devfn;
2481         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2482         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2483         info->ats_qdep = 0;
2484         info->dev = dev;
2485         info->domain = domain;
2486         info->iommu = iommu;
2487
2488         if (dev && dev_is_pci(dev)) {
2489                 struct pci_dev *pdev = to_pci_dev(info->dev);
2490
2491                 if (!pci_ats_disabled() &&
2492                     ecap_dev_iotlb_support(iommu->ecap) &&
2493                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494                     dmar_find_matched_atsr_unit(pdev))
2495                         info->ats_supported = 1;
2496
2497                 if (ecs_enabled(iommu)) {
2498                         if (pasid_enabled(iommu)) {
2499                                 int features = pci_pasid_features(pdev);
2500                                 if (features >= 0)
2501                                         info->pasid_supported = features | 1;
2502                         }
2503
2504                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506                                 info->pri_supported = 1;
2507                 }
2508         }
2509
2510         spin_lock_irqsave(&device_domain_lock, flags);
2511         if (dev)
2512                 found = find_domain(dev);
2513
2514         if (!found) {
2515                 struct device_domain_info *info2;
2516                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2517                 if (info2) {
2518                         found      = info2->domain;
2519                         info2->dev = dev;
2520                 }
2521         }
2522
2523         if (found) {
2524                 spin_unlock_irqrestore(&device_domain_lock, flags);
2525                 free_devinfo_mem(info);
2526                 /* Caller must free the original domain */
2527                 return found;
2528         }
2529
2530         spin_lock(&iommu->lock);
2531         ret = domain_attach_iommu(domain, iommu);
2532         spin_unlock(&iommu->lock);
2533
2534         if (ret) {
2535                 spin_unlock_irqrestore(&device_domain_lock, flags);
2536                 free_devinfo_mem(info);
2537                 return NULL;
2538         }
2539
2540         list_add(&info->link, &domain->devices);
2541         list_add(&info->global, &device_domain_list);
2542         if (dev)
2543                 dev->archdata.iommu = info;
2544         spin_unlock_irqrestore(&device_domain_lock, flags);
2545
2546         if (dev && domain_context_mapping(domain, dev)) {
2547                 pr_err("Domain context map for %s failed\n", dev_name(dev));
2548                 dmar_remove_one_dev_info(domain, dev);
2549                 return NULL;
2550         }
2551
2552         return domain;
2553 }
2554
2555 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2556 {
2557         *(u16 *)opaque = alias;
2558         return 0;
2559 }
2560
2561 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2562 {
2563         struct device_domain_info *info = NULL;
2564         struct dmar_domain *domain = NULL;
2565         struct intel_iommu *iommu;
2566         u16 dma_alias;
2567         unsigned long flags;
2568         u8 bus, devfn;
2569
2570         iommu = device_to_iommu(dev, &bus, &devfn);
2571         if (!iommu)
2572                 return NULL;
2573
2574         if (dev_is_pci(dev)) {
2575                 struct pci_dev *pdev = to_pci_dev(dev);
2576
2577                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2578
2579                 spin_lock_irqsave(&device_domain_lock, flags);
2580                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2581                                                       PCI_BUS_NUM(dma_alias),
2582                                                       dma_alias & 0xff);
2583                 if (info) {
2584                         iommu = info->iommu;
2585                         domain = info->domain;
2586                 }
2587                 spin_unlock_irqrestore(&device_domain_lock, flags);
2588
2589                 /* DMA alias already has a domain, use it */
2590                 if (info)
2591                         goto out;
2592         }
2593
2594         /* Allocate and initialize new domain for the device */
2595         domain = alloc_domain(0);
2596         if (!domain)
2597                 return NULL;
2598         if (domain_init(domain, iommu, gaw)) {
2599                 domain_exit(domain);
2600                 return NULL;
2601         }
2602
2603 out:
2604
2605         return domain;
2606 }
2607
2608 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2609                                               struct dmar_domain *domain)
2610 {
2611         struct intel_iommu *iommu;
2612         struct dmar_domain *tmp;
2613         u16 req_id, dma_alias;
2614         u8 bus, devfn;
2615
2616         iommu = device_to_iommu(dev, &bus, &devfn);
2617         if (!iommu)
2618                 return NULL;
2619
2620         req_id = ((u16)bus << 8) | devfn;
2621
2622         if (dev_is_pci(dev)) {
2623                 struct pci_dev *pdev = to_pci_dev(dev);
2624
2625                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2626
2627                 /* register PCI DMA alias device */
2628                 if (req_id != dma_alias) {
2629                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2630                                         dma_alias & 0xff, NULL, domain);
2631
2632                         if (!tmp || tmp != domain)
2633                                 return tmp;
2634                 }
2635         }
2636
2637         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2638         if (!tmp || tmp != domain)
2639                 return tmp;
2640
2641         return domain;
2642 }
2643
2644 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2645 {
2646         struct dmar_domain *domain, *tmp;
2647
2648         domain = find_domain(dev);
2649         if (domain)
2650                 goto out;
2651
2652         domain = find_or_alloc_domain(dev, gaw);
2653         if (!domain)
2654                 goto out;
2655
2656         tmp = set_domain_for_dev(dev, domain);
2657         if (!tmp || domain != tmp) {
2658                 domain_exit(domain);
2659                 domain = tmp;
2660         }
2661
2662 out:
2663
2664         return domain;
2665 }
2666
2667 static int iommu_domain_identity_map(struct dmar_domain *domain,
2668                                      unsigned long long start,
2669                                      unsigned long long end)
2670 {
2671         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2672         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2673
2674         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2675                           dma_to_mm_pfn(last_vpfn))) {
2676                 pr_err("Reserving iova failed\n");
2677                 return -ENOMEM;
2678         }
2679
2680         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2681         /*
2682          * RMRR range might have overlap with physical memory range,
2683          * clear it first
2684          */
2685         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2686
2687         return __domain_mapping(domain, first_vpfn, NULL,
2688                                 first_vpfn, last_vpfn - first_vpfn + 1,
2689                                 DMA_PTE_READ|DMA_PTE_WRITE);
2690 }
2691
2692 static int domain_prepare_identity_map(struct device *dev,
2693                                        struct dmar_domain *domain,
2694                                        unsigned long long start,
2695                                        unsigned long long end)
2696 {
2697         /* For _hardware_ passthrough, don't bother. But for software
2698            passthrough, we do it anyway -- it may indicate a memory
2699            range which is reserved in E820, so which didn't get set
2700            up to start with in si_domain */
2701         if (domain == si_domain && hw_pass_through) {
2702                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2703                         dev_name(dev), start, end);
2704                 return 0;
2705         }
2706
2707         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2708                 dev_name(dev), start, end);
2709
2710         if (end < start) {
2711                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2712                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2713                         dmi_get_system_info(DMI_BIOS_VENDOR),
2714                         dmi_get_system_info(DMI_BIOS_VERSION),
2715                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2716                 return -EIO;
2717         }
2718
2719         if (end >> agaw_to_width(domain->agaw)) {
2720                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2721                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2722                      agaw_to_width(domain->agaw),
2723                      dmi_get_system_info(DMI_BIOS_VENDOR),
2724                      dmi_get_system_info(DMI_BIOS_VERSION),
2725                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2726                 return -EIO;
2727         }
2728
2729         return iommu_domain_identity_map(domain, start, end);
2730 }
2731
2732 static int iommu_prepare_identity_map(struct device *dev,
2733                                       unsigned long long start,
2734                                       unsigned long long end)
2735 {
2736         struct dmar_domain *domain;
2737         int ret;
2738
2739         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2740         if (!domain)
2741                 return -ENOMEM;
2742
2743         ret = domain_prepare_identity_map(dev, domain, start, end);
2744         if (ret)
2745                 domain_exit(domain);
2746
2747         return ret;
2748 }
2749
2750 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2751                                          struct device *dev)
2752 {
2753         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2754                 return 0;
2755         return iommu_prepare_identity_map(dev, rmrr->base_address,
2756                                           rmrr->end_address);
2757 }
2758
2759 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2760 static inline void iommu_prepare_isa(void)
2761 {
2762         struct pci_dev *pdev;
2763         int ret;
2764
2765         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2766         if (!pdev)
2767                 return;
2768
2769         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2770         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2771
2772         if (ret)
2773                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2774
2775         pci_dev_put(pdev);
2776 }
2777 #else
2778 static inline void iommu_prepare_isa(void)
2779 {
2780         return;
2781 }
2782 #endif /* !CONFIG_INTEL_IOMMU_FLPY_WA */
2783
2784 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2785
2786 static int __init si_domain_init(int hw)
2787 {
2788         int nid, ret = 0;
2789
2790         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2791         if (!si_domain)
2792                 return -EFAULT;
2793
2794         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2795                 domain_exit(si_domain);
2796                 return -EFAULT;
2797         }
2798
2799         pr_debug("Identity mapping domain allocated\n");
2800
2801         if (hw)
2802                 return 0;
2803
2804         for_each_online_node(nid) {
2805                 unsigned long start_pfn, end_pfn;
2806                 int i;
2807
2808                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2809                         ret = iommu_domain_identity_map(si_domain,
2810                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2811                         if (ret)
2812                                 return ret;
2813                 }
2814         }
2815
2816         return 0;
2817 }
2818
2819 static int identity_mapping(struct device *dev)
2820 {
2821         struct device_domain_info *info;
2822
2823         if (likely(!iommu_identity_mapping))
2824                 return 0;
2825
2826         info = dev->archdata.iommu;
2827         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2828                 return (info->domain == si_domain);
2829
2830         return 0;
2831 }
2832
2833 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2834 {
2835         struct dmar_domain *ndomain;
2836         struct intel_iommu *iommu;
2837         u8 bus, devfn;
2838
2839         iommu = device_to_iommu(dev, &bus, &devfn);
2840         if (!iommu)
2841                 return -ENODEV;
2842
2843         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2844         if (ndomain != domain)
2845                 return -EBUSY;
2846
2847         return 0;
2848 }
2849
2850 static bool device_has_rmrr(struct device *dev)
2851 {
2852         struct dmar_rmrr_unit *rmrr;
2853         struct device *tmp;
2854         int i;
2855
2856         rcu_read_lock();
2857         for_each_rmrr_units(rmrr) {
2858                 /*
2859                  * Return TRUE if this RMRR contains the device that
2860                  * is passed in.
2861                  */
2862                 for_each_active_dev_scope(rmrr->devices,
2863                                           rmrr->devices_cnt, i, tmp)
2864                         if (tmp == dev) {
2865                                 rcu_read_unlock();
2866                                 return true;
2867                         }
2868         }
2869         rcu_read_unlock();
2870         return false;
2871 }
2872
2873 /*
2874  * There are a couple cases where we need to restrict the functionality of
2875  * devices associated with RMRRs.  The first is when evaluating a device for
2876  * identity mapping because problems exist when devices are moved in and out
2877  * of domains and their respective RMRR information is lost.  This means that
2878  * a device with associated RMRRs will never be in a "passthrough" domain.
2879  * The second is use of the device through the IOMMU API.  This interface
2880  * expects to have full control of the IOVA space for the device.  We cannot
2881  * satisfy both the requirement that RMRR access is maintained and have an
2882  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2883  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2884  * We therefore prevent devices associated with an RMRR from participating in
2885  * the IOMMU API, which eliminates them from device assignment.
2886  *
2887  * In both cases we assume that PCI USB devices with RMRRs have them largely
2888  * for historical reasons and that the RMRR space is not actively used post
2889  * boot.  This exclusion may change if vendors begin to abuse it.
2890  *
2891  * The same exception is made for graphics devices, with the requirement that
2892  * any use of the RMRR regions will be torn down before assigning the device
2893  * to a guest.
2894  */
2895 static bool device_is_rmrr_locked(struct device *dev)
2896 {
2897         if (!device_has_rmrr(dev))
2898                 return false;
2899
2900         if (dev_is_pci(dev)) {
2901                 struct pci_dev *pdev = to_pci_dev(dev);
2902
2903                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2904                         return false;
2905         }
2906
2907         return true;
2908 }
2909
2910 static int iommu_should_identity_map(struct device *dev, int startup)
2911 {
2912
2913         if (dev_is_pci(dev)) {
2914                 struct pci_dev *pdev = to_pci_dev(dev);
2915
2916                 if (device_is_rmrr_locked(dev))
2917                         return 0;
2918
2919                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2920                         return 1;
2921
2922                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2923                         return 1;
2924
2925                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2926                         return 0;
2927
2928                 /*
2929                  * We want to start off with all devices in the 1:1 domain, and
2930                  * take them out later if we find they can't access all of memory.
2931                  *
2932                  * However, we can't do this for PCI devices behind bridges,
2933                  * because all PCI devices behind the same bridge will end up
2934                  * with the same source-id on their transactions.
2935                  *
2936                  * Practically speaking, we can't change things around for these
2937                  * devices at run-time, because we can't be sure there'll be no
2938                  * DMA transactions in flight for any of their siblings.
2939                  *
2940                  * So PCI devices (unless they're on the root bus) as well as
2941                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2942                  * the 1:1 domain, just in _case_ one of their siblings turns out
2943                  * not to be able to map all of memory.
2944                  */
2945                 if (!pci_is_pcie(pdev)) {
2946                         if (!pci_is_root_bus(pdev->bus))
2947                                 return 0;
2948                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2949                                 return 0;
2950                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2951                         return 0;
2952         } else {
2953                 if (device_has_rmrr(dev))
2954                         return 0;
2955         }
2956
2957         /*
2958          * At boot time, we don't yet know if devices will be 64-bit capable.
2959          * Assume that they will — if they turn out not to be, then we can
2960          * take them out of the 1:1 domain later.
2961          */
2962         if (!startup) {
2963                 /*
2964                  * If the device's dma_mask is less than the system's memory
2965                  * size then this is not a candidate for identity mapping.
2966                  */
2967                 u64 dma_mask = *dev->dma_mask;
2968
2969                 if (dev->coherent_dma_mask &&
2970                     dev->coherent_dma_mask < dma_mask)
2971                         dma_mask = dev->coherent_dma_mask;
2972
2973                 return dma_mask >= dma_get_required_mask(dev);
2974         }
2975
2976         return 1;
2977 }
2978
2979 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2980 {
2981         int ret;
2982
2983         if (!iommu_should_identity_map(dev, 1))
2984                 return 0;
2985
2986         ret = domain_add_dev_info(si_domain, dev);
2987         if (!ret)
2988                 pr_info("%s identity mapping for device %s\n",
2989                         hw ? "Hardware" : "Software", dev_name(dev));
2990         else if (ret == -ENODEV)
2991                 /* device not associated with an iommu */
2992                 ret = 0;
2993
2994         return ret;
2995 }
2996
2997
2998 static int __init iommu_prepare_static_identity_mapping(int hw)
2999 {
3000         struct pci_dev *pdev = NULL;
3001         struct dmar_drhd_unit *drhd;
3002         struct intel_iommu *iommu;
3003         struct device *dev;
3004         int i;
3005         int ret = 0;
3006
3007         for_each_pci_dev(pdev) {
3008                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3009                 if (ret)
3010                         return ret;
3011         }
3012
3013         for_each_active_iommu(iommu, drhd)
3014                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3015                         struct acpi_device_physical_node *pn;
3016                         struct acpi_device *adev;
3017
3018                         if (dev->bus != &acpi_bus_type)
3019                                 continue;
3020
3021                         adev= to_acpi_device(dev);
3022                         mutex_lock(&adev->physical_node_lock);
3023                         list_for_each_entry(pn, &adev->physical_node_list, node) {
3024                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3025                                 if (ret)
3026                                         break;
3027                         }
3028                         mutex_unlock(&adev->physical_node_lock);
3029                         if (ret)
3030                                 return ret;
3031                 }
3032
3033         return 0;
3034 }
3035
3036 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3037 {
3038         /*
3039          * Start from the sane iommu hardware state.
3040          * If the queued invalidation is already initialized by us
3041          * (for example, while enabling interrupt-remapping) then
3042          * we got the things already rolling from a sane state.
3043          */
3044         if (!iommu->qi) {
3045                 /*
3046                  * Clear any previous faults.
3047                  */
3048                 dmar_fault(-1, iommu);
3049                 /*
3050                  * Disable queued invalidation if supported and already enabled
3051                  * before OS handover.
3052                  */
3053                 dmar_disable_qi(iommu);
3054         }
3055
3056         if (dmar_enable_qi(iommu)) {
3057                 /*
3058                  * Queued Invalidate not enabled, use Register Based Invalidate
3059                  */
3060                 iommu->flush.flush_context = __iommu_flush_context;
3061                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3062                 pr_info("%s: Using Register based invalidation\n",
3063                         iommu->name);
3064         } else {
3065                 iommu->flush.flush_context = qi_flush_context;
3066                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3067                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3068         }
3069 }
3070
3071 static int copy_context_table(struct intel_iommu *iommu,
3072                               struct root_entry *old_re,
3073                               struct context_entry **tbl,
3074                               int bus, bool ext)
3075 {
3076         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3077         struct context_entry *new_ce = NULL, ce;
3078         struct context_entry *old_ce = NULL;
3079         struct root_entry re;
3080         phys_addr_t old_ce_phys;
3081
3082         tbl_idx = ext ? bus * 2 : bus;
3083         memcpy(&re, old_re, sizeof(re));
3084
3085         for (devfn = 0; devfn < 256; devfn++) {
3086                 /* First calculate the correct index */
3087                 idx = (ext ? devfn * 2 : devfn) % 256;
3088
3089                 if (idx == 0) {
3090                         /* First save what we may have and clean up */
3091                         if (new_ce) {
3092                                 tbl[tbl_idx] = new_ce;
3093                                 __iommu_flush_cache(iommu, new_ce,
3094                                                     VTD_PAGE_SIZE);
3095                                 pos = 1;
3096                         }
3097
3098                         if (old_ce)
3099                                 iounmap(old_ce);
3100
3101                         ret = 0;
3102                         if (devfn < 0x80)
3103                                 old_ce_phys = root_entry_lctp(&re);
3104                         else
3105                                 old_ce_phys = root_entry_uctp(&re);
3106
3107                         if (!old_ce_phys) {
3108                                 if (ext && devfn == 0) {
3109                                         /* No LCTP, try UCTP */
3110                                         devfn = 0x7f;
3111                                         continue;
3112                                 } else {
3113                                         goto out;
3114                                 }
3115                         }
3116
3117                         ret = -ENOMEM;
3118                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3119                                         MEMREMAP_WB);
3120                         if (!old_ce)
3121                                 goto out;
3122
3123                         new_ce = alloc_pgtable_page(iommu->node);
3124                         if (!new_ce)
3125                                 goto out_unmap;
3126
3127                         ret = 0;
3128                 }
3129
3130                 /* Now copy the context entry */
3131                 memcpy(&ce, old_ce + idx, sizeof(ce));
3132
3133                 if (!__context_present(&ce))
3134                         continue;
3135
3136                 did = context_domain_id(&ce);
3137                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3138                         set_bit(did, iommu->domain_ids);
3139
3140                 /*
3141                  * We need a marker for copied context entries. This
3142                  * marker needs to work for the old format as well as
3143                  * for extended context entries.
3144                  *
3145                  * Bit 67 of the context entry is used. In the old
3146                  * format this bit is available to software, in the
3147                  * extended format it is the PGE bit, but PGE is ignored
3148                  * by HW if PASIDs are disabled (and thus still
3149                  * available).
3150                  *
3151                  * So disable PASIDs first and then mark the entry
3152                  * copied. This means that we don't copy PASID
3153                  * translations from the old kernel, but this is fine as
3154                  * faults there are not fatal.
3155                  */
3156                 context_clear_pasid_enable(&ce);
3157                 context_set_copied(&ce);
3158
3159                 new_ce[idx] = ce;
3160         }
3161
3162         tbl[tbl_idx + pos] = new_ce;
3163
3164         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3165
3166 out_unmap:
3167         memunmap(old_ce);
3168
3169 out:
3170         return ret;
3171 }
3172
3173 static int copy_translation_tables(struct intel_iommu *iommu)
3174 {
3175         struct context_entry **ctxt_tbls;
3176         struct root_entry *old_rt;
3177         phys_addr_t old_rt_phys;
3178         int ctxt_table_entries;
3179         unsigned long flags;
3180         u64 rtaddr_reg;
3181         int bus, ret;
3182         bool new_ext, ext;
3183
3184         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3185         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3186         new_ext    = !!ecap_ecs(iommu->ecap);
3187
3188         /*
3189          * The RTT bit can only be changed when translation is disabled,
3190          * but disabling translation means to open a window for data
3191          * corruption. So bail out and don't copy anything if we would
3192          * have to change the bit.
3193          */
3194         if (new_ext != ext)
3195                 return -EINVAL;
3196
3197         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3198         if (!old_rt_phys)
3199                 return -EINVAL;
3200
3201         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3202         if (!old_rt)
3203                 return -ENOMEM;
3204
3205         /* This is too big for the stack - allocate it from slab */
3206         ctxt_table_entries = ext ? 512 : 256;
3207         ret = -ENOMEM;
3208         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3209         if (!ctxt_tbls)
3210                 goto out_unmap;
3211
3212         for (bus = 0; bus < 256; bus++) {
3213                 ret = copy_context_table(iommu, &old_rt[bus],
3214                                          ctxt_tbls, bus, ext);
3215                 if (ret) {
3216                         pr_err("%s: Failed to copy context table for bus %d\n",
3217                                 iommu->name, bus);
3218                         continue;
3219                 }
3220         }
3221
3222         spin_lock_irqsave(&iommu->lock, flags);
3223
3224         /* Context tables are copied, now write them to the root_entry table */
3225         for (bus = 0; bus < 256; bus++) {
3226                 int idx = ext ? bus * 2 : bus;
3227                 u64 val;
3228
3229                 if (ctxt_tbls[idx]) {
3230                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3231                         iommu->root_entry[bus].lo = val;
3232                 }
3233
3234                 if (!ext || !ctxt_tbls[idx + 1])
3235                         continue;
3236
3237                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3238                 iommu->root_entry[bus].hi = val;
3239         }
3240
3241         spin_unlock_irqrestore(&iommu->lock, flags);
3242
3243         kfree(ctxt_tbls);
3244
3245         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3246
3247         ret = 0;
3248
3249 out_unmap:
3250         memunmap(old_rt);
3251
3252         return ret;
3253 }
3254
3255 static int __init init_dmars(void)
3256 {
3257         struct dmar_drhd_unit *drhd;
3258         struct dmar_rmrr_unit *rmrr;
3259         bool copied_tables = false;
3260         struct device *dev;
3261         struct intel_iommu *iommu;
3262         int i, ret;
3263
3264         /*
3265          * for each drhd
3266          *    allocate root
3267          *    initialize and program root entry to not present
3268          * endfor
3269          */
3270         for_each_drhd_unit(drhd) {
3271                 /*
3272                  * lock not needed as this is only incremented in the single
3273                  * threaded kernel __init code path all other access are read
3274                  * only
3275                  */
3276                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3277                         g_num_of_iommus++;
3278                         continue;
3279                 }
3280                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3281         }
3282
3283         /* Preallocate enough resources for IOMMU hot-addition */
3284         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3285                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3286
3287         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3288                         GFP_KERNEL);
3289         if (!g_iommus) {
3290                 pr_err("Allocating global iommu array failed\n");
3291                 ret = -ENOMEM;
3292                 goto error;
3293         }
3294
3295         for_each_active_iommu(iommu, drhd) {
3296                 g_iommus[iommu->seq_id] = iommu;
3297
3298                 intel_iommu_init_qi(iommu);
3299
3300                 ret = iommu_init_domains(iommu);
3301                 if (ret)
3302                         goto free_iommu;
3303
3304                 init_translation_status(iommu);
3305
3306                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3307                         iommu_disable_translation(iommu);
3308                         clear_translation_pre_enabled(iommu);
3309                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3310                                 iommu->name);
3311                 }
3312
3313                 /*
3314                  * TBD:
3315                  * we could share the same root & context tables
3316                  * among all IOMMU's. Need to Split it later.
3317                  */
3318                 ret = iommu_alloc_root_entry(iommu);
3319                 if (ret)
3320                         goto free_iommu;
3321
3322                 if (translation_pre_enabled(iommu)) {
3323                         pr_info("Translation already enabled - trying to copy translation structures\n");
3324
3325                         ret = copy_translation_tables(iommu);
3326                         if (ret) {
3327                                 /*
3328                                  * We found the IOMMU with translation
3329                                  * enabled - but failed to copy over the
3330                                  * old root-entry table. Try to proceed
3331                                  * by disabling translation now and
3332                                  * allocating a clean root-entry table.
3333                                  * This might cause DMAR faults, but
3334                                  * probably the dump will still succeed.
3335                                  */
3336                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3337                                        iommu->name);
3338                                 iommu_disable_translation(iommu);
3339                                 clear_translation_pre_enabled(iommu);
3340                         } else {
3341                                 pr_info("Copied translation tables from previous kernel for %s\n",
3342                                         iommu->name);
3343                                 copied_tables = true;
3344                         }
3345                 }
3346
3347                 if (!ecap_pass_through(iommu->ecap))
3348                         hw_pass_through = 0;
3349 #ifdef CONFIG_INTEL_IOMMU_SVM
3350                 if (pasid_enabled(iommu))
3351                         intel_svm_alloc_pasid_tables(iommu);
3352 #endif
3353         }
3354
3355         /*
3356          * Now that qi is enabled on all iommus, set the root entry and flush
3357          * caches. This is required on some Intel X58 chipsets, otherwise the
3358          * flush_context function will loop forever and the boot hangs.
3359          */
3360         for_each_active_iommu(iommu, drhd) {
3361                 iommu_flush_write_buffer(iommu);
3362                 iommu_set_root_entry(iommu);
3363                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3364                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3365         }
3366
3367         if (iommu_pass_through)
3368                 iommu_identity_mapping |= IDENTMAP_ALL;
3369
3370 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3371         iommu_identity_mapping |= IDENTMAP_GFX;
3372 #endif
3373
3374         check_tylersburg_isoch();
3375
3376         if (iommu_identity_mapping) {
3377                 ret = si_domain_init(hw_pass_through);
3378                 if (ret)
3379                         goto free_iommu;
3380         }
3381
3382
3383         /*
3384          * If we copied translations from a previous kernel in the kdump
3385          * case, we can not assign the devices to domains now, as that
3386          * would eliminate the old mappings. So skip this part and defer
3387          * the assignment to device driver initialization time.
3388          */
3389         if (copied_tables)
3390                 goto domains_done;
3391
3392         /*
3393          * If pass through is not set or not enabled, setup context entries for
3394          * identity mappings for rmrr, gfx, and isa and may fall back to static
3395          * identity mapping if iommu_identity_mapping is set.
3396          */
3397         if (iommu_identity_mapping) {
3398                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3399                 if (ret) {
3400                         pr_crit("Failed to setup IOMMU pass-through\n");
3401                         goto free_iommu;
3402                 }
3403         }
3404         /*
3405          * For each rmrr
3406          *   for each dev attached to rmrr
3407          *   do
3408          *     locate drhd for dev, alloc domain for dev
3409          *     allocate free domain
3410          *     allocate page table entries for rmrr
3411          *     if context not allocated for bus
3412          *           allocate and init context
3413          *           set present in root table for this bus
3414          *     init context with domain, translation etc
3415          *    endfor
3416          * endfor
3417          */
3418         pr_info("Setting RMRR:\n");
3419         for_each_rmrr_units(rmrr) {
3420                 /* some BIOS lists non-exist devices in DMAR table. */
3421                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3422                                           i, dev) {
3423                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
3424                         if (ret)
3425                                 pr_err("Mapping reserved region failed\n");
3426                 }
3427         }
3428
3429         iommu_prepare_isa();
3430
3431 domains_done:
3432
3433         /*
3434          * for each drhd
3435          *   enable fault log
3436          *   global invalidate context cache
3437          *   global invalidate iotlb
3438          *   enable translation
3439          */
3440         for_each_iommu(iommu, drhd) {
3441                 if (drhd->ignored) {
3442                         /*
3443                          * we always have to disable PMRs or DMA may fail on
3444                          * this device
3445                          */
3446                         if (force_on)
3447                                 iommu_disable_protect_mem_regions(iommu);
3448                         continue;
3449                 }
3450
3451                 iommu_flush_write_buffer(iommu);
3452
3453 #ifdef CONFIG_INTEL_IOMMU_SVM
3454                 if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
3455                         ret = intel_svm_enable_prq(iommu);
3456                         if (ret)
3457                                 goto free_iommu;
3458                 }
3459 #endif
3460                 ret = dmar_set_interrupt(iommu);
3461                 if (ret)
3462                         goto free_iommu;
3463
3464                 if (!translation_pre_enabled(iommu))
3465                         iommu_enable_translation(iommu);
3466
3467                 iommu_disable_protect_mem_regions(iommu);
3468         }
3469
3470         return 0;
3471
3472 free_iommu:
3473         for_each_active_iommu(iommu, drhd) {
3474                 disable_dmar_iommu(iommu);
3475                 free_dmar_iommu(iommu);
3476         }
3477
3478         kfree(g_iommus);
3479
3480 error:
3481         return ret;
3482 }
3483
3484 /* This takes a number of _MM_ pages, not VTD pages */
3485 static unsigned long intel_alloc_iova(struct device *dev,
3486                                      struct dmar_domain *domain,
3487                                      unsigned long nrpages, uint64_t dma_mask)
3488 {
3489         unsigned long iova_pfn = 0;
3490
3491         /* Restrict dma_mask to the width that the iommu can handle */
3492         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3493         /* Ensure we reserve the whole size-aligned region */
3494         nrpages = __roundup_pow_of_two(nrpages);
3495
3496         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3497                 /*
3498                  * First try to allocate an io virtual address in
3499                  * DMA_BIT_MASK(32) and if that fails then try allocating
3500                  * from higher range
3501                  */
3502                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3503                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3504                 if (iova_pfn)
3505                         return iova_pfn;
3506         }
3507         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3508                                    IOVA_PFN(dma_mask), true);
3509         if (unlikely(!iova_pfn)) {
3510                 pr_err("Allocating %ld-page iova for %s failed",
3511                        nrpages, dev_name(dev));
3512                 return 0;
3513         }
3514
3515         return iova_pfn;
3516 }
3517
3518 static struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3519 {
3520         struct dmar_domain *domain, *tmp;
3521         struct dmar_rmrr_unit *rmrr;
3522         struct device *i_dev;
3523         int i, ret;
3524
3525         domain = find_domain(dev);
3526         if (domain)
3527                 goto out;
3528
3529         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3530         if (!domain)
3531                 goto out;
3532
3533         /* We have a new domain - setup possible RMRRs for the device */
3534         rcu_read_lock();
3535         for_each_rmrr_units(rmrr) {
3536                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3537                                           i, i_dev) {
3538                         if (i_dev != dev)
3539                                 continue;
3540
3541                         ret = domain_prepare_identity_map(dev, domain,
3542                                                           rmrr->base_address,
3543                                                           rmrr->end_address);
3544                         if (ret)
3545                                 dev_err(dev, "Mapping reserved region failed\n");
3546                 }
3547         }
3548         rcu_read_unlock();
3549
3550         tmp = set_domain_for_dev(dev, domain);
3551         if (!tmp || domain != tmp) {
3552                 domain_exit(domain);
3553                 domain = tmp;
3554         }
3555
3556 out:
3557
3558         if (!domain)
3559                 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3560
3561
3562         return domain;
3563 }
3564
3565 /* Check if the dev needs to go through non-identity map and unmap process.*/
3566 static int iommu_no_mapping(struct device *dev)
3567 {
3568         int found;
3569
3570         if (iommu_dummy(dev))
3571                 return 1;
3572
3573         if (!iommu_identity_mapping)
3574                 return 0;
3575
3576         found = identity_mapping(dev);
3577         if (found) {
3578                 if (iommu_should_identity_map(dev, 0))
3579                         return 1;
3580                 else {
3581                         /*
3582                          * 32 bit DMA is removed from si_domain and fall back
3583                          * to non-identity mapping.
3584                          */
3585                         dmar_remove_one_dev_info(si_domain, dev);
3586                         pr_info("32bit %s uses non-identity mapping\n",
3587                                 dev_name(dev));
3588                         return 0;
3589                 }
3590         } else {
3591                 /*
3592                  * In case of a detached 64 bit DMA device from vm, the device
3593                  * is put into si_domain for identity mapping.
3594                  */
3595                 if (iommu_should_identity_map(dev, 0)) {
3596                         int ret;
3597                         ret = domain_add_dev_info(si_domain, dev);
3598                         if (!ret) {
3599                                 pr_info("64bit %s uses identity mapping\n",
3600                                         dev_name(dev));
3601                                 return 1;
3602                         }
3603                 }
3604         }
3605
3606         return 0;
3607 }
3608
3609 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3610                                      size_t size, int dir, u64 dma_mask)
3611 {
3612         struct dmar_domain *domain;
3613         phys_addr_t start_paddr;
3614         unsigned long iova_pfn;
3615         int prot = 0;
3616         int ret;
3617         struct intel_iommu *iommu;
3618         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3619
3620         BUG_ON(dir == DMA_NONE);
3621
3622         if (iommu_no_mapping(dev))
3623                 return paddr;
3624
3625         domain = get_valid_domain_for_dev(dev);
3626         if (!domain)
3627                 return 0;
3628
3629         iommu = domain_get_iommu(domain);
3630         size = aligned_nrpages(paddr, size);
3631
3632         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3633         if (!iova_pfn)
3634                 goto error;
3635
3636         /*
3637          * Check if DMAR supports zero-length reads on write only
3638          * mappings..
3639          */
3640         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3641                         !cap_zlr(iommu->cap))
3642                 prot |= DMA_PTE_READ;
3643         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3644                 prot |= DMA_PTE_WRITE;
3645         /*
3646          * paddr - (paddr + size) might be partial page, we should map the whole
3647          * page.  Note: if two part of one page are separately mapped, we
3648          * might have two guest_addr mapping to the same host paddr, but this
3649          * is not a big problem
3650          */
3651         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3652                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3653         if (ret)
3654                 goto error;
3655
3656         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3657         start_paddr += paddr & ~PAGE_MASK;
3658         return start_paddr;
3659
3660 error:
3661         if (iova_pfn)
3662                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3663         pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3664                 dev_name(dev), size, (unsigned long long)paddr, dir);
3665         return 0;
3666 }
3667
3668 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3669                                  unsigned long offset, size_t size,
3670                                  enum dma_data_direction dir,
3671                                  unsigned long attrs)
3672 {
3673         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3674                                   dir, *dev->dma_mask);
3675 }
3676
3677 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3678 {
3679         struct dmar_domain *domain;
3680         unsigned long start_pfn, last_pfn;
3681         unsigned long nrpages;
3682         unsigned long iova_pfn;
3683         struct intel_iommu *iommu;
3684         struct page *freelist;
3685
3686         if (iommu_no_mapping(dev))
3687                 return;
3688
3689         domain = find_domain(dev);
3690         BUG_ON(!domain);
3691
3692         iommu = domain_get_iommu(domain);
3693
3694         iova_pfn = IOVA_PFN(dev_addr);
3695
3696         nrpages = aligned_nrpages(dev_addr, size);
3697         start_pfn = mm_to_dma_pfn(iova_pfn);
3698         last_pfn = start_pfn + nrpages - 1;
3699
3700         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3701                  dev_name(dev), start_pfn, last_pfn);
3702
3703         freelist = domain_unmap(domain, start_pfn, last_pfn);
3704
3705         if (intel_iommu_strict) {
3706                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3707                                       nrpages, !freelist, 0);
3708                 /* free iova */
3709                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3710                 dma_free_pagelist(freelist);
3711         } else {
3712                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3713                            (unsigned long)freelist);
3714                 /*
3715                  * queue up the release of the unmap to save the 1/6th of the
3716                  * cpu used up by the iotlb flush operation...
3717                  */
3718         }
3719 }
3720
3721 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3722                              size_t size, enum dma_data_direction dir,
3723                              unsigned long attrs)
3724 {
3725         intel_unmap(dev, dev_addr, size);
3726 }
3727
3728 static void *intel_alloc_coherent(struct device *dev, size_t size,
3729                                   dma_addr_t *dma_handle, gfp_t flags,
3730                                   unsigned long attrs)
3731 {
3732         void *vaddr;
3733
3734         vaddr = dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3735         if (iommu_no_mapping(dev) || !vaddr)
3736                 return vaddr;
3737
3738         *dma_handle = __intel_map_single(dev, virt_to_phys(vaddr),
3739                         PAGE_ALIGN(size), DMA_BIDIRECTIONAL,
3740                         dev->coherent_dma_mask);
3741         if (!*dma_handle)
3742                 goto out_free_pages;
3743         return vaddr;
3744
3745 out_free_pages:
3746         dma_direct_free(dev, size, vaddr, *dma_handle, attrs);
3747         return NULL;
3748 }
3749
3750 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3751                                 dma_addr_t dma_handle, unsigned long attrs)
3752 {
3753         if (!iommu_no_mapping(dev))
3754                 intel_unmap(dev, dma_handle, PAGE_ALIGN(size));
3755         dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3756 }
3757
3758 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3759                            int nelems, enum dma_data_direction dir,
3760                            unsigned long attrs)
3761 {
3762         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3763         unsigned long nrpages = 0;
3764         struct scatterlist *sg;
3765         int i;
3766
3767         for_each_sg(sglist, sg, nelems, i) {
3768                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3769         }
3770
3771         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3772 }
3773
3774 static int intel_nontranslate_map_sg(struct device *hddev,
3775         struct scatterlist *sglist, int nelems, int dir)
3776 {
3777         int i;
3778         struct scatterlist *sg;
3779
3780         for_each_sg(sglist, sg, nelems, i) {
3781                 BUG_ON(!sg_page(sg));
3782                 sg->dma_address = sg_phys(sg);
3783                 sg->dma_length = sg->length;
3784         }
3785         return nelems;
3786 }
3787
3788 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3789                         enum dma_data_direction dir, unsigned long attrs)
3790 {
3791         int i;
3792         struct dmar_domain *domain;
3793         size_t size = 0;
3794         int prot = 0;
3795         unsigned long iova_pfn;
3796         int ret;
3797         struct scatterlist *sg;
3798         unsigned long start_vpfn;
3799         struct intel_iommu *iommu;
3800
3801         BUG_ON(dir == DMA_NONE);
3802         if (iommu_no_mapping(dev))
3803                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3804
3805         domain = get_valid_domain_for_dev(dev);
3806         if (!domain)
3807                 return 0;
3808
3809         iommu = domain_get_iommu(domain);
3810
3811         for_each_sg(sglist, sg, nelems, i)
3812                 size += aligned_nrpages(sg->offset, sg->length);
3813
3814         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3815                                 *dev->dma_mask);
3816         if (!iova_pfn) {
3817                 sglist->dma_length = 0;
3818                 return 0;
3819         }
3820
3821         /*
3822          * Check if DMAR supports zero-length reads on write only
3823          * mappings..
3824          */
3825         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3826                         !cap_zlr(iommu->cap))
3827                 prot |= DMA_PTE_READ;
3828         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3829                 prot |= DMA_PTE_WRITE;
3830
3831         start_vpfn = mm_to_dma_pfn(iova_pfn);
3832
3833         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3834         if (unlikely(ret)) {
3835                 dma_pte_free_pagetable(domain, start_vpfn,
3836                                        start_vpfn + size - 1,
3837                                        agaw_to_level(domain->agaw) + 1);
3838                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3839                 return 0;
3840         }
3841
3842         return nelems;
3843 }
3844
3845 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3846 {
3847         return !dma_addr;
3848 }
3849
3850 const struct dma_map_ops intel_dma_ops = {
3851         .alloc = intel_alloc_coherent,
3852         .free = intel_free_coherent,
3853         .map_sg = intel_map_sg,
3854         .unmap_sg = intel_unmap_sg,
3855         .map_page = intel_map_page,
3856         .unmap_page = intel_unmap_page,
3857         .mapping_error = intel_mapping_error,
3858 #ifdef CONFIG_X86
3859         .dma_supported = dma_direct_supported,
3860 #endif
3861 };
3862
3863 static inline int iommu_domain_cache_init(void)
3864 {
3865         int ret = 0;
3866
3867         iommu_domain_cache = kmem_cache_create("iommu_domain",
3868                                          sizeof(struct dmar_domain),
3869                                          0,
3870                                          SLAB_HWCACHE_ALIGN,
3871
3872                                          NULL);
3873         if (!iommu_domain_cache) {
3874                 pr_err("Couldn't create iommu_domain cache\n");
3875                 ret = -ENOMEM;
3876         }
3877
3878         return ret;
3879 }
3880
3881 static inline int iommu_devinfo_cache_init(void)
3882 {
3883         int ret = 0;
3884
3885         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3886                                          sizeof(struct device_domain_info),
3887                                          0,
3888                                          SLAB_HWCACHE_ALIGN,
3889                                          NULL);
3890         if (!iommu_devinfo_cache) {
3891                 pr_err("Couldn't create devinfo cache\n");
3892                 ret = -ENOMEM;
3893         }
3894
3895         return ret;
3896 }
3897
3898 static int __init iommu_init_mempool(void)
3899 {
3900         int ret;
3901         ret = iova_cache_get();
3902         if (ret)
3903                 return ret;
3904
3905         ret = iommu_domain_cache_init();
3906         if (ret)
3907                 goto domain_error;
3908
3909         ret = iommu_devinfo_cache_init();
3910         if (!ret)
3911                 return ret;
3912
3913         kmem_cache_destroy(iommu_domain_cache);
3914 domain_error:
3915         iova_cache_put();
3916
3917         return -ENOMEM;
3918 }
3919
3920 static void __init iommu_exit_mempool(void)
3921 {
3922         kmem_cache_destroy(iommu_devinfo_cache);
3923         kmem_cache_destroy(iommu_domain_cache);
3924         iova_cache_put();
3925 }
3926
3927 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3928 {
3929         struct dmar_drhd_unit *drhd;
3930         u32 vtbar;
3931         int rc;
3932
3933         /* We know that this device on this chipset has its own IOMMU.
3934          * If we find it under a different IOMMU, then the BIOS is lying
3935          * to us. Hope that the IOMMU for this device is actually
3936          * disabled, and it needs no translation...
3937          */
3938         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3939         if (rc) {
3940                 /* "can't" happen */
3941                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3942                 return;
3943         }
3944         vtbar &= 0xffff0000;
3945
3946         /* we know that the this iommu should be at offset 0xa000 from vtbar */
3947         drhd = dmar_find_matched_drhd_unit(pdev);
3948         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3949                             TAINT_FIRMWARE_WORKAROUND,
3950                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3951                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3952 }
3953 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3954
3955 static void __init init_no_remapping_devices(void)
3956 {
3957         struct dmar_drhd_unit *drhd;
3958         struct device *dev;
3959         int i;
3960
3961         for_each_drhd_unit(drhd) {
3962                 if (!drhd->include_all) {
3963                         for_each_active_dev_scope(drhd->devices,
3964                                                   drhd->devices_cnt, i, dev)
3965                                 break;
3966                         /* ignore DMAR unit if no devices exist */
3967                         if (i == drhd->devices_cnt)
3968                                 drhd->ignored = 1;
3969                 }
3970         }
3971
3972         for_each_active_drhd_unit(drhd) {
3973                 if (drhd->include_all)
3974                         continue;
3975
3976                 for_each_active_dev_scope(drhd->devices,
3977                                           drhd->devices_cnt, i, dev)
3978                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3979                                 break;
3980                 if (i < drhd->devices_cnt)
3981                         continue;
3982
3983                 /* This IOMMU has *only* gfx devices. Either bypass it or
3984                    set the gfx_mapped flag, as appropriate */
3985                 if (dmar_map_gfx) {
3986                         intel_iommu_gfx_mapped = 1;
3987                 } else {
3988                         drhd->ignored = 1;
3989                         for_each_active_dev_scope(drhd->devices,
3990                                                   drhd->devices_cnt, i, dev)
3991                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3992                 }
3993         }
3994 }
3995
3996 #ifdef CONFIG_SUSPEND
3997 static int init_iommu_hw(void)
3998 {
3999         struct dmar_drhd_unit *drhd;
4000         struct intel_iommu *iommu = NULL;
4001
4002         for_each_active_iommu(iommu, drhd)
4003                 if (iommu->qi)
4004                         dmar_reenable_qi(iommu);
4005
4006         for_each_iommu(iommu, drhd) {
4007                 if (drhd->ignored) {
4008                         /*
4009                          * we always have to disable PMRs or DMA may fail on
4010                          * this device
4011                          */
4012                         if (force_on)
4013                                 iommu_disable_protect_mem_regions(iommu);
4014                         continue;
4015                 }
4016         
4017                 iommu_flush_write_buffer(iommu);
4018
4019                 iommu_set_root_entry(iommu);
4020
4021                 iommu->flush.flush_context(iommu, 0, 0, 0,
4022                                            DMA_CCMD_GLOBAL_INVL);
4023                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4024                 iommu_enable_translation(iommu);
4025                 iommu_disable_protect_mem_regions(iommu);
4026         }
4027
4028         return 0;
4029 }
4030
4031 static void iommu_flush_all(void)
4032 {
4033         struct dmar_drhd_unit *drhd;
4034         struct intel_iommu *iommu;
4035
4036         for_each_active_iommu(iommu, drhd) {
4037                 iommu->flush.flush_context(iommu, 0, 0, 0,
4038                                            DMA_CCMD_GLOBAL_INVL);
4039                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4040                                          DMA_TLB_GLOBAL_FLUSH);
4041         }
4042 }
4043
4044 static int iommu_suspend(void)
4045 {
4046         struct dmar_drhd_unit *drhd;
4047         struct intel_iommu *iommu = NULL;
4048         unsigned long flag;
4049
4050         for_each_active_iommu(iommu, drhd) {
4051                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4052                                                  GFP_ATOMIC);
4053                 if (!iommu->iommu_state)
4054                         goto nomem;
4055         }
4056
4057         iommu_flush_all();
4058
4059         for_each_active_iommu(iommu, drhd) {
4060                 iommu_disable_translation(iommu);
4061
4062                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4063
4064                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4065                         readl(iommu->reg + DMAR_FECTL_REG);
4066                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4067                         readl(iommu->reg + DMAR_FEDATA_REG);
4068                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4069                         readl(iommu->reg + DMAR_FEADDR_REG);
4070                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4071                         readl(iommu->reg + DMAR_FEUADDR_REG);
4072
4073                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4074         }
4075         return 0;
4076
4077 nomem:
4078         for_each_active_iommu(iommu, drhd)
4079                 kfree(iommu->iommu_state);
4080
4081         return -ENOMEM;
4082 }
4083
4084 static void iommu_resume(void)
4085 {
4086         struct dmar_drhd_unit *drhd;
4087         struct intel_iommu *iommu = NULL;
4088         unsigned long flag;
4089
4090         if (init_iommu_hw()) {
4091                 if (force_on)
4092                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4093                 else
4094                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4095                 return;
4096         }
4097
4098         for_each_active_iommu(iommu, drhd) {
4099
4100                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4101
4102                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4103                         iommu->reg + DMAR_FECTL_REG);
4104                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4105                         iommu->reg + DMAR_FEDATA_REG);
4106                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4107                         iommu->reg + DMAR_FEADDR_REG);
4108                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4109                         iommu->reg + DMAR_FEUADDR_REG);
4110
4111                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4112         }
4113
4114         for_each_active_iommu(iommu, drhd)
4115                 kfree(iommu->iommu_state);
4116 }
4117
4118 static struct syscore_ops iommu_syscore_ops = {
4119         .resume         = iommu_resume,
4120         .suspend        = iommu_suspend,
4121 };
4122
4123 static void __init init_iommu_pm_ops(void)
4124 {
4125         register_syscore_ops(&iommu_syscore_ops);
4126 }
4127
4128 #else
4129 static inline void init_iommu_pm_ops(void) {}
4130 #endif  /* CONFIG_PM */
4131
4132
4133 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4134 {
4135         struct acpi_dmar_reserved_memory *rmrr;
4136         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4137         struct dmar_rmrr_unit *rmrru;
4138         size_t length;
4139
4140         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4141         if (!rmrru)
4142                 goto out;
4143
4144         rmrru->hdr = header;
4145         rmrr = (struct acpi_dmar_reserved_memory *)header;
4146         rmrru->base_address = rmrr->base_address;
4147         rmrru->end_address = rmrr->end_address;
4148
4149         length = rmrr->end_address - rmrr->base_address + 1;
4150         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4151                                               IOMMU_RESV_DIRECT);
4152         if (!rmrru->resv)
4153                 goto free_rmrru;
4154
4155         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4156                                 ((void *)rmrr) + rmrr->header.length,
4157                                 &rmrru->devices_cnt);
4158         if (rmrru->devices_cnt && rmrru->devices == NULL)
4159                 goto free_all;
4160
4161         list_add(&rmrru->list, &dmar_rmrr_units);
4162
4163         return 0;
4164 free_all:
4165         kfree(rmrru->resv);
4166 free_rmrru:
4167         kfree(rmrru);
4168 out:
4169         return -ENOMEM;
4170 }
4171
4172 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4173 {
4174         struct dmar_atsr_unit *atsru;
4175         struct acpi_dmar_atsr *tmp;
4176
4177         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4178                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4179                 if (atsr->segment != tmp->segment)
4180                         continue;
4181                 if (atsr->header.length != tmp->header.length)
4182                         continue;
4183                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4184                         return atsru;
4185         }
4186
4187         return NULL;
4188 }
4189
4190 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4191 {
4192         struct acpi_dmar_atsr *atsr;
4193         struct dmar_atsr_unit *atsru;
4194
4195         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4196                 return 0;
4197
4198         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4199         atsru = dmar_find_atsr(atsr);
4200         if (atsru)
4201                 return 0;
4202
4203         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4204         if (!atsru)
4205                 return -ENOMEM;
4206
4207         /*
4208          * If memory is allocated from slab by ACPI _DSM method, we need to
4209          * copy the memory content because the memory buffer will be freed
4210          * on return.
4211          */
4212         atsru->hdr = (void *)(atsru + 1);
4213         memcpy(atsru->hdr, hdr, hdr->length);
4214         atsru->include_all = atsr->flags & 0x1;
4215         if (!atsru->include_all) {
4216                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4217                                 (void *)atsr + atsr->header.length,
4218                                 &atsru->devices_cnt);
4219                 if (atsru->devices_cnt && atsru->devices == NULL) {
4220                         kfree(atsru);
4221                         return -ENOMEM;
4222                 }
4223         }
4224
4225         list_add_rcu(&atsru->list, &dmar_atsr_units);
4226
4227         return 0;
4228 }
4229
4230 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4231 {
4232         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4233         kfree(atsru);
4234 }
4235
4236 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4237 {
4238         struct acpi_dmar_atsr *atsr;
4239         struct dmar_atsr_unit *atsru;
4240
4241         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4242         atsru = dmar_find_atsr(atsr);
4243         if (atsru) {
4244                 list_del_rcu(&atsru->list);
4245                 synchronize_rcu();
4246                 intel_iommu_free_atsr(atsru);
4247         }
4248
4249         return 0;
4250 }
4251
4252 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4253 {
4254         int i;
4255         struct device *dev;
4256         struct acpi_dmar_atsr *atsr;
4257         struct dmar_atsr_unit *atsru;
4258
4259         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4260         atsru = dmar_find_atsr(atsr);
4261         if (!atsru)
4262                 return 0;
4263
4264         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4265                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4266                                           i, dev)
4267                         return -EBUSY;
4268         }
4269
4270         return 0;
4271 }
4272
4273 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4274 {
4275         int sp, ret = 0;
4276         struct intel_iommu *iommu = dmaru->iommu;
4277
4278         if (g_iommus[iommu->seq_id])
4279                 return 0;
4280
4281         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4282                 pr_warn("%s: Doesn't support hardware pass through.\n",
4283                         iommu->name);
4284                 return -ENXIO;
4285         }
4286         if (!ecap_sc_support(iommu->ecap) &&
4287             domain_update_iommu_snooping(iommu)) {
4288                 pr_warn("%s: Doesn't support snooping.\n",
4289                         iommu->name);
4290                 return -ENXIO;
4291         }
4292         sp = domain_update_iommu_superpage(iommu) - 1;
4293         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4294                 pr_warn("%s: Doesn't support large page.\n",
4295                         iommu->name);
4296                 return -ENXIO;
4297         }
4298
4299         /*
4300          * Disable translation if already enabled prior to OS handover.
4301          */
4302         if (iommu->gcmd & DMA_GCMD_TE)
4303                 iommu_disable_translation(iommu);
4304
4305         g_iommus[iommu->seq_id] = iommu;
4306         ret = iommu_init_domains(iommu);
4307         if (ret == 0)
4308                 ret = iommu_alloc_root_entry(iommu);
4309         if (ret)
4310                 goto out;
4311
4312 #ifdef CONFIG_INTEL_IOMMU_SVM
4313         if (pasid_enabled(iommu))
4314                 intel_svm_alloc_pasid_tables(iommu);
4315 #endif
4316
4317         if (dmaru->ignored) {
4318                 /*
4319                  * we always have to disable PMRs or DMA may fail on this device
4320                  */
4321                 if (force_on)
4322                         iommu_disable_protect_mem_regions(iommu);
4323                 return 0;
4324         }
4325
4326         intel_iommu_init_qi(iommu);
4327         iommu_flush_write_buffer(iommu);
4328
4329 #ifdef CONFIG_INTEL_IOMMU_SVM
4330         if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
4331                 ret = intel_svm_enable_prq(iommu);
4332                 if (ret)
4333                         goto disable_iommu;
4334         }
4335 #endif
4336         ret = dmar_set_interrupt(iommu);
4337         if (ret)
4338                 goto disable_iommu;
4339
4340         iommu_set_root_entry(iommu);
4341         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4342         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4343         iommu_enable_translation(iommu);
4344
4345         iommu_disable_protect_mem_regions(iommu);
4346         return 0;
4347
4348 disable_iommu:
4349         disable_dmar_iommu(iommu);
4350 out:
4351         free_dmar_iommu(iommu);
4352         return ret;
4353 }
4354
4355 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4356 {
4357         int ret = 0;
4358         struct intel_iommu *iommu = dmaru->iommu;
4359
4360         if (!intel_iommu_enabled)
4361                 return 0;
4362         if (iommu == NULL)
4363                 return -EINVAL;
4364
4365         if (insert) {
4366                 ret = intel_iommu_add(dmaru);
4367         } else {
4368                 disable_dmar_iommu(iommu);
4369                 free_dmar_iommu(iommu);
4370         }
4371
4372         return ret;
4373 }
4374
4375 static void intel_iommu_free_dmars(void)
4376 {
4377         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4378         struct dmar_atsr_unit *atsru, *atsr_n;
4379
4380         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4381                 list_del(&rmrru->list);
4382                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4383                 kfree(rmrru->resv);
4384                 kfree(rmrru);
4385         }
4386
4387         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4388                 list_del(&atsru->list);
4389                 intel_iommu_free_atsr(atsru);
4390         }
4391 }
4392
4393 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4394 {
4395         int i, ret = 1;
4396         struct pci_bus *bus;
4397         struct pci_dev *bridge = NULL;
4398         struct device *tmp;
4399         struct acpi_dmar_atsr *atsr;
4400         struct dmar_atsr_unit *atsru;
4401
4402         dev = pci_physfn(dev);
4403         for (bus = dev->bus; bus; bus = bus->parent) {
4404                 bridge = bus->self;
4405                 /* If it's an integrated device, allow ATS */
4406                 if (!bridge)
4407                         return 1;
4408                 /* Connected via non-PCIe: no ATS */
4409                 if (!pci_is_pcie(bridge) ||
4410                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4411                         return 0;
4412                 /* If we found the root port, look it up in the ATSR */
4413                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4414                         break;
4415         }
4416
4417         rcu_read_lock();
4418         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4419                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4420                 if (atsr->segment != pci_domain_nr(dev->bus))
4421                         continue;
4422
4423                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4424                         if (tmp == &bridge->dev)
4425                                 goto out;
4426
4427                 if (atsru->include_all)
4428                         goto out;
4429         }
4430         ret = 0;
4431 out:
4432         rcu_read_unlock();
4433
4434         return ret;
4435 }
4436
4437 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4438 {
4439         int ret = 0;
4440         struct dmar_rmrr_unit *rmrru;
4441         struct dmar_atsr_unit *atsru;
4442         struct acpi_dmar_atsr *atsr;
4443         struct acpi_dmar_reserved_memory *rmrr;
4444
4445         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4446                 return 0;
4447
4448         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4449                 rmrr = container_of(rmrru->hdr,
4450                                     struct acpi_dmar_reserved_memory, header);
4451                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4452                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4453                                 ((void *)rmrr) + rmrr->header.length,
4454                                 rmrr->segment, rmrru->devices,
4455                                 rmrru->devices_cnt);
4456                         if(ret < 0)
4457                                 return ret;
4458                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4459                         dmar_remove_dev_scope(info, rmrr->segment,
4460                                 rmrru->devices, rmrru->devices_cnt);
4461                 }
4462         }
4463
4464         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4465                 if (atsru->include_all)
4466                         continue;
4467
4468                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4469                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4470                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4471                                         (void *)atsr + atsr->header.length,
4472                                         atsr->segment, atsru->devices,
4473                                         atsru->devices_cnt);
4474                         if (ret > 0)
4475                                 break;
4476                         else if(ret < 0)
4477                                 return ret;
4478                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4479                         if (dmar_remove_dev_scope(info, atsr->segment,
4480                                         atsru->devices, atsru->devices_cnt))
4481                                 break;
4482                 }
4483         }
4484
4485         return 0;
4486 }
4487
4488 /*
4489  * Here we only respond to action of unbound device from driver.
4490  *
4491  * Added device is not attached to its DMAR domain here yet. That will happen
4492  * when mapping the device to iova.
4493  */
4494 static int device_notifier(struct notifier_block *nb,
4495                                   unsigned long action, void *data)
4496 {
4497         struct device *dev = data;
4498         struct dmar_domain *domain;
4499
4500         if (iommu_dummy(dev))
4501                 return 0;
4502
4503         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4504                 return 0;
4505
4506         domain = find_domain(dev);
4507         if (!domain)
4508                 return 0;
4509
4510         dmar_remove_one_dev_info(domain, dev);
4511         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4512                 domain_exit(domain);
4513
4514         return 0;
4515 }
4516
4517 static struct notifier_block device_nb = {
4518         .notifier_call = device_notifier,
4519 };
4520
4521 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4522                                        unsigned long val, void *v)
4523 {
4524         struct memory_notify *mhp = v;
4525         unsigned long long start, end;
4526         unsigned long start_vpfn, last_vpfn;
4527
4528         switch (val) {
4529         case MEM_GOING_ONLINE:
4530                 start = mhp->start_pfn << PAGE_SHIFT;
4531                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4532                 if (iommu_domain_identity_map(si_domain, start, end)) {
4533                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4534                                 start, end);
4535                         return NOTIFY_BAD;
4536                 }
4537                 break;
4538
4539         case MEM_OFFLINE:
4540         case MEM_CANCEL_ONLINE:
4541                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4542                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4543                 while (start_vpfn <= last_vpfn) {
4544                         struct iova *iova;
4545                         struct dmar_drhd_unit *drhd;
4546                         struct intel_iommu *iommu;
4547                         struct page *freelist;
4548
4549                         iova = find_iova(&si_domain->iovad, start_vpfn);
4550                         if (iova == NULL) {
4551                                 pr_debug("Failed get IOVA for PFN %lx\n",
4552                                          start_vpfn);
4553                                 break;
4554                         }
4555
4556                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4557                                                      start_vpfn, last_vpfn);
4558                         if (iova == NULL) {
4559                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4560                                         start_vpfn, last_vpfn);
4561                                 return NOTIFY_BAD;
4562                         }
4563
4564                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4565                                                iova->pfn_hi);
4566
4567                         rcu_read_lock();
4568                         for_each_active_iommu(iommu, drhd)
4569                                 iommu_flush_iotlb_psi(iommu, si_domain,
4570                                         iova->pfn_lo, iova_size(iova),
4571                                         !freelist, 0);
4572                         rcu_read_unlock();
4573                         dma_free_pagelist(freelist);
4574
4575                         start_vpfn = iova->pfn_hi + 1;
4576                         free_iova_mem(iova);
4577                 }
4578                 break;
4579         }
4580
4581         return NOTIFY_OK;
4582 }
4583
4584 static struct notifier_block intel_iommu_memory_nb = {
4585         .notifier_call = intel_iommu_memory_notifier,
4586         .priority = 0
4587 };
4588
4589 static void free_all_cpu_cached_iovas(unsigned int cpu)
4590 {
4591         int i;
4592
4593         for (i = 0; i < g_num_of_iommus; i++) {
4594                 struct intel_iommu *iommu = g_iommus[i];
4595                 struct dmar_domain *domain;
4596                 int did;
4597
4598                 if (!iommu)
4599                         continue;
4600
4601                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4602                         domain = get_iommu_domain(iommu, (u16)did);
4603
4604                         if (!domain)
4605                                 continue;
4606                         free_cpu_cached_iovas(cpu, &domain->iovad);
4607                 }
4608         }
4609 }
4610
4611 static int intel_iommu_cpu_dead(unsigned int cpu)
4612 {
4613         free_all_cpu_cached_iovas(cpu);
4614         return 0;
4615 }
4616
4617 static void intel_disable_iommus(void)
4618 {
4619         struct intel_iommu *iommu = NULL;
4620         struct dmar_drhd_unit *drhd;
4621
4622         for_each_iommu(iommu, drhd)
4623                 iommu_disable_translation(iommu);
4624 }
4625
4626 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4627 {
4628         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4629
4630         return container_of(iommu_dev, struct intel_iommu, iommu);
4631 }
4632
4633 static ssize_t intel_iommu_show_version(struct device *dev,
4634                                         struct device_attribute *attr,
4635                                         char *buf)
4636 {
4637         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4638         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4639         return sprintf(buf, "%d:%d\n",
4640                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4641 }
4642 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4643
4644 static ssize_t intel_iommu_show_address(struct device *dev,
4645                                         struct device_attribute *attr,
4646                                         char *buf)
4647 {
4648         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4649         return sprintf(buf, "%llx\n", iommu->reg_phys);
4650 }
4651 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4652
4653 static ssize_t intel_iommu_show_cap(struct device *dev,
4654                                     struct device_attribute *attr,
4655                                     char *buf)
4656 {
4657         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4658         return sprintf(buf, "%llx\n", iommu->cap);
4659 }
4660 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4661
4662 static ssize_t intel_iommu_show_ecap(struct device *dev,
4663                                     struct device_attribute *attr,
4664                                     char *buf)
4665 {
4666         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4667         return sprintf(buf, "%llx\n", iommu->ecap);
4668 }
4669 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4670
4671 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4672                                       struct device_attribute *attr,
4673                                       char *buf)
4674 {
4675         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4676         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4677 }
4678 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4679
4680 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4681                                            struct device_attribute *attr,
4682                                            char *buf)
4683 {
4684         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4685         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4686                                                   cap_ndoms(iommu->cap)));
4687 }
4688 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4689
4690 static struct attribute *intel_iommu_attrs[] = {
4691         &dev_attr_version.attr,
4692         &dev_attr_address.attr,
4693         &dev_attr_cap.attr,
4694         &dev_attr_ecap.attr,
4695         &dev_attr_domains_supported.attr,
4696         &dev_attr_domains_used.attr,
4697         NULL,
4698 };
4699
4700 static struct attribute_group intel_iommu_group = {
4701         .name = "intel-iommu",
4702         .attrs = intel_iommu_attrs,
4703 };
4704
4705 const struct attribute_group *intel_iommu_groups[] = {
4706         &intel_iommu_group,
4707         NULL,
4708 };
4709
4710 int __init intel_iommu_init(void)
4711 {
4712         int ret = -ENODEV;
4713         struct dmar_drhd_unit *drhd;
4714         struct intel_iommu *iommu;
4715
4716         /* VT-d is required for a TXT/tboot launch, so enforce that */
4717         force_on = tboot_force_iommu();
4718
4719         if (iommu_init_mempool()) {
4720                 if (force_on)
4721                         panic("tboot: Failed to initialize iommu memory\n");
4722                 return -ENOMEM;
4723         }
4724
4725         down_write(&dmar_global_lock);
4726         if (dmar_table_init()) {
4727                 if (force_on)
4728                         panic("tboot: Failed to initialize DMAR table\n");
4729                 goto out_free_dmar;
4730         }
4731
4732         if (dmar_dev_scope_init() < 0) {
4733                 if (force_on)
4734                         panic("tboot: Failed to initialize DMAR device scope\n");
4735                 goto out_free_dmar;
4736         }
4737
4738         up_write(&dmar_global_lock);
4739
4740         /*
4741          * The bus notifier takes the dmar_global_lock, so lockdep will
4742          * complain later when we register it under the lock.
4743          */
4744         dmar_register_bus_notifier();
4745
4746         down_write(&dmar_global_lock);
4747
4748         if (no_iommu || dmar_disabled) {
4749                 /*
4750                  * We exit the function here to ensure IOMMU's remapping and
4751                  * mempool aren't setup, which means that the IOMMU's PMRs
4752                  * won't be disabled via the call to init_dmars(). So disable
4753                  * it explicitly here. The PMRs were setup by tboot prior to
4754                  * calling SENTER, but the kernel is expected to reset/tear
4755                  * down the PMRs.
4756                  */
4757                 if (intel_iommu_tboot_noforce) {
4758                         for_each_iommu(iommu, drhd)
4759                                 iommu_disable_protect_mem_regions(iommu);
4760                 }
4761
4762                 /*
4763                  * Make sure the IOMMUs are switched off, even when we
4764                  * boot into a kexec kernel and the previous kernel left
4765                  * them enabled
4766                  */
4767                 intel_disable_iommus();
4768                 goto out_free_dmar;
4769         }
4770
4771         if (list_empty(&dmar_rmrr_units))
4772                 pr_info("No RMRR found\n");
4773
4774         if (list_empty(&dmar_atsr_units))
4775                 pr_info("No ATSR found\n");
4776
4777         if (dmar_init_reserved_ranges()) {
4778                 if (force_on)
4779                         panic("tboot: Failed to reserve iommu ranges\n");
4780                 goto out_free_reserved_range;
4781         }
4782
4783         init_no_remapping_devices();
4784
4785         ret = init_dmars();
4786         if (ret) {
4787                 if (force_on)
4788                         panic("tboot: Failed to initialize DMARs\n");
4789                 pr_err("Initialization failed\n");
4790                 goto out_free_reserved_range;
4791         }
4792         up_write(&dmar_global_lock);
4793         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4794
4795 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4796         swiotlb = 0;
4797 #endif
4798         dma_ops = &intel_dma_ops;
4799
4800         init_iommu_pm_ops();
4801
4802         for_each_active_iommu(iommu, drhd) {
4803                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4804                                        intel_iommu_groups,
4805                                        "%s", iommu->name);
4806                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4807                 iommu_device_register(&iommu->iommu);
4808         }
4809
4810         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4811         bus_register_notifier(&pci_bus_type, &device_nb);
4812         if (si_domain && !hw_pass_through)
4813                 register_memory_notifier(&intel_iommu_memory_nb);
4814         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4815                           intel_iommu_cpu_dead);
4816         intel_iommu_enabled = 1;
4817
4818         return 0;
4819
4820 out_free_reserved_range:
4821         put_iova_domain(&reserved_iova_list);
4822 out_free_dmar:
4823         intel_iommu_free_dmars();
4824         up_write(&dmar_global_lock);
4825         iommu_exit_mempool();
4826         return ret;
4827 }
4828
4829 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4830 {
4831         struct intel_iommu *iommu = opaque;
4832
4833         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4834         return 0;
4835 }
4836
4837 /*
4838  * NB - intel-iommu lacks any sort of reference counting for the users of
4839  * dependent devices.  If multiple endpoints have intersecting dependent
4840  * devices, unbinding the driver from any one of them will possibly leave
4841  * the others unable to operate.
4842  */
4843 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4844 {
4845         if (!iommu || !dev || !dev_is_pci(dev))
4846                 return;
4847
4848         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4849 }
4850
4851 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4852 {
4853         struct intel_iommu *iommu;
4854         unsigned long flags;
4855
4856         assert_spin_locked(&device_domain_lock);
4857
4858         if (WARN_ON(!info))
4859                 return;
4860
4861         iommu = info->iommu;
4862
4863         if (info->dev) {
4864                 iommu_disable_dev_iotlb(info);
4865                 domain_context_clear(iommu, info->dev);
4866         }
4867
4868         unlink_domain_info(info);
4869
4870         spin_lock_irqsave(&iommu->lock, flags);
4871         domain_detach_iommu(info->domain, iommu);
4872         spin_unlock_irqrestore(&iommu->lock, flags);
4873
4874         free_devinfo_mem(info);
4875 }
4876
4877 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4878                                      struct device *dev)
4879 {
4880         struct device_domain_info *info;
4881         unsigned long flags;
4882
4883         spin_lock_irqsave(&device_domain_lock, flags);
4884         info = dev->archdata.iommu;
4885         __dmar_remove_one_dev_info(info);
4886         spin_unlock_irqrestore(&device_domain_lock, flags);
4887 }
4888
4889 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4890 {
4891         int adjust_width;
4892
4893         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4894         domain_reserve_special_ranges(domain);
4895
4896         /* calculate AGAW */
4897         domain->gaw = guest_width;
4898         adjust_width = guestwidth_to_adjustwidth(guest_width);
4899         domain->agaw = width_to_agaw(adjust_width);
4900
4901         domain->iommu_coherency = 0;
4902         domain->iommu_snooping = 0;
4903         domain->iommu_superpage = 0;
4904         domain->max_addr = 0;
4905
4906         /* always allocate the top pgd */
4907         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4908         if (!domain->pgd)
4909                 return -ENOMEM;
4910         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4911         return 0;
4912 }
4913
4914 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4915 {
4916         struct dmar_domain *dmar_domain;
4917         struct iommu_domain *domain;
4918
4919         if (type != IOMMU_DOMAIN_UNMANAGED)
4920                 return NULL;
4921
4922         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4923         if (!dmar_domain) {
4924                 pr_err("Can't allocate dmar_domain\n");
4925                 return NULL;
4926         }
4927         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4928                 pr_err("Domain initialization failed\n");
4929                 domain_exit(dmar_domain);
4930                 return NULL;
4931         }
4932         domain_update_iommu_cap(dmar_domain);
4933
4934         domain = &dmar_domain->domain;
4935         domain->geometry.aperture_start = 0;
4936         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4937         domain->geometry.force_aperture = true;
4938
4939         return domain;
4940 }
4941
4942 static void intel_iommu_domain_free(struct iommu_domain *domain)
4943 {
4944         domain_exit(to_dmar_domain(domain));
4945 }
4946
4947 static int intel_iommu_attach_device(struct iommu_domain *domain,
4948                                      struct device *dev)
4949 {
4950         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4951         struct intel_iommu *iommu;
4952         int addr_width;
4953         u8 bus, devfn;
4954
4955         if (device_is_rmrr_locked(dev)) {
4956                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4957                 return -EPERM;
4958         }
4959
4960         /* normally dev is not mapped */
4961         if (unlikely(domain_context_mapped(dev))) {
4962                 struct dmar_domain *old_domain;
4963
4964                 old_domain = find_domain(dev);
4965                 if (old_domain) {
4966                         rcu_read_lock();
4967                         dmar_remove_one_dev_info(old_domain, dev);
4968                         rcu_read_unlock();
4969
4970                         if (!domain_type_is_vm_or_si(old_domain) &&
4971                              list_empty(&old_domain->devices))
4972                                 domain_exit(old_domain);
4973                 }
4974         }
4975
4976         iommu = device_to_iommu(dev, &bus, &devfn);
4977         if (!iommu)
4978                 return -ENODEV;
4979
4980         /* check if this iommu agaw is sufficient for max mapped address */
4981         addr_width = agaw_to_width(iommu->agaw);
4982         if (addr_width > cap_mgaw(iommu->cap))
4983                 addr_width = cap_mgaw(iommu->cap);
4984
4985         if (dmar_domain->max_addr > (1LL << addr_width)) {
4986                 pr_err("%s: iommu width (%d) is not "
4987                        "sufficient for the mapped address (%llx)\n",
4988                        __func__, addr_width, dmar_domain->max_addr);
4989                 return -EFAULT;
4990         }
4991         dmar_domain->gaw = addr_width;
4992
4993         /*
4994          * Knock out extra levels of page tables if necessary
4995          */
4996         while (iommu->agaw < dmar_domain->agaw) {
4997                 struct dma_pte *pte;
4998
4999                 pte = dmar_domain->pgd;
5000                 if (dma_pte_present(pte)) {
5001                         dmar_domain->pgd = (struct dma_pte *)
5002                                 phys_to_virt(dma_pte_addr(pte));
5003                         free_pgtable_page(pte);
5004                 }
5005                 dmar_domain->agaw--;
5006         }
5007
5008         return domain_add_dev_info(dmar_domain, dev);
5009 }
5010
5011 static void intel_iommu_detach_device(struct iommu_domain *domain,
5012                                       struct device *dev)
5013 {
5014         dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
5015 }
5016
5017 static int intel_iommu_map(struct iommu_domain *domain,
5018                            unsigned long iova, phys_addr_t hpa,
5019                            size_t size, int iommu_prot)
5020 {
5021         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5022         u64 max_addr;
5023         int prot = 0;
5024         int ret;
5025
5026         if (iommu_prot & IOMMU_READ)
5027                 prot |= DMA_PTE_READ;
5028         if (iommu_prot & IOMMU_WRITE)
5029                 prot |= DMA_PTE_WRITE;
5030         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5031                 prot |= DMA_PTE_SNP;
5032
5033         max_addr = iova + size;
5034         if (dmar_domain->max_addr < max_addr) {
5035                 u64 end;
5036
5037                 /* check if minimum agaw is sufficient for mapped address */
5038                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5039                 if (end < max_addr) {
5040                         pr_err("%s: iommu width (%d) is not "
5041                                "sufficient for the mapped address (%llx)\n",
5042                                __func__, dmar_domain->gaw, max_addr);
5043                         return -EFAULT;
5044                 }
5045                 dmar_domain->max_addr = max_addr;
5046         }
5047         /* Round up size to next multiple of PAGE_SIZE, if it and
5048            the low bits of hpa would take us onto the next page */
5049         size = aligned_nrpages(hpa, size);
5050         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5051                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5052         return ret;
5053 }
5054
5055 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5056                                 unsigned long iova, size_t size)
5057 {
5058         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5059         struct page *freelist = NULL;
5060         unsigned long start_pfn, last_pfn;
5061         unsigned int npages;
5062         int iommu_id, level = 0;
5063
5064         /* Cope with horrid API which requires us to unmap more than the
5065            size argument if it happens to be a large-page mapping. */
5066         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5067
5068         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5069                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5070
5071         start_pfn = iova >> VTD_PAGE_SHIFT;
5072         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5073
5074         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5075
5076         npages = last_pfn - start_pfn + 1;
5077
5078         for_each_domain_iommu(iommu_id, dmar_domain)
5079                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5080                                       start_pfn, npages, !freelist, 0);
5081
5082         dma_free_pagelist(freelist);
5083
5084         if (dmar_domain->max_addr == iova + size)
5085                 dmar_domain->max_addr = iova;
5086
5087         return size;
5088 }
5089
5090 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5091                                             dma_addr_t iova)
5092 {
5093         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5094         struct dma_pte *pte;
5095         int level = 0;
5096         u64 phys = 0;
5097
5098         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5099         if (pte)
5100                 phys = dma_pte_addr(pte);
5101
5102         return phys;
5103 }
5104
5105 static bool intel_iommu_capable(enum iommu_cap cap)
5106 {
5107         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5108                 return domain_update_iommu_snooping(NULL) == 1;
5109         if (cap == IOMMU_CAP_INTR_REMAP)
5110                 return irq_remapping_enabled == 1;
5111
5112         return false;
5113 }
5114
5115 static int intel_iommu_add_device(struct device *dev)
5116 {
5117         struct intel_iommu *iommu;
5118         struct iommu_group *group;
5119         u8 bus, devfn;
5120
5121         iommu = device_to_iommu(dev, &bus, &devfn);
5122         if (!iommu)
5123                 return -ENODEV;
5124
5125         iommu_device_link(&iommu->iommu, dev);
5126
5127         group = iommu_group_get_for_dev(dev);
5128
5129         if (IS_ERR(group))
5130                 return PTR_ERR(group);
5131
5132         iommu_group_put(group);
5133         return 0;
5134 }
5135
5136 static void intel_iommu_remove_device(struct device *dev)
5137 {
5138         struct intel_iommu *iommu;
5139         u8 bus, devfn;
5140
5141         iommu = device_to_iommu(dev, &bus, &devfn);
5142         if (!iommu)
5143                 return;
5144
5145         iommu_group_remove_device(dev);
5146
5147         iommu_device_unlink(&iommu->iommu, dev);
5148 }
5149
5150 static void intel_iommu_get_resv_regions(struct device *device,
5151                                          struct list_head *head)
5152 {
5153         struct iommu_resv_region *reg;
5154         struct dmar_rmrr_unit *rmrr;
5155         struct device *i_dev;
5156         int i;
5157
5158         rcu_read_lock();
5159         for_each_rmrr_units(rmrr) {
5160                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5161                                           i, i_dev) {
5162                         if (i_dev != device)
5163                                 continue;
5164
5165                         list_add_tail(&rmrr->resv->list, head);
5166                 }
5167         }
5168         rcu_read_unlock();
5169
5170         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5171                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5172                                       0, IOMMU_RESV_MSI);
5173         if (!reg)
5174                 return;
5175         list_add_tail(&reg->list, head);
5176 }
5177
5178 static void intel_iommu_put_resv_regions(struct device *dev,
5179                                          struct list_head *head)
5180 {
5181         struct iommu_resv_region *entry, *next;
5182
5183         list_for_each_entry_safe(entry, next, head, list) {
5184                 if (entry->type == IOMMU_RESV_RESERVED)
5185                         kfree(entry);
5186         }
5187 }
5188
5189 #ifdef CONFIG_INTEL_IOMMU_SVM
5190 #define MAX_NR_PASID_BITS (20)
5191 static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
5192 {
5193         /*
5194          * Convert ecap_pss to extend context entry pts encoding, also
5195          * respect the soft pasid_max value set by the iommu.
5196          * - number of PASID bits = ecap_pss + 1
5197          * - number of PASID table entries = 2^(pts + 5)
5198          * Therefore, pts = ecap_pss - 4
5199          * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
5200          */
5201         if (ecap_pss(iommu->ecap) < 5)
5202                 return 0;
5203
5204         /* pasid_max is encoded as actual number of entries not the bits */
5205         return find_first_bit((unsigned long *)&iommu->pasid_max,
5206                         MAX_NR_PASID_BITS) - 5;
5207 }
5208
5209 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
5210 {
5211         struct device_domain_info *info;
5212         struct context_entry *context;
5213         struct dmar_domain *domain;
5214         unsigned long flags;
5215         u64 ctx_lo;
5216         int ret;
5217
5218         domain = get_valid_domain_for_dev(sdev->dev);
5219         if (!domain)
5220                 return -EINVAL;
5221
5222         spin_lock_irqsave(&device_domain_lock, flags);
5223         spin_lock(&iommu->lock);
5224
5225         ret = -EINVAL;
5226         info = sdev->dev->archdata.iommu;
5227         if (!info || !info->pasid_supported)
5228                 goto out;
5229
5230         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5231         if (WARN_ON(!context))
5232                 goto out;
5233
5234         ctx_lo = context[0].lo;
5235
5236         sdev->did = domain->iommu_did[iommu->seq_id];
5237         sdev->sid = PCI_DEVID(info->bus, info->devfn);
5238
5239         if (!(ctx_lo & CONTEXT_PASIDE)) {
5240                 if (iommu->pasid_state_table)
5241                         context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
5242                 context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
5243                         intel_iommu_get_pts(iommu);
5244
5245                 wmb();
5246                 /* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
5247                  * extended to permit requests-with-PASID if the PASIDE bit
5248                  * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
5249                  * however, the PASIDE bit is ignored and requests-with-PASID
5250                  * are unconditionally blocked. Which makes less sense.
5251                  * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
5252                  * "guest mode" translation types depending on whether ATS
5253                  * is available or not. Annoyingly, we can't use the new
5254                  * modes *unless* PASIDE is set. */
5255                 if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
5256                         ctx_lo &= ~CONTEXT_TT_MASK;
5257                         if (info->ats_supported)
5258                                 ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
5259                         else
5260                                 ctx_lo |= CONTEXT_TT_PT_PASID << 2;
5261                 }
5262                 ctx_lo |= CONTEXT_PASIDE;
5263                 if (iommu->pasid_state_table)
5264                         ctx_lo |= CONTEXT_DINVE;
5265                 if (info->pri_supported)
5266                         ctx_lo |= CONTEXT_PRS;
5267                 context[0].lo = ctx_lo;
5268                 wmb();
5269                 iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
5270                                            DMA_CCMD_MASK_NOBIT,
5271                                            DMA_CCMD_DEVICE_INVL);
5272         }
5273
5274         /* Enable PASID support in the device, if it wasn't already */
5275         if (!info->pasid_enabled)
5276                 iommu_enable_dev_iotlb(info);
5277
5278         if (info->ats_enabled) {
5279                 sdev->dev_iotlb = 1;
5280                 sdev->qdep = info->ats_qdep;
5281                 if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
5282                         sdev->qdep = 0;
5283         }
5284         ret = 0;
5285
5286  out:
5287         spin_unlock(&iommu->lock);
5288         spin_unlock_irqrestore(&device_domain_lock, flags);
5289
5290         return ret;
5291 }
5292
5293 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5294 {
5295         struct intel_iommu *iommu;
5296         u8 bus, devfn;
5297
5298         if (iommu_dummy(dev)) {
5299                 dev_warn(dev,
5300                          "No IOMMU translation for device; cannot enable SVM\n");
5301                 return NULL;
5302         }
5303
5304         iommu = device_to_iommu(dev, &bus, &devfn);
5305         if ((!iommu)) {
5306                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5307                 return NULL;
5308         }
5309
5310         if (!iommu->pasid_table) {
5311                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5312                 return NULL;
5313         }
5314
5315         return iommu;
5316 }
5317 #endif /* CONFIG_INTEL_IOMMU_SVM */
5318
5319 const struct iommu_ops intel_iommu_ops = {
5320         .capable                = intel_iommu_capable,
5321         .domain_alloc           = intel_iommu_domain_alloc,
5322         .domain_free            = intel_iommu_domain_free,
5323         .attach_dev             = intel_iommu_attach_device,
5324         .detach_dev             = intel_iommu_detach_device,
5325         .map                    = intel_iommu_map,
5326         .unmap                  = intel_iommu_unmap,
5327         .map_sg                 = default_iommu_map_sg,
5328         .iova_to_phys           = intel_iommu_iova_to_phys,
5329         .add_device             = intel_iommu_add_device,
5330         .remove_device          = intel_iommu_remove_device,
5331         .get_resv_regions       = intel_iommu_get_resv_regions,
5332         .put_resv_regions       = intel_iommu_put_resv_regions,
5333         .device_group           = pci_device_group,
5334         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5335 };
5336
5337 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5338 {
5339         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5340         pr_info("Disabling IOMMU for graphics on this chipset\n");
5341         dmar_map_gfx = 0;
5342 }
5343
5344 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5345 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5346 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5347 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5348 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5349 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5350 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5351
5352 static void quirk_iommu_rwbf(struct pci_dev *dev)
5353 {
5354         /*
5355          * Mobile 4 Series Chipset neglects to set RWBF capability,
5356          * but needs it. Same seems to hold for the desktop versions.
5357          */
5358         pr_info("Forcing write-buffer flush capability\n");
5359         rwbf_quirk = 1;
5360 }
5361
5362 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5363 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5364 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5365 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5366 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5367 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5368 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5369
5370 #define GGC 0x52
5371 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5372 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5373 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5374 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5375 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5376 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5377 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5378 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5379
5380 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5381 {
5382         unsigned short ggc;
5383
5384         if (pci_read_config_word(dev, GGC, &ggc))
5385                 return;
5386
5387         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5388                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5389                 dmar_map_gfx = 0;
5390         } else if (dmar_map_gfx) {
5391                 /* we have to ensure the gfx device is idle before we flush */
5392                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5393                 intel_iommu_strict = 1;
5394        }
5395 }
5396 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5397 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5398 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5399 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5400
5401 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5402    ISOCH DMAR unit for the Azalia sound device, but not give it any
5403    TLB entries, which causes it to deadlock. Check for that.  We do
5404    this in a function called from init_dmars(), instead of in a PCI
5405    quirk, because we don't want to print the obnoxious "BIOS broken"
5406    message if VT-d is actually disabled.
5407 */
5408 static void __init check_tylersburg_isoch(void)
5409 {
5410         struct pci_dev *pdev;
5411         uint32_t vtisochctrl;
5412
5413         /* If there's no Azalia in the system anyway, forget it. */
5414         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5415         if (!pdev)
5416                 return;
5417         pci_dev_put(pdev);
5418
5419         /* System Management Registers. Might be hidden, in which case
5420            we can't do the sanity check. But that's OK, because the
5421            known-broken BIOSes _don't_ actually hide it, so far. */
5422         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5423         if (!pdev)
5424                 return;
5425
5426         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5427                 pci_dev_put(pdev);
5428                 return;
5429         }
5430
5431         pci_dev_put(pdev);
5432
5433         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5434         if (vtisochctrl & 1)
5435                 return;
5436
5437         /* Drop all bits other than the number of TLB entries */
5438         vtisochctrl &= 0x1c;
5439
5440         /* If we have the recommended number of TLB entries (16), fine. */
5441         if (vtisochctrl == 0x10)
5442                 return;
5443
5444         /* Zero TLB entries? You get to ride the short bus to school. */
5445         if (!vtisochctrl) {
5446                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5447                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5448                      dmi_get_system_info(DMI_BIOS_VENDOR),
5449                      dmi_get_system_info(DMI_BIOS_VERSION),
5450                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5451                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5452                 return;
5453         }
5454
5455         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5456                vtisochctrl);
5457 }