drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
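/*
 * AGAW (adjusted guest address width) helpers. As the formulas below
 * show, each page-table level resolves LEVEL_STRIDE (9) bits, so an
 * agaw of N corresponds to an (N + 2)-level table covering 30 + 9 * N
 * address bits, capped at MAX_AGAW_WIDTH. For example, agaw 2 is a
 * 4-level table (48-bit) and agaw 3 is a 5-level table (57-bit,
 * i.e. DEFAULT_DOMAIN_ADDRESS_WIDTH).
 */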
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic the kernel if VT-d can't be successfully enabled
179  * (used when the kernel is launched with TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
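/*
 * Context-entry accessors. The bit layout implied by the helpers
 * below: in the low quadword, bit 0 is the present bit, bit 1 the
 * fault-processing disable bit, bits 3:2 the translation type,
 * bit 11 the PASID-enable bit, and bits 63:12 hold the address-space
 * root (page-table pointer). In the high quadword, bits 2:0 encode
 * the address width, bit 3 is a software-only "copied" marker for
 * entries inherited from a previous kernel, and bits 23:8 hold the
 * domain id.
 */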
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a static identity-mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either first level or second level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first level page table, otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * Domain represents a virtual machine which demands iommu nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of registered intel_iommus; used to index g_iommus[] */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359                                  struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361                                struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364                                      struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366                                             dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX            2
391 #define IDENTMAP_AZALIA         4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
402                                 to_pci_dev(d)->untrusted)
403
404 /*
405  * Iterate over elements in device_domain_list and call the specified
406  * callback @fn against each element.
407  */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409                                      void *data), void *data)
410 {
411         int ret = 0;
412         unsigned long flags;
413         struct device_domain_info *info;
414
415         spin_lock_irqsave(&device_domain_lock, flags);
416         list_for_each_entry(info, &device_domain_list, global) {
417                 ret = fn(info, data);
418                 if (ret) {
419                         spin_unlock_irqrestore(&device_domain_lock, flags);
420                         return ret;
421                 }
422         }
423         spin_unlock_irqrestore(&device_domain_lock, flags);
424
425         return 0;
426 }
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442         u32 gsts;
443
444         gsts = readl(iommu->reg + DMAR_GSTS_REG);
445         if (gsts & DMA_GSTS_TES)
446                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert a generic 'struct iommu_domain' to a private 'struct dmar_domain' */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452         return container_of(dom, struct dmar_domain, domain);
453 }
454
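/*
 * Parse the intel_iommu= early parameter. The recognized options,
 * taken from the parser below, are comma-separated: "on", "off",
 * "igfx_off", "forcedac", "strict", "sp_off", "sm_on",
 * "tboot_noforce" and "nobounce", e.g. intel_iommu=on,sm_on,strict.
 */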
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         pr_info("IOMMU enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         no_platform_optin = 1;
466                         pr_info("IOMMU disabled\n");
467                 } else if (!strncmp(str, "igfx_off", 8)) {
468                         dmar_map_gfx = 0;
469                         pr_info("Disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         pr_info("Forcing DAC for PCI devices\n");
472                         dmar_forcedac = 1;
473                 } else if (!strncmp(str, "strict", 6)) {
474                         pr_info("Disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         pr_info("Disable supported super page\n");
478                         intel_iommu_superpage = 0;
479                 } else if (!strncmp(str, "sm_on", 5)) {
480                         pr_info("Intel-IOMMU: scalable mode supported\n");
481                         intel_iommu_sm = 1;
482                 } else if (!strncmp(str, "tboot_noforce", 13)) {
483                         printk(KERN_INFO
484                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485                         intel_iommu_tboot_noforce = 1;
486                 } else if (!strncmp(str, "nobounce", 8)) {
487                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488                         intel_no_bounce = 1;
489                 }
490
491                 str += strcspn(str, ",");
492                 while (*str == ',')
493                         str++;
494         }
495         return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
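/*
 * Per-IOMMU domain-ID to domain lookup. iommu->domains is a two-level
 * table: the upper 8 bits of the domain id select a lazily allocated
 * chunk of 256 pointers and the lower 8 bits index into that chunk.
 */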
502 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504         struct dmar_domain **domains;
505         int idx = did >> 8;
506
507         domains = iommu->domains[idx];
508         if (!domains)
509                 return NULL;
510
511         return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515                              struct dmar_domain *domain)
516 {
517         struct dmar_domain **domains;
518         int idx = did >> 8;
519
520         if (!iommu->domains[idx]) {
521                 size_t size = 256 * sizeof(struct dmar_domain *);
522                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523         }
524
525         domains = iommu->domains[idx];
526         if (WARN_ON(!domains))
527                 return;
528         else
529                 domains[did & 0xff] = domain;
530 }
531
532 void *alloc_pgtable_page(int node)
533 {
534         struct page *page;
535         void *vaddr = NULL;
536
537         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538         if (page)
539                 vaddr = page_address(page);
540         return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545         free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555         kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void *alloc_devinfo_mem(void)
559 {
560         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565         kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579                                        unsigned long pfn)
580 {
581         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588         unsigned long sagaw;
589         int agaw = -1;
590
591         sagaw = cap_sagaw(iommu->cap);
592         for (agaw = width_to_agaw(max_gaw);
593              agaw >= 0; agaw--) {
594                 if (test_bit(agaw, &sagaw))
595                         break;
596         }
597
598         return agaw;
599 }
600
601 /*
602  * Calculate max SAGAW for each iommu.
603  */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * calculate agaw for each iommu.
611  * "SAGAW" may be different across iommus; use a default agaw, and
612  * fall back to a smaller supported agaw for iommus that don't support the default.
613  */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622         int iommu_id;
623
624         /* si_domain and vm domain should not get here. */
625         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626                 return NULL;
627
628         for_each_domain_iommu(iommu_id, domain)
629                 break;
630
631         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632                 return NULL;
633
634         return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639         struct dmar_drhd_unit *drhd;
640         struct intel_iommu *iommu;
641         bool found = false;
642         int i;
643
644         domain->iommu_coherency = 1;
645
646         for_each_domain_iommu(i, domain) {
647                 found = true;
648                 if (!ecap_coherent(g_iommus[i]->ecap)) {
649                         domain->iommu_coherency = 0;
650                         break;
651                 }
652         }
653         if (found)
654                 return;
655
656         /* No hardware attached; use lowest common denominator */
657         rcu_read_lock();
658         for_each_active_iommu(iommu, drhd) {
659                 if (!ecap_coherent(iommu->ecap)) {
660                         domain->iommu_coherency = 0;
661                         break;
662                 }
663         }
664         rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669         struct dmar_drhd_unit *drhd;
670         struct intel_iommu *iommu;
671         int ret = 1;
672
673         rcu_read_lock();
674         for_each_active_iommu(iommu, drhd) {
675                 if (iommu != skip) {
676                         if (!ecap_sc_support(iommu->ecap)) {
677                                 ret = 0;
678                                 break;
679                         }
680                 }
681         }
682         rcu_read_unlock();
683
684         return ret;
685 }
686
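/*
 * Work out the largest superpage level usable by every IOMMU in the
 * domain (except @skip). The mask starts with both candidate bits set
 * (bit 0: 2MiB, bit 1: 1GiB, per the SLLPS capability encoding) and is
 * narrowed by each IOMMU's capability; for first-level tables only the
 * 1GiB capability is checked. The return value is fls(mask), i.e. 0, 1
 * or 2 extra page-size levels beyond 4KiB.
 */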
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688                                          struct intel_iommu *skip)
689 {
690         struct dmar_drhd_unit *drhd;
691         struct intel_iommu *iommu;
692         int mask = 0x3;
693
694         if (!intel_iommu_superpage) {
695                 return 0;
696         }
697
698         /* set iommu_superpage to the smallest common denominator */
699         rcu_read_lock();
700         for_each_active_iommu(iommu, drhd) {
701                 if (iommu != skip) {
702                         if (domain && domain_use_first_level(domain)) {
703                                 if (!cap_fl1gp_support(iommu->cap))
704                                         mask = 0x1;
705                         } else {
706                                 mask &= cap_super_page_val(iommu->cap);
707                         }
708
709                         if (!mask)
710                                 break;
711                 }
712         }
713         rcu_read_unlock();
714
715         return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721         domain_update_iommu_coherency(domain);
722         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
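/*
 * Return the context entry for @bus/@devfn, optionally allocating the
 * context table when @alloc is set. In scalable mode each root entry
 * is split in two: the lower half covers devfn 0x00-0x7f and the upper
 * half covers devfn 0x80-0xff, and every scalable-mode context entry
 * is twice the legacy size, hence the devfn doubling below.
 */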
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727                                          u8 devfn, int alloc)
728 {
729         struct root_entry *root = &iommu->root_entry[bus];
730         struct context_entry *context;
731         u64 *entry;
732
733         entry = &root->lo;
734         if (sm_supported(iommu)) {
735                 if (devfn >= 0x80) {
736                         devfn -= 0x80;
737                         entry = &root->hi;
738                 }
739                 devfn *= 2;
740         }
741         if (*entry & 1)
742                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743         else {
744                 unsigned long phy_addr;
745                 if (!alloc)
746                         return NULL;
747
748                 context = alloc_pgtable_page(iommu->node);
749                 if (!context)
750                         return NULL;
751
752                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753                 phy_addr = virt_to_phys((void *)context);
754                 *entry = phy_addr | 1;
755                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756         }
757         return &context[devfn];
758 }
759
760 static int iommu_dummy(struct device *dev)
761 {
762         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 /**
766  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
767  *                               sub-hierarchy of a candidate PCI-PCI bridge
768  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
769  * @bridge: the candidate PCI-PCI bridge
770  *
771  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
772  */
773 static bool
774 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
775 {
776         struct pci_dev *pdev, *pbridge;
777
778         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
779                 return false;
780
781         pdev = to_pci_dev(dev);
782         pbridge = to_pci_dev(bridge);
783
784         if (pbridge->subordinate &&
785             pbridge->subordinate->number <= pdev->bus->number &&
786             pbridge->subordinate->busn_res.end >= pdev->bus->number)
787                 return true;
788
789         return false;
790 }
791
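/*
 * Find the IOMMU (DRHD unit) that translates @dev by walking the
 * device scopes reported in the DMAR table, and report the bus/devfn
 * that should be used when programming its context entry. PCI VFs are
 * matched via their PF, and devices behind a listed PCI bridge or
 * under an INCLUDE_ALL unit fall back to their own bus/devfn.
 */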
792 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
793 {
794         struct dmar_drhd_unit *drhd = NULL;
795         struct intel_iommu *iommu;
796         struct device *tmp;
797         struct pci_dev *pdev = NULL;
798         u16 segment = 0;
799         int i;
800
801         if (iommu_dummy(dev))
802                 return NULL;
803
804         if (dev_is_pci(dev)) {
805                 struct pci_dev *pf_pdev;
806
807                 pdev = to_pci_dev(dev);
808
809 #ifdef CONFIG_X86
810                 /* VMD child devices currently cannot be handled individually */
811                 if (is_vmd(pdev->bus))
812                         return NULL;
813 #endif
814
815                 /* VFs aren't listed in scope tables; we need to look up
816                  * the PF instead to find the IOMMU. */
817                 pf_pdev = pci_physfn(pdev);
818                 dev = &pf_pdev->dev;
819                 segment = pci_domain_nr(pdev->bus);
820         } else if (has_acpi_companion(dev))
821                 dev = &ACPI_COMPANION(dev)->dev;
822
823         rcu_read_lock();
824         for_each_active_iommu(iommu, drhd) {
825                 if (pdev && segment != drhd->segment)
826                         continue;
827
828                 for_each_active_dev_scope(drhd->devices,
829                                           drhd->devices_cnt, i, tmp) {
830                         if (tmp == dev) {
831                                 /* For a VF use its original BDF# not that of the PF
832                                  * which we used for the IOMMU lookup. Strictly speaking
833                                  * we could do this for all PCI devices; we only need to
834                                  * get the BDF# from the scope table for ACPI matches. */
835                                 if (pdev && pdev->is_virtfn)
836                                         goto got_pdev;
837
838                                 *bus = drhd->devices[i].bus;
839                                 *devfn = drhd->devices[i].devfn;
840                                 goto out;
841                         }
842
843                         if (is_downstream_to_pci_bridge(dev, tmp))
844                                 goto got_pdev;
845                 }
846
847                 if (pdev && drhd->include_all) {
848                 got_pdev:
849                         *bus = pdev->bus->number;
850                         *devfn = pdev->devfn;
851                         goto out;
852                 }
853         }
854         iommu = NULL;
855  out:
856         rcu_read_unlock();
857
858         return iommu;
859 }
860
861 static void domain_flush_cache(struct dmar_domain *domain,
862                                void *addr, int size)
863 {
864         if (!domain->iommu_coherency)
865                 clflush_cache_range(addr, size);
866 }
867
868 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
869 {
870         struct context_entry *context;
871         int ret = 0;
872         unsigned long flags;
873
874         spin_lock_irqsave(&iommu->lock, flags);
875         context = iommu_context_addr(iommu, bus, devfn, 0);
876         if (context)
877                 ret = context_present(context);
878         spin_unlock_irqrestore(&iommu->lock, flags);
879         return ret;
880 }
881
882 static void free_context_table(struct intel_iommu *iommu)
883 {
884         int i;
885         unsigned long flags;
886         struct context_entry *context;
887
888         spin_lock_irqsave(&iommu->lock, flags);
889         if (!iommu->root_entry) {
890                 goto out;
891         }
892         for (i = 0; i < ROOT_ENTRY_NR; i++) {
893                 context = iommu_context_addr(iommu, i, 0, 0);
894                 if (context)
895                         free_pgtable_page(context);
896
897                 if (!sm_supported(iommu))
898                         continue;
899
900                 context = iommu_context_addr(iommu, i, 0x80, 0);
901                 if (context)
902                         free_pgtable_page(context);
903
904         }
905         free_pgtable_page(iommu->root_entry);
906         iommu->root_entry = NULL;
907 out:
908         spin_unlock_irqrestore(&iommu->lock, flags);
909 }
910
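/*
 * Walk (and, if needed, build) the page table down to @pfn.
 * *target_level selects the level at which to stop (1 is the 4KiB
 * leaf level); passing 0 means "stop at whatever leaf, superpage or
 * non-present entry is found first", and the level actually reached
 * is passed back through *target_level. When a specific level is
 * requested, missing intermediate tables are allocated along the way.
 */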
911 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
912                                       unsigned long pfn, int *target_level)
913 {
914         struct dma_pte *parent, *pte;
915         int level = agaw_to_level(domain->agaw);
916         int offset;
917
918         BUG_ON(!domain->pgd);
919
920         if (!domain_pfn_supported(domain, pfn))
921                 /* Address beyond IOMMU's addressing capabilities. */
922                 return NULL;
923
924         parent = domain->pgd;
925
926         while (1) {
927                 void *tmp_page;
928
929                 offset = pfn_level_offset(pfn, level);
930                 pte = &parent[offset];
931                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
932                         break;
933                 if (level == *target_level)
934                         break;
935
936                 if (!dma_pte_present(pte)) {
937                         uint64_t pteval;
938
939                         tmp_page = alloc_pgtable_page(domain->nid);
940
941                         if (!tmp_page)
942                                 return NULL;
943
944                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
945                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
946                         if (domain_use_first_level(domain))
947                                 pteval |= DMA_FL_PTE_XD;
948                         if (cmpxchg64(&pte->val, 0ULL, pteval))
949                                 /* Someone else set it while we were thinking; use theirs. */
950                                 free_pgtable_page(tmp_page);
951                         else
952                                 domain_flush_cache(domain, pte, sizeof(*pte));
953                 }
954                 if (level == 1)
955                         break;
956
957                 parent = phys_to_virt(dma_pte_addr(pte));
958                 level--;
959         }
960
961         if (!*target_level)
962                 *target_level = level;
963
964         return pte;
965 }
966
967 /* return address's pte at specific level */
968 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
969                                          unsigned long pfn,
970                                          int level, int *large_page)
971 {
972         struct dma_pte *parent, *pte;
973         int total = agaw_to_level(domain->agaw);
974         int offset;
975
976         parent = domain->pgd;
977         while (level <= total) {
978                 offset = pfn_level_offset(pfn, total);
979                 pte = &parent[offset];
980                 if (level == total)
981                         return pte;
982
983                 if (!dma_pte_present(pte)) {
984                         *large_page = total;
985                         break;
986                 }
987
988                 if (dma_pte_superpage(pte)) {
989                         *large_page = total;
990                         return pte;
991                 }
992
993                 parent = phys_to_virt(dma_pte_addr(pte));
994                 total--;
995         }
996         return NULL;
997 }
998
999 /* clear last level pte, a tlb flush should be followed */
1000 static void dma_pte_clear_range(struct dmar_domain *domain,
1001                                 unsigned long start_pfn,
1002                                 unsigned long last_pfn)
1003 {
1004         unsigned int large_page;
1005         struct dma_pte *first_pte, *pte;
1006
1007         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1008         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1009         BUG_ON(start_pfn > last_pfn);
1010
1011         /* we don't need lock here; nobody else touches the iova range */
1012         do {
1013                 large_page = 1;
1014                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1015                 if (!pte) {
1016                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1017                         continue;
1018                 }
1019                 do {
1020                         dma_clear_pte(pte);
1021                         start_pfn += lvl_to_nr_pages(large_page);
1022                         pte++;
1023                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1024
1025                 domain_flush_cache(domain, first_pte,
1026                                    (void *)pte - (void *)first_pte);
1027
1028         } while (start_pfn && start_pfn <= last_pfn);
1029 }
1030
1031 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1032                                int retain_level, struct dma_pte *pte,
1033                                unsigned long pfn, unsigned long start_pfn,
1034                                unsigned long last_pfn)
1035 {
1036         pfn = max(start_pfn, pfn);
1037         pte = &pte[pfn_level_offset(pfn, level)];
1038
1039         do {
1040                 unsigned long level_pfn;
1041                 struct dma_pte *level_pte;
1042
1043                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1044                         goto next;
1045
1046                 level_pfn = pfn & level_mask(level);
1047                 level_pte = phys_to_virt(dma_pte_addr(pte));
1048
1049                 if (level > 2) {
1050                         dma_pte_free_level(domain, level - 1, retain_level,
1051                                            level_pte, level_pfn, start_pfn,
1052                                            last_pfn);
1053                 }
1054
1055                 /*
1056                  * Free the page table if we're below the level we want to
1057                  * retain and the range covers the entire table.
1058                  */
1059                 if (level < retain_level && !(start_pfn > level_pfn ||
1060                       last_pfn < level_pfn + level_size(level) - 1)) {
1061                         dma_clear_pte(pte);
1062                         domain_flush_cache(domain, pte, sizeof(*pte));
1063                         free_pgtable_page(level_pte);
1064                 }
1065 next:
1066                 pfn += level_size(level);
1067         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1068 }
1069
1070 /*
1071  * clear last level (leaf) ptes and free page table pages below the
1072  * level we wish to keep intact.
1073  */
1074 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1075                                    unsigned long start_pfn,
1076                                    unsigned long last_pfn,
1077                                    int retain_level)
1078 {
1079         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1080         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1081         BUG_ON(start_pfn > last_pfn);
1082
1083         dma_pte_clear_range(domain, start_pfn, last_pfn);
1084
1085         /* We don't need lock here; nobody else touches the iova range */
1086         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1087                            domain->pgd, 0, start_pfn, last_pfn);
1088
1089         /* free pgd */
1090         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1091                 free_pgtable_page(domain->pgd);
1092                 domain->pgd = NULL;
1093         }
1094 }
1095
1096 /* When a page at a given level is being unlinked from its parent, we don't
1097    need to *modify* it at all. All we need to do is make a list of all the
1098    pages which can be freed just as soon as we've flushed the IOTLB and we
1099    know the hardware page-walk will no longer touch them.
1100    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1101    be freed. */
1102 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1103                                             int level, struct dma_pte *pte,
1104                                             struct page *freelist)
1105 {
1106         struct page *pg;
1107
1108         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1109         pg->freelist = freelist;
1110         freelist = pg;
1111
1112         if (level == 1)
1113                 return freelist;
1114
1115         pte = page_address(pg);
1116         do {
1117                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1118                         freelist = dma_pte_list_pagetables(domain, level - 1,
1119                                                            pte, freelist);
1120                 pte++;
1121         } while (!first_pte_in_page(pte));
1122
1123         return freelist;
1124 }
1125
1126 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1127                                         struct dma_pte *pte, unsigned long pfn,
1128                                         unsigned long start_pfn,
1129                                         unsigned long last_pfn,
1130                                         struct page *freelist)
1131 {
1132         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1133
1134         pfn = max(start_pfn, pfn);
1135         pte = &pte[pfn_level_offset(pfn, level)];
1136
1137         do {
1138                 unsigned long level_pfn;
1139
1140                 if (!dma_pte_present(pte))
1141                         goto next;
1142
1143                 level_pfn = pfn & level_mask(level);
1144
1145                 /* If range covers entire pagetable, free it */
1146                 if (start_pfn <= level_pfn &&
1147                     last_pfn >= level_pfn + level_size(level) - 1) {
1148                         /* These subordinate page tables are going away entirely. Don't
1149                            bother to clear them; we're just going to *free* them. */
1150                         if (level > 1 && !dma_pte_superpage(pte))
1151                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1152
1153                         dma_clear_pte(pte);
1154                         if (!first_pte)
1155                                 first_pte = pte;
1156                         last_pte = pte;
1157                 } else if (level > 1) {
1158                         /* Recurse down into a level that isn't *entirely* obsolete */
1159                         freelist = dma_pte_clear_level(domain, level - 1,
1160                                                        phys_to_virt(dma_pte_addr(pte)),
1161                                                        level_pfn, start_pfn, last_pfn,
1162                                                        freelist);
1163                 }
1164 next:
1165                 pfn += level_size(level);
1166         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1167
1168         if (first_pte)
1169                 domain_flush_cache(domain, first_pte,
1170                                    (void *)++last_pte - (void *)first_pte);
1171
1172         return freelist;
1173 }
1174
1175 /* We can't just free the pages because the IOMMU may still be walking
1176    the page tables, and may have cached the intermediate levels. The
1177    pages can only be freed after the IOTLB flush has been done. */
1178 static struct page *domain_unmap(struct dmar_domain *domain,
1179                                  unsigned long start_pfn,
1180                                  unsigned long last_pfn)
1181 {
1182         struct page *freelist;
1183
1184         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1185         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1186         BUG_ON(start_pfn > last_pfn);
1187
1188         /* we don't need lock here; nobody else touches the iova range */
1189         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1190                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1191
1192         /* free pgd */
1193         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1194                 struct page *pgd_page = virt_to_page(domain->pgd);
1195                 pgd_page->freelist = freelist;
1196                 freelist = pgd_page;
1197
1198                 domain->pgd = NULL;
1199         }
1200
1201         return freelist;
1202 }
1203
1204 static void dma_free_pagelist(struct page *freelist)
1205 {
1206         struct page *pg;
1207
1208         while ((pg = freelist)) {
1209                 freelist = pg->freelist;
1210                 free_pgtable_page(page_address(pg));
1211         }
1212 }
1213
1214 static void iova_entry_free(unsigned long data)
1215 {
1216         struct page *freelist = (struct page *)data;
1217
1218         dma_free_pagelist(freelist);
1219 }
1220
1221 /* iommu handling */
1222 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1223 {
1224         struct root_entry *root;
1225         unsigned long flags;
1226
1227         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1228         if (!root) {
1229                 pr_err("Allocating root entry for %s failed\n",
1230                         iommu->name);
1231                 return -ENOMEM;
1232         }
1233
1234         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1235
1236         spin_lock_irqsave(&iommu->lock, flags);
1237         iommu->root_entry = root;
1238         spin_unlock_irqrestore(&iommu->lock, flags);
1239
1240         return 0;
1241 }
1242
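/*
 * Program the root table address (tagged as a scalable-mode table when
 * supported) into DMAR_RTADDR_REG and issue a Set Root Table Pointer
 * command through the global command register, waiting for the status
 * register to report completion.
 */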
1243 static void iommu_set_root_entry(struct intel_iommu *iommu)
1244 {
1245         u64 addr;
1246         u32 sts;
1247         unsigned long flag;
1248
1249         addr = virt_to_phys(iommu->root_entry);
1250         if (sm_supported(iommu))
1251                 addr |= DMA_RTADDR_SMT;
1252
1253         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1255
1256         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1257
1258         /* Make sure hardware completes it */
1259         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1260                       readl, (sts & DMA_GSTS_RTPS), sts);
1261
1262         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1263 }
1264
1265 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1266 {
1267         u32 val;
1268         unsigned long flag;
1269
1270         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1271                 return;
1272
1273         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1274         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1275
1276         /* Make sure hardware completes it */
1277         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1278                       readl, (!(val & DMA_GSTS_WBFS)), val);
1279
1280         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1281 }
1282
1283 /* Invalidate entries in the context cache and wait for the hardware to finish */
1284 static void __iommu_flush_context(struct intel_iommu *iommu,
1285                                   u16 did, u16 source_id, u8 function_mask,
1286                                   u64 type)
1287 {
1288         u64 val = 0;
1289         unsigned long flag;
1290
1291         switch (type) {
1292         case DMA_CCMD_GLOBAL_INVL:
1293                 val = DMA_CCMD_GLOBAL_INVL;
1294                 break;
1295         case DMA_CCMD_DOMAIN_INVL:
1296                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1297                 break;
1298         case DMA_CCMD_DEVICE_INVL:
1299                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1300                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1301                 break;
1302         default:
1303                 BUG();
1304         }
1305         val |= DMA_CCMD_ICC;
1306
1307         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1308         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1309
1310         /* Make sure hardware completes it */
1311         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1312                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1313
1314         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1315 }
1316
1317 /* Invalidate IOTLB entries of the requested granularity and wait for the hardware to finish */
1318 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1319                                 u64 addr, unsigned int size_order, u64 type)
1320 {
1321         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1322         u64 val = 0, val_iva = 0;
1323         unsigned long flag;
1324
1325         switch (type) {
1326         case DMA_TLB_GLOBAL_FLUSH:
1327                 /* global flush doesn't need to set IVA_REG */
1328                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1329                 break;
1330         case DMA_TLB_DSI_FLUSH:
1331                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1332                 break;
1333         case DMA_TLB_PSI_FLUSH:
1334                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1335                 /* IH bit is passed in as part of address */
1336                 val_iva = size_order | addr;
1337                 break;
1338         default:
1339                 BUG();
1340         }
1341         /* Note: set drain read/write */
1342 #if 0
1343         /*
1344          * This is probably only needed to be extra safe. It looks like
1345          * we can ignore it without any impact.
1346          */
1347         if (cap_read_drain(iommu->cap))
1348                 val |= DMA_TLB_READ_DRAIN;
1349 #endif
1350         if (cap_write_drain(iommu->cap))
1351                 val |= DMA_TLB_WRITE_DRAIN;
1352
1353         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1354         /* Note: Only uses first TLB reg currently */
1355         if (val_iva)
1356                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1357         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1358
1359         /* Make sure hardware completes it */
1360         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1361                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1362
1363         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1364
1365         /* check IOTLB invalidation granularity */
1366         if (DMA_TLB_IAIG(val) == 0)
1367                 pr_err("Flush IOTLB failed\n");
1368         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1369                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1370                         (unsigned long long)DMA_TLB_IIRG(type),
1371                         (unsigned long long)DMA_TLB_IAIG(val));
1372 }
1373
1374 static struct device_domain_info *
1375 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1376                          u8 bus, u8 devfn)
1377 {
1378         struct device_domain_info *info;
1379
1380         assert_spin_locked(&device_domain_lock);
1381
1382         if (!iommu->qi)
1383                 return NULL;
1384
1385         list_for_each_entry(info, &domain->devices, link)
1386                 if (info->iommu == iommu && info->bus == bus &&
1387                     info->devfn == devfn) {
1388                         if (info->ats_supported && info->dev)
1389                                 return info;
1390                         break;
1391                 }
1392
1393         return NULL;
1394 }
1395
1396 static void domain_update_iotlb(struct dmar_domain *domain)
1397 {
1398         struct device_domain_info *info;
1399         bool has_iotlb_device = false;
1400
1401         assert_spin_locked(&device_domain_lock);
1402
1403         list_for_each_entry(info, &domain->devices, link) {
1404                 struct pci_dev *pdev;
1405
1406                 if (!info->dev || !dev_is_pci(info->dev))
1407                         continue;
1408
1409                 pdev = to_pci_dev(info->dev);
1410                 if (pdev->ats_enabled) {
1411                         has_iotlb_device = true;
1412                         break;
1413                 }
1414         }
1415
1416         domain->has_iotlb_device = has_iotlb_device;
1417 }
1418
1419 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1420 {
1421         struct pci_dev *pdev;
1422
1423         assert_spin_locked(&device_domain_lock);
1424
1425         if (!info || !dev_is_pci(info->dev))
1426                 return;
1427
1428         pdev = to_pci_dev(info->dev);
1429         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1430          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1431          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1432          * reserved, which should be set to 0.
1433          */
1434         if (!ecap_dit(info->iommu->ecap))
1435                 info->pfsid = 0;
1436         else {
1437                 struct pci_dev *pf_pdev;
1438
1439                 /* pdev will be returned if device is not a vf */
1440                 pf_pdev = pci_physfn(pdev);
1441                 info->pfsid = pci_dev_id(pf_pdev);
1442         }
1443
1444 #ifdef CONFIG_INTEL_IOMMU_SVM
1445         /* The PCIe spec, in its wisdom, declares that the behaviour of
1446            the device if you enable PASID support after ATS support is
1447            undefined. So always enable PASID support on devices which
1448            have it, even if we can't yet know if we're ever going to
1449            use it. */
1450         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1451                 info->pasid_enabled = 1;
1452
1453         if (info->pri_supported &&
1454             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1455             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1456                 info->pri_enabled = 1;
1457 #endif
1458         if (!pdev->untrusted && info->ats_supported &&
1459             pci_ats_page_aligned(pdev) &&
1460             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1461                 info->ats_enabled = 1;
1462                 domain_update_iotlb(info->domain);
1463                 info->ats_qdep = pci_ats_queue_depth(pdev);
1464         }
1465 }
1466
1467 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1468 {
1469         struct pci_dev *pdev;
1470
1471         assert_spin_locked(&device_domain_lock);
1472
1473         if (!dev_is_pci(info->dev))
1474                 return;
1475
1476         pdev = to_pci_dev(info->dev);
1477
1478         if (info->ats_enabled) {
1479                 pci_disable_ats(pdev);
1480                 info->ats_enabled = 0;
1481                 domain_update_iotlb(info->domain);
1482         }
1483 #ifdef CONFIG_INTEL_IOMMU_SVM
1484         if (info->pri_enabled) {
1485                 pci_disable_pri(pdev);
1486                 info->pri_enabled = 0;
1487         }
1488         if (info->pasid_enabled) {
1489                 pci_disable_pasid(pdev);
1490                 info->pasid_enabled = 0;
1491         }
1492 #endif
1493 }
1494
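/*
 * Issue a device-IOTLB (ATS) invalidation for the given address range
 * to every ATS-enabled device attached to the domain, using each
 * device's queue depth and PF source-id recorded at attach time.
 */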
1495 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1496                                   u64 addr, unsigned mask)
1497 {
1498         u16 sid, qdep;
1499         unsigned long flags;
1500         struct device_domain_info *info;
1501
1502         if (!domain->has_iotlb_device)
1503                 return;
1504
1505         spin_lock_irqsave(&device_domain_lock, flags);
1506         list_for_each_entry(info, &domain->devices, link) {
1507                 if (!info->ats_enabled)
1508                         continue;
1509
1510                 sid = info->bus << 8 | info->devfn;
1511                 qdep = info->ats_qdep;
1512                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1513                                 qdep, addr, mask);
1514         }
1515         spin_unlock_irqrestore(&device_domain_lock, flags);
1516 }
1517
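/*
 * PASID-based IOTLB invalidation for domains using first-level
 * translation: flush the domain's default PASID (if one is set up)
 * and, when devices are attached, the RID2PASID entry used for
 * ordinary DMA without a PASID.
 */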
1518 static void domain_flush_piotlb(struct intel_iommu *iommu,
1519                                 struct dmar_domain *domain,
1520                                 u64 addr, unsigned long npages, bool ih)
1521 {
1522         u16 did = domain->iommu_did[iommu->seq_id];
1523
1524         if (domain->default_pasid)
1525                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1526                                 addr, npages, ih);
1527
1528         if (!list_empty(&domain->devices))
1529                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1530 }
1531
1532 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1533                                   struct dmar_domain *domain,
1534                                   unsigned long pfn, unsigned int pages,
1535                                   int ih, int map)
1536 {
1537         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1538         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1539         u16 did = domain->iommu_did[iommu->seq_id];
1540
1541         BUG_ON(pages == 0);
1542
1543         if (ih)
1544                 ih = 1 << 6;
1545
1546         if (domain_use_first_level(domain)) {
1547                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1548         } else {
1549                 /*
1550                  * Fall back to a domain-selective flush if there is no PSI support
1551                  * or the size is too big. PSI requires the flush size to be 2^x pages
1552                  * and the base address to be naturally aligned to that size.
1553                  */
1554                 if (!cap_pgsel_inv(iommu->cap) ||
1555                     mask > cap_max_amask_val(iommu->cap))
1556                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1557                                                         DMA_TLB_DSI_FLUSH);
1558                 else
1559                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1560                                                         DMA_TLB_PSI_FLUSH);
1561         }
1562
1563         /*
1564                  * In caching mode, changes of pages from non-present to present require
1565                  * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1566          */
1567         if (!cap_caching_mode(iommu->cap) || !map)
1568                 iommu_flush_dev_iotlb(domain, addr, mask);
1569 }
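/*
 * Worked example for the PSI mask above: flushing pages = 9 gives
 * __roundup_pow_of_two(9) = 16 and mask = ilog2(16) = 4, so the
 * page-selective invalidation covers 16 VT-d pages (64KiB with 4KiB
 * pages); per the comment above, the base address is expected to be
 * naturally aligned to that size.
 */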
1570
1571 /* Notification for newly created mappings */
1572 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1573                                         struct dmar_domain *domain,
1574                                         unsigned long pfn, unsigned int pages)
1575 {
1576         /*
1577          * It's a non-present to present mapping. Only flush if caching mode
1578          * and second level.
1579          */
1580         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1581                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1582         else
1583                 iommu_flush_write_buffer(iommu);
1584 }
1585
1586 static void iommu_flush_iova(struct iova_domain *iovad)
1587 {
1588         struct dmar_domain *domain;
1589         int idx;
1590
1591         domain = container_of(iovad, struct dmar_domain, iovad);
1592
1593         for_each_domain_iommu(idx, domain) {
1594                 struct intel_iommu *iommu = g_iommus[idx];
1595                 u16 did = domain->iommu_did[iommu->seq_id];
1596
1597                 if (domain_use_first_level(domain))
1598                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1599                 else
1600                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1601                                                  DMA_TLB_DSI_FLUSH);
1602
1603                 if (!cap_caching_mode(iommu->cap))
1604                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1605                                               0, MAX_AGAW_PFN_WIDTH);
1606         }
1607 }
1608
1609 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1610 {
1611         u32 pmen;
1612         unsigned long flags;
1613
1614         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1615                 return;
1616
1617         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1618         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1619         pmen &= ~DMA_PMEN_EPM;
1620         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1621
1622         /* wait for the protected region status bit to clear */
1623         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1624                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1625
1626         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1627 }
1628
1629 static void iommu_enable_translation(struct intel_iommu *iommu)
1630 {
1631         u32 sts;
1632         unsigned long flags;
1633
1634         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1635         iommu->gcmd |= DMA_GCMD_TE;
1636         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1637
1638         /* Make sure hardware completes it */
1639         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1640                       readl, (sts & DMA_GSTS_TES), sts);
1641
1642         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1643 }
1644
1645 static void iommu_disable_translation(struct intel_iommu *iommu)
1646 {
1647         u32 sts;
1648         unsigned long flag;
1649
1650         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1651         iommu->gcmd &= ~DMA_GCMD_TE;
1652         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1653
1654         /* Make sure hardware completes it */
1655         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1656                       readl, (!(sts & DMA_GSTS_TES)), sts);
1657
1658         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1659 }
1660
1661 static int iommu_init_domains(struct intel_iommu *iommu)
1662 {
1663         u32 ndomains, nlongs;
1664         size_t size;
1665
1666         ndomains = cap_ndoms(iommu->cap);
1667         pr_debug("%s: Number of Domains supported <%d>\n",
1668                  iommu->name, ndomains);
1669         nlongs = BITS_TO_LONGS(ndomains);
1670
1671         spin_lock_init(&iommu->lock);
1672
1673         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1674         if (!iommu->domain_ids) {
1675                 pr_err("%s: Allocating domain id array failed\n",
1676                        iommu->name);
1677                 return -ENOMEM;
1678         }
1679
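        /*
         * iommu->domains is a two-level table: ALIGN(ndomains, 256) >> 8
         * top-level slots, each pointing to a chunk of 256 dmar_domain
         * pointers. Only chunk 0 is allocated up front here; the remaining
         * chunks are presumably allocated on demand when domain ids in
         * their range are first used (see set_iommu_domain()).
         */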
1680         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1681         iommu->domains = kzalloc(size, GFP_KERNEL);
1682
1683         if (iommu->domains) {
1684                 size = 256 * sizeof(struct dmar_domain *);
1685                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1686         }
1687
1688         if (!iommu->domains || !iommu->domains[0]) {
1689                 pr_err("%s: Allocating domain array failed\n",
1690                        iommu->name);
1691                 kfree(iommu->domain_ids);
1692                 kfree(iommu->domains);
1693                 iommu->domain_ids = NULL;
1694                 iommu->domains    = NULL;
1695                 return -ENOMEM;
1696         }
1697
1698         /*
1699          * If Caching mode is set, then invalid translations are tagged
1700          * with domain-id 0, hence we need to pre-allocate it. We also
1701          * use domain-id 0 as a marker for non-allocated domain-id, so
1702          * make sure it is not used for a real domain.
1703          */
1704         set_bit(0, iommu->domain_ids);
1705
1706         /*
1707          * The VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1708          * entry for first-level or pass-through translation modes be
1709          * programmed with a domain id different from those used for
1710          * second-level or nested translation. We reserve a domain id for
1711          * this purpose.
1712          */
1713         if (sm_supported(iommu))
1714                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1715
1716         return 0;
1717 }
1718
1719 static void disable_dmar_iommu(struct intel_iommu *iommu)
1720 {
1721         struct device_domain_info *info, *tmp;
1722         unsigned long flags;
1723
1724         if (!iommu->domains || !iommu->domain_ids)
1725                 return;
1726
1727         spin_lock_irqsave(&device_domain_lock, flags);
1728         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1729                 if (info->iommu != iommu)
1730                         continue;
1731
1732                 if (!info->dev || !info->domain)
1733                         continue;
1734
1735                 __dmar_remove_one_dev_info(info);
1736         }
1737         spin_unlock_irqrestore(&device_domain_lock, flags);
1738
1739         if (iommu->gcmd & DMA_GCMD_TE)
1740                 iommu_disable_translation(iommu);
1741 }
1742
1743 static void free_dmar_iommu(struct intel_iommu *iommu)
1744 {
1745         if ((iommu->domains) && (iommu->domain_ids)) {
1746                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1747                 int i;
1748
1749                 for (i = 0; i < elems; i++)
1750                         kfree(iommu->domains[i]);
1751                 kfree(iommu->domains);
1752                 kfree(iommu->domain_ids);
1753                 iommu->domains = NULL;
1754                 iommu->domain_ids = NULL;
1755         }
1756
1757         g_iommus[iommu->seq_id] = NULL;
1758
1759         /* free context mapping */
1760         free_context_table(iommu);
1761
1762 #ifdef CONFIG_INTEL_IOMMU_SVM
1763         if (pasid_supported(iommu)) {
1764                 if (ecap_prs(iommu->ecap))
1765                         intel_svm_finish_prq(iommu);
1766         }
1767 #endif
1768 }
1769
1770 /*
1771  * Check and return whether first level is used by default for
1772  * DMA translation.
1773  */
1774 static bool first_level_by_default(void)
1775 {
1776         struct dmar_drhd_unit *drhd;
1777         struct intel_iommu *iommu;
1778         static int first_level_support = -1;
1779
1780         if (likely(first_level_support != -1))
1781                 return first_level_support;
1782
1783         first_level_support = 1;
1784
1785         rcu_read_lock();
1786         for_each_active_iommu(iommu, drhd) {
1787                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1788                         first_level_support = 0;
1789                         break;
1790                 }
1791         }
1792         rcu_read_unlock();
1793
1794         return first_level_support;
1795 }
1796
1797 static struct dmar_domain *alloc_domain(int flags)
1798 {
1799         struct dmar_domain *domain;
1800
1801         domain = alloc_domain_mem();
1802         if (!domain)
1803                 return NULL;
1804
1805         memset(domain, 0, sizeof(*domain));
1806         domain->nid = NUMA_NO_NODE;
1807         domain->flags = flags;
1808         if (first_level_by_default())
1809                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1810         domain->has_iotlb_device = false;
1811         INIT_LIST_HEAD(&domain->devices);
1812
1813         return domain;
1814 }
1815
1816 /* Must be called with device_domain_lock and iommu->lock held */
1817 static int domain_attach_iommu(struct dmar_domain *domain,
1818                                struct intel_iommu *iommu)
1819 {
1820         unsigned long ndomains;
1821         int num;
1822
1823         assert_spin_locked(&device_domain_lock);
1824         assert_spin_locked(&iommu->lock);
1825
1826         domain->iommu_refcnt[iommu->seq_id] += 1;
1827         domain->iommu_count += 1;
1828         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1829                 ndomains = cap_ndoms(iommu->cap);
1830                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1831
1832                 if (num >= ndomains) {
1833                         pr_err("%s: No free domain ids\n", iommu->name);
1834                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1835                         domain->iommu_count -= 1;
1836                         return -ENOSPC;
1837                 }
1838
1839                 set_bit(num, iommu->domain_ids);
1840                 set_iommu_domain(iommu, num, domain);
1841
1842                 domain->iommu_did[iommu->seq_id] = num;
1843                 domain->nid                      = iommu->node;
1844
1845                 domain_update_iommu_cap(domain);
1846         }
1847
1848         return 0;
1849 }
1850
1851 static int domain_detach_iommu(struct dmar_domain *domain,
1852                                struct intel_iommu *iommu)
1853 {
1854         int num, count;
1855
1856         assert_spin_locked(&device_domain_lock);
1857         assert_spin_locked(&iommu->lock);
1858
1859         domain->iommu_refcnt[iommu->seq_id] -= 1;
1860         count = --domain->iommu_count;
1861         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1862                 num = domain->iommu_did[iommu->seq_id];
1863                 clear_bit(num, iommu->domain_ids);
1864                 set_iommu_domain(iommu, num, NULL);
1865
1866                 domain_update_iommu_cap(domain);
1867                 domain->iommu_did[iommu->seq_id] = 0;
1868         }
1869
1870         return count;
1871 }
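/*
 * domain_attach_iommu()/domain_detach_iommu() keep a per-IOMMU reference
 * count for the domain: the first attach on a given IOMMU allocates a
 * domain id from that IOMMU's domain_ids bitmap and records it in
 * iommu_did[], and the last detach releases the id again. Both run with
 * device_domain_lock and iommu->lock held, as asserted above.
 */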
1872
1873 static struct iova_domain reserved_iova_list;
1874 static struct lock_class_key reserved_rbtree_key;
1875
1876 static int dmar_init_reserved_ranges(void)
1877 {
1878         struct pci_dev *pdev = NULL;
1879         struct iova *iova;
1880         int i;
1881
1882         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1883
1884         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1885                 &reserved_rbtree_key);
1886
1887         /* IOAPIC ranges shouldn't be accessed by DMA */
1888         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1889                 IOVA_PFN(IOAPIC_RANGE_END));
1890         if (!iova) {
1891                 pr_err("Reserve IOAPIC range failed\n");
1892                 return -ENODEV;
1893         }
1894
1895         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1896         for_each_pci_dev(pdev) {
1897                 struct resource *r;
1898
1899                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1900                         r = &pdev->resource[i];
1901                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1902                                 continue;
1903                         iova = reserve_iova(&reserved_iova_list,
1904                                             IOVA_PFN(r->start),
1905                                             IOVA_PFN(r->end));
1906                         if (!iova) {
1907                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1908                                 return -ENODEV;
1909                         }
1910                 }
1911         }
1912         return 0;
1913 }
1914
1915 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1916 {
1917         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1918 }
1919
1920 static inline int guestwidth_to_adjustwidth(int gaw)
1921 {
1922         int agaw;
1923         int r = (gaw - 12) % 9;
1924
1925         if (r == 0)
1926                 agaw = gaw;
1927         else
1928                 agaw = gaw + 9 - r;
1929         if (agaw > 64)
1930                 agaw = 64;
1931         return agaw;
1932 }
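/*
 * The adjusted width is the guest address width rounded up to the next
 * 9-bit page-table level above 12 bits, capped at 64. For example, a
 * guest width of 39 or 48 maps to itself, while 40 rounds up to 48
 * because (40 - 12) % 9 == 1.
 */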
1933
1934 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1935                        int guest_width)
1936 {
1937         int adjust_width, agaw;
1938         unsigned long sagaw;
1939         int ret;
1940
1941         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1942
1943         if (!intel_iommu_strict) {
1944                 ret = init_iova_flush_queue(&domain->iovad,
1945                                             iommu_flush_iova, iova_entry_free);
1946                 if (ret)
1947                         pr_info("iova flush queue initialization failed\n");
1948         }
1949
1950         domain_reserve_special_ranges(domain);
1951
1952         /* calculate AGAW */
1953         if (guest_width > cap_mgaw(iommu->cap))
1954                 guest_width = cap_mgaw(iommu->cap);
1955         domain->gaw = guest_width;
1956         adjust_width = guestwidth_to_adjustwidth(guest_width);
1957         agaw = width_to_agaw(adjust_width);
1958         sagaw = cap_sagaw(iommu->cap);
1959         if (!test_bit(agaw, &sagaw)) {
1960                 /* hardware doesn't support it, choose a bigger one */
1961                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1962                 agaw = find_next_bit(&sagaw, 5, agaw);
1963                 if (agaw >= 5)
1964                         return -ENODEV;
1965         }
1966         domain->agaw = agaw;
1967
1968         if (ecap_coherent(iommu->ecap))
1969                 domain->iommu_coherency = 1;
1970         else
1971                 domain->iommu_coherency = 0;
1972
1973         if (ecap_sc_support(iommu->ecap))
1974                 domain->iommu_snooping = 1;
1975         else
1976                 domain->iommu_snooping = 0;
1977
1978         if (intel_iommu_superpage)
1979                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1980         else
1981                 domain->iommu_superpage = 0;
1982
1983         domain->nid = iommu->node;
1984
1985         /* always allocate the top pgd */
1986         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1987         if (!domain->pgd)
1988                 return -ENOMEM;
1989         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1990         return 0;
1991 }
1992
1993 static void domain_exit(struct dmar_domain *domain)
1994 {
1995
1996         /* Remove associated devices and clear attached or cached domains */
1997         domain_remove_dev_info(domain);
1998
1999         /* destroy iovas */
2000         put_iova_domain(&domain->iovad);
2001
2002         if (domain->pgd) {
2003                 struct page *freelist;
2004
2005                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2006                 dma_free_pagelist(freelist);
2007         }
2008
2009         free_domain_mem(domain);
2010 }
2011
2012 /*
2013  * Get the PASID directory size for scalable mode context entry.
2014  * Value of X in the PDTS field of a scalable mode context entry
2015  * indicates PASID directory with 2^(X + 7) entries.
2016  */
2017 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2018 {
2019         int pds, max_pde;
2020
2021         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2022         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2023         if (pds < 7)
2024                 return 0;
2025
2026         return pds - 7;
2027 }
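/*
 * Example, assuming PASID_PDE_SHIFT is 6 (64 PASID-table entries per
 * directory entry): with table->max_pasid = 1 << 20, max_pde is 1 << 14,
 * find_first_bit() returns 14 and the function returns 7, i.e. a PASID
 * directory with 2^(7 + 7) = 16384 entries.
 */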
2028
2029 /*
2030  * Set the RID_PASID field of a scalable mode context entry. The
2031  * IOMMU hardware will use the PASID value set in this field for
2032  * DMA translations of DMA requests without PASID.
2033  */
2034 static inline void
2035 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2036 {
2037         context->hi |= pasid & ((1 << 20) - 1);
2038         context->hi |= (1 << 20);
2039 }
2040
2041 /*
2042  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2043  * entry.
2044  */
2045 static inline void context_set_sm_dte(struct context_entry *context)
2046 {
2047         context->lo |= (1 << 2);
2048 }
2049
2050 /*
2051  * Set the PRE(Page Request Enable) field of a scalable mode context
2052  * entry.
2053  */
2054 static inline void context_set_sm_pre(struct context_entry *context)
2055 {
2056         context->lo |= (1 << 4);
2057 }
2058
2059 /* Convert value to context PASID directory size field coding. */
2060 #define context_pdts(pds)       (((pds) & 0x7) << 9)
2061
2062 static int domain_context_mapping_one(struct dmar_domain *domain,
2063                                       struct intel_iommu *iommu,
2064                                       struct pasid_table *table,
2065                                       u8 bus, u8 devfn)
2066 {
2067         u16 did = domain->iommu_did[iommu->seq_id];
2068         int translation = CONTEXT_TT_MULTI_LEVEL;
2069         struct device_domain_info *info = NULL;
2070         struct context_entry *context;
2071         unsigned long flags;
2072         int ret;
2073
2074         WARN_ON(did == 0);
2075
2076         if (hw_pass_through && domain_type_is_si(domain))
2077                 translation = CONTEXT_TT_PASS_THROUGH;
2078
2079         pr_debug("Set context mapping for %02x:%02x.%d\n",
2080                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2081
2082         BUG_ON(!domain->pgd);
2083
2084         spin_lock_irqsave(&device_domain_lock, flags);
2085         spin_lock(&iommu->lock);
2086
2087         ret = -ENOMEM;
2088         context = iommu_context_addr(iommu, bus, devfn, 1);
2089         if (!context)
2090                 goto out_unlock;
2091
2092         ret = 0;
2093         if (context_present(context))
2094                 goto out_unlock;
2095
2096         /*
2097          * For kdump cases, old valid entries may be cached due to in-flight
2098          * DMA and the copied page tables, but nothing ever unmaps them, so
2099          * we need an explicit cache flush for the newly-mapped device.
2100          * For kdump, at this point the device is supposed to have finished
2101          * its reset during driver probe, so no in-flight DMA will exist
2102          * and we don't need to worry about stale cached entries from
2103          * here on.
2104          */
2105         if (context_copied(context)) {
2106                 u16 did_old = context_domain_id(context);
2107
2108                 if (did_old < cap_ndoms(iommu->cap)) {
2109                         iommu->flush.flush_context(iommu, did_old,
2110                                                    (((u16)bus) << 8) | devfn,
2111                                                    DMA_CCMD_MASK_NOBIT,
2112                                                    DMA_CCMD_DEVICE_INVL);
2113                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2114                                                  DMA_TLB_DSI_FLUSH);
2115                 }
2116         }
2117
2118         context_clear_entry(context);
2119
2120         if (sm_supported(iommu)) {
2121                 unsigned long pds;
2122
2123                 WARN_ON(!table);
2124
2125                 /* Setup the PASID DIR pointer: */
2126                 pds = context_get_sm_pds(table);
2127                 context->lo = (u64)virt_to_phys(table->table) |
2128                                 context_pdts(pds);
2129
2130                 /* Setup the RID_PASID field: */
2131                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2132
2133                 /*
2134                  * Set up the Device-TLB Enable bit and the Page Request
2135                  * Enable bit:
2136                  */
2137                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2138                 if (info && info->ats_supported)
2139                         context_set_sm_dte(context);
2140                 if (info && info->pri_supported)
2141                         context_set_sm_pre(context);
2142         } else {
2143                 struct dma_pte *pgd = domain->pgd;
2144                 int agaw;
2145
2146                 context_set_domain_id(context, did);
2147
2148                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2149                         /*
2150                  * Skip the top levels of the page tables for an IOMMU whose
2151                  * agaw is smaller than the domain's. Unnecessary for PT mode.
2152                          */
2153                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2154                                 ret = -ENOMEM;
2155                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2156                                 if (!dma_pte_present(pgd))
2157                                         goto out_unlock;
2158                         }
2159
2160                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2161                         if (info && info->ats_supported)
2162                                 translation = CONTEXT_TT_DEV_IOTLB;
2163                         else
2164                                 translation = CONTEXT_TT_MULTI_LEVEL;
2165
2166                         context_set_address_root(context, virt_to_phys(pgd));
2167                         context_set_address_width(context, agaw);
2168                 } else {
2169                         /*
2170                          * In pass through mode, AW must be programmed to
2171                          * indicate the largest AGAW value supported by the
2172                          * hardware; ASR is ignored in this mode.
2173                          */
2174                         context_set_address_width(context, iommu->msagaw);
2175                 }
2176
2177                 context_set_translation_type(context, translation);
2178         }
2179
2180         context_set_fault_enable(context);
2181         context_set_present(context);
2182         domain_flush_cache(domain, context, sizeof(*context));
2183
2184         /*
2185          * It's a non-present to present mapping. If hardware doesn't cache
2186          * non-present entries we only need to flush the write-buffer. If it
2187          * _does_ cache non-present entries, then it does so in the special
2188          * domain #0, which we have to flush:
2189          */
2190         if (cap_caching_mode(iommu->cap)) {
2191                 iommu->flush.flush_context(iommu, 0,
2192                                            (((u16)bus) << 8) | devfn,
2193                                            DMA_CCMD_MASK_NOBIT,
2194                                            DMA_CCMD_DEVICE_INVL);
2195                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2196         } else {
2197                 iommu_flush_write_buffer(iommu);
2198         }
2199         iommu_enable_dev_iotlb(info);
2200
2201         ret = 0;
2202
2203 out_unlock:
2204         spin_unlock(&iommu->lock);
2205         spin_unlock_irqrestore(&device_domain_lock, flags);
2206
2207         return ret;
2208 }
2209
2210 struct domain_context_mapping_data {
2211         struct dmar_domain *domain;
2212         struct intel_iommu *iommu;
2213         struct pasid_table *table;
2214 };
2215
2216 static int domain_context_mapping_cb(struct pci_dev *pdev,
2217                                      u16 alias, void *opaque)
2218 {
2219         struct domain_context_mapping_data *data = opaque;
2220
2221         return domain_context_mapping_one(data->domain, data->iommu,
2222                                           data->table, PCI_BUS_NUM(alias),
2223                                           alias & 0xff);
2224 }
2225
2226 static int
2227 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2228 {
2229         struct domain_context_mapping_data data;
2230         struct pasid_table *table;
2231         struct intel_iommu *iommu;
2232         u8 bus, devfn;
2233
2234         iommu = device_to_iommu(dev, &bus, &devfn);
2235         if (!iommu)
2236                 return -ENODEV;
2237
2238         table = intel_pasid_get_table(dev);
2239
2240         if (!dev_is_pci(dev))
2241                 return domain_context_mapping_one(domain, iommu, table,
2242                                                   bus, devfn);
2243
2244         data.domain = domain;
2245         data.iommu = iommu;
2246         data.table = table;
2247
2248         return pci_for_each_dma_alias(to_pci_dev(dev),
2249                                       &domain_context_mapping_cb, &data);
2250 }
2251
2252 static int domain_context_mapped_cb(struct pci_dev *pdev,
2253                                     u16 alias, void *opaque)
2254 {
2255         struct intel_iommu *iommu = opaque;
2256
2257         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2258 }
2259
2260 static int domain_context_mapped(struct device *dev)
2261 {
2262         struct intel_iommu *iommu;
2263         u8 bus, devfn;
2264
2265         iommu = device_to_iommu(dev, &bus, &devfn);
2266         if (!iommu)
2267                 return -ENODEV;
2268
2269         if (!dev_is_pci(dev))
2270                 return device_context_mapped(iommu, bus, devfn);
2271
2272         return !pci_for_each_dma_alias(to_pci_dev(dev),
2273                                        domain_context_mapped_cb, iommu);
2274 }
2275
2276 /* Return the number of VT-d pages needed, rounded up to the MM page size */
2277 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2278                                             size_t size)
2279 {
2280         host_addr &= ~PAGE_MASK;
2281         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2282 }
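/*
 * Example, assuming 4KiB MM and VT-d pages: host_addr = 0x1234 and
 * size = 0x2000 leave an in-page offset of 0x234; PAGE_ALIGN(0x2234)
 * is 0x3000, so aligned_nrpages() returns 3 VT-d pages.
 */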
2283
2284 /* Return largest possible superpage level for a given mapping */
2285 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2286                                           unsigned long iov_pfn,
2287                                           unsigned long phy_pfn,
2288                                           unsigned long pages)
2289 {
2290         int support, level = 1;
2291         unsigned long pfnmerge;
2292
2293         support = domain->iommu_superpage;
2294
2295         /* To use a large page, the virtual *and* physical addresses
2296            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2297            of them will mean we have to use smaller pages. So just
2298            merge them and check both at once. */
2299         pfnmerge = iov_pfn | phy_pfn;
2300
2301         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2302                 pages >>= VTD_STRIDE_SHIFT;
2303                 if (!pages)
2304                         break;
2305                 pfnmerge >>= VTD_STRIDE_SHIFT;
2306                 level++;
2307                 support--;
2308         }
2309         return level;
2310 }
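/*
 * Example, assuming VTD_STRIDE_SHIFT is 9 (512 4KiB pages per level):
 * if iov_pfn and phy_pfn are both multiples of 512, pages >= 512 and
 * domain->iommu_superpage is at least 1, the loop above reaches at
 * least level 2, so a 2MiB superpage can back the start of the mapping.
 */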
2311
2312 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2313                             struct scatterlist *sg, unsigned long phys_pfn,
2314                             unsigned long nr_pages, int prot)
2315 {
2316         struct dma_pte *first_pte = NULL, *pte = NULL;
2317         phys_addr_t uninitialized_var(pteval);
2318         unsigned long sg_res = 0;
2319         unsigned int largepage_lvl = 0;
2320         unsigned long lvl_pages = 0;
2321         u64 attr;
2322
2323         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2324
2325         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2326                 return -EINVAL;
2327
2328         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2329         if (domain_use_first_level(domain))
2330                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2331
2332         if (!sg) {
2333                 sg_res = nr_pages;
2334                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2335         }
2336
2337         while (nr_pages > 0) {
2338                 uint64_t tmp;
2339
2340                 if (!sg_res) {
2341                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2342
2343                         sg_res = aligned_nrpages(sg->offset, sg->length);
2344                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2345                         sg->dma_length = sg->length;
2346                         pteval = (sg_phys(sg) - pgoff) | attr;
2347                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2348                 }
2349
2350                 if (!pte) {
2351                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2352
2353                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2354                         if (!pte)
2355                                 return -ENOMEM;
2356                         /* It is a large page */
2357                         if (largepage_lvl > 1) {
2358                                 unsigned long nr_superpages, end_pfn;
2359
2360                                 pteval |= DMA_PTE_LARGE_PAGE;
2361                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2362
2363                                 nr_superpages = sg_res / lvl_pages;
2364                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2365
2366                                 /*
2367                                  * Ensure that old small page tables are
2368                                  * removed to make room for superpage(s).
2369                                  * We're adding new large pages, so make sure
2370                                  * we don't remove their parent tables.
2371                                  */
2372                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2373                                                        largepage_lvl + 1);
2374                         } else {
2375                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2376                         }
2377
2378                 }
2379                 /* We don't need a lock here; nobody else
2380                  * touches this iova range.
2381                  */
2382                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2383                 if (tmp) {
2384                         static int dumps = 5;
2385                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2386                                 iov_pfn, tmp, (unsigned long long)pteval);
2387                         if (dumps) {
2388                                 dumps--;
2389                                 debug_dma_dump_mappings(NULL);
2390                         }
2391                         WARN_ON(1);
2392                 }
2393
2394                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2395
2396                 BUG_ON(nr_pages < lvl_pages);
2397                 BUG_ON(sg_res < lvl_pages);
2398
2399                 nr_pages -= lvl_pages;
2400                 iov_pfn += lvl_pages;
2401                 phys_pfn += lvl_pages;
2402                 pteval += lvl_pages * VTD_PAGE_SIZE;
2403                 sg_res -= lvl_pages;
2404
2405                 /* If the next PTE would be the first in a new page, then we
2406                    need to flush the cache on the entries we've just written.
2407                    And then we'll need to recalculate 'pte', so clear it and
2408                    let it get set again in the if (!pte) block above.
2409
2410                    If we're done (!nr_pages) we need to flush the cache too.
2411
2412                    Also if we've been setting superpages, we may need to
2413                    recalculate 'pte' and switch back to smaller pages for the
2414                    end of the mapping, if the trailing size is not enough to
2415                    use another superpage (i.e. sg_res < lvl_pages). */
2416                 pte++;
2417                 if (!nr_pages || first_pte_in_page(pte) ||
2418                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2419                         domain_flush_cache(domain, first_pte,
2420                                            (void *)pte - (void *)first_pte);
2421                         pte = NULL;
2422                 }
2423
2424                 if (!sg_res && nr_pages)
2425                         sg = sg_next(sg);
2426         }
2427         return 0;
2428 }
2429
2430 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2431                           struct scatterlist *sg, unsigned long phys_pfn,
2432                           unsigned long nr_pages, int prot)
2433 {
2434         int iommu_id, ret;
2435         struct intel_iommu *iommu;
2436
2437         /* Do the real mapping first */
2438         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2439         if (ret)
2440                 return ret;
2441
2442         for_each_domain_iommu(iommu_id, domain) {
2443                 iommu = g_iommus[iommu_id];
2444                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2445         }
2446
2447         return 0;
2448 }
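/*
 * Note that domain_mapping() installs the PTEs first and only then
 * notifies each IOMMU attached to the domain, since
 * __mapping_notify_one() may have to flush caches (or the write
 * buffer) for the non-present to present transition.
 */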
2449
2450 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2451                                     struct scatterlist *sg, unsigned long nr_pages,
2452                                     int prot)
2453 {
2454         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2455 }
2456
2457 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2458                                      unsigned long phys_pfn, unsigned long nr_pages,
2459                                      int prot)
2460 {
2461         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2462 }
2463
2464 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2465 {
2466         unsigned long flags;
2467         struct context_entry *context;
2468         u16 did_old;
2469
2470         if (!iommu)
2471                 return;
2472
2473         spin_lock_irqsave(&iommu->lock, flags);
2474         context = iommu_context_addr(iommu, bus, devfn, 0);
2475         if (!context) {
2476                 spin_unlock_irqrestore(&iommu->lock, flags);
2477                 return;
2478         }
2479         did_old = context_domain_id(context);
2480         context_clear_entry(context);
2481         __iommu_flush_cache(iommu, context, sizeof(*context));
2482         spin_unlock_irqrestore(&iommu->lock, flags);
2483         iommu->flush.flush_context(iommu,
2484                                    did_old,
2485                                    (((u16)bus) << 8) | devfn,
2486                                    DMA_CCMD_MASK_NOBIT,
2487                                    DMA_CCMD_DEVICE_INVL);
2488         iommu->flush.flush_iotlb(iommu,
2489                                  did_old,
2490                                  0,
2491                                  0,
2492                                  DMA_TLB_DSI_FLUSH);
2493 }
2494
2495 static inline void unlink_domain_info(struct device_domain_info *info)
2496 {
2497         assert_spin_locked(&device_domain_lock);
2498         list_del(&info->link);
2499         list_del(&info->global);
2500         if (info->dev)
2501                 info->dev->archdata.iommu = NULL;
2502 }
2503
2504 static void domain_remove_dev_info(struct dmar_domain *domain)
2505 {
2506         struct device_domain_info *info, *tmp;
2507         unsigned long flags;
2508
2509         spin_lock_irqsave(&device_domain_lock, flags);
2510         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2511                 __dmar_remove_one_dev_info(info);
2512         spin_unlock_irqrestore(&device_domain_lock, flags);
2513 }
2514
2515 struct dmar_domain *find_domain(struct device *dev)
2516 {
2517         struct device_domain_info *info;
2518
2519         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2520                      dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2521                 return NULL;
2522
2523         /* No lock here; we assume no concurrent domain_exit() in the normal case */
2524         info = dev->archdata.iommu;
2525         if (likely(info))
2526                 return info->domain;
2527
2528         return NULL;
2529 }
2530
2531 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2532 {
2533         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2534                 struct iommu_domain *domain;
2535
2536                 dev->archdata.iommu = NULL;
2537                 domain = iommu_get_domain_for_dev(dev);
2538                 if (domain)
2539                         intel_iommu_attach_device(domain, dev);
2540         }
2541
2542         return find_domain(dev);
2543 }
2544
2545 static inline struct device_domain_info *
2546 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2547 {
2548         struct device_domain_info *info;
2549
2550         list_for_each_entry(info, &device_domain_list, global)
2551                 if (info->iommu->segment == segment && info->bus == bus &&
2552                     info->devfn == devfn)
2553                         return info;
2554
2555         return NULL;
2556 }
2557
2558 static int domain_setup_first_level(struct intel_iommu *iommu,
2559                                     struct dmar_domain *domain,
2560                                     struct device *dev,
2561                                     int pasid)
2562 {
2563         int flags = PASID_FLAG_SUPERVISOR_MODE;
2564         struct dma_pte *pgd = domain->pgd;
2565         int agaw, level;
2566
2567         /*
2568           * Skip the top levels of the page tables for an IOMMU whose
2569           * agaw is smaller than the domain's. Unnecessary for PT mode.
2570          */
2571         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2572                 pgd = phys_to_virt(dma_pte_addr(pgd));
2573                 if (!dma_pte_present(pgd))
2574                         return -ENOMEM;
2575         }
2576
2577         level = agaw_to_level(agaw);
2578         if (level != 4 && level != 5)
2579                 return -EINVAL;
2580
2581         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2582
2583         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2584                                              domain->iommu_did[iommu->seq_id],
2585                                              flags);
2586 }
2587
2588 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2589                                                     int bus, int devfn,
2590                                                     struct device *dev,
2591                                                     struct dmar_domain *domain)
2592 {
2593         struct dmar_domain *found = NULL;
2594         struct device_domain_info *info;
2595         unsigned long flags;
2596         int ret;
2597
2598         info = alloc_devinfo_mem();
2599         if (!info)
2600                 return NULL;
2601
2602         info->bus = bus;
2603         info->devfn = devfn;
2604         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2605         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2606         info->ats_qdep = 0;
2607         info->dev = dev;
2608         info->domain = domain;
2609         info->iommu = iommu;
2610         info->pasid_table = NULL;
2611         info->auxd_enabled = 0;
2612         INIT_LIST_HEAD(&info->auxiliary_domains);
2613
2614         if (dev && dev_is_pci(dev)) {
2615                 struct pci_dev *pdev = to_pci_dev(info->dev);
2616
2617                 if (!pdev->untrusted &&
2618                     !pci_ats_disabled() &&
2619                     ecap_dev_iotlb_support(iommu->ecap) &&
2620                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2621                     dmar_find_matched_atsr_unit(pdev))
2622                         info->ats_supported = 1;
2623
2624                 if (sm_supported(iommu)) {
2625                         if (pasid_supported(iommu)) {
2626                                 int features = pci_pasid_features(pdev);
2627                                 if (features >= 0)
2628                                         info->pasid_supported = features | 1;
2629                         }
2630
2631                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2632                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2633                                 info->pri_supported = 1;
2634                 }
2635         }
2636
2637         spin_lock_irqsave(&device_domain_lock, flags);
2638         if (dev)
2639                 found = find_domain(dev);
2640
2641         if (!found) {
2642                 struct device_domain_info *info2;
2643                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2644                 if (info2) {
2645                         found      = info2->domain;
2646                         info2->dev = dev;
2647                 }
2648         }
2649
2650         if (found) {
2651                 spin_unlock_irqrestore(&device_domain_lock, flags);
2652                 free_devinfo_mem(info);
2653                 /* Caller must free the original domain */
2654                 return found;
2655         }
2656
2657         spin_lock(&iommu->lock);
2658         ret = domain_attach_iommu(domain, iommu);
2659         spin_unlock(&iommu->lock);
2660
2661         if (ret) {
2662                 spin_unlock_irqrestore(&device_domain_lock, flags);
2663                 free_devinfo_mem(info);
2664                 return NULL;
2665         }
2666
2667         list_add(&info->link, &domain->devices);
2668         list_add(&info->global, &device_domain_list);
2669         if (dev)
2670                 dev->archdata.iommu = info;
2671         spin_unlock_irqrestore(&device_domain_lock, flags);
2672
2673         /* PASID table is mandatory for a PCI device in scalable mode. */
2674         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2675                 ret = intel_pasid_alloc_table(dev);
2676                 if (ret) {
2677                         dev_err(dev, "PASID table allocation failed\n");
2678                         dmar_remove_one_dev_info(dev);
2679                         return NULL;
2680                 }
2681
2682                 /* Setup the PASID entry for requests without PASID: */
2683                 spin_lock(&iommu->lock);
2684                 if (hw_pass_through && domain_type_is_si(domain))
2685                         ret = intel_pasid_setup_pass_through(iommu, domain,
2686                                         dev, PASID_RID2PASID);
2687                 else if (domain_use_first_level(domain))
2688                         ret = domain_setup_first_level(iommu, domain, dev,
2689                                         PASID_RID2PASID);
2690                 else
2691                         ret = intel_pasid_setup_second_level(iommu, domain,
2692                                         dev, PASID_RID2PASID);
2693                 spin_unlock(&iommu->lock);
2694                 if (ret) {
2695                         dev_err(dev, "Setup RID2PASID failed\n");
2696                         dmar_remove_one_dev_info(dev);
2697                         return NULL;
2698                 }
2699         }
2700
2701         if (dev && domain_context_mapping(domain, dev)) {
2702                 dev_err(dev, "Domain context map failed\n");
2703                 dmar_remove_one_dev_info(dev);
2704                 return NULL;
2705         }
2706
2707         return domain;
2708 }
2709
2710 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2711 {
2712         *(u16 *)opaque = alias;
2713         return 0;
2714 }
2715
2716 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2717 {
2718         struct device_domain_info *info;
2719         struct dmar_domain *domain = NULL;
2720         struct intel_iommu *iommu;
2721         u16 dma_alias;
2722         unsigned long flags;
2723         u8 bus, devfn;
2724
2725         iommu = device_to_iommu(dev, &bus, &devfn);
2726         if (!iommu)
2727                 return NULL;
2728
2729         if (dev_is_pci(dev)) {
2730                 struct pci_dev *pdev = to_pci_dev(dev);
2731
2732                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2733
2734                 spin_lock_irqsave(&device_domain_lock, flags);
2735                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2736                                                       PCI_BUS_NUM(dma_alias),
2737                                                       dma_alias & 0xff);
2738                 if (info) {
2739                         iommu = info->iommu;
2740                         domain = info->domain;
2741                 }
2742                 spin_unlock_irqrestore(&device_domain_lock, flags);
2743
2744                 /* DMA alias already has a domain, use it */
2745                 if (info)
2746                         goto out;
2747         }
2748
2749         /* Allocate and initialize new domain for the device */
2750         domain = alloc_domain(0);
2751         if (!domain)
2752                 return NULL;
2753         if (domain_init(domain, iommu, gaw)) {
2754                 domain_exit(domain);
2755                 return NULL;
2756         }
2757
2758 out:
2759         return domain;
2760 }
2761
2762 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2763                                               struct dmar_domain *domain)
2764 {
2765         struct intel_iommu *iommu;
2766         struct dmar_domain *tmp;
2767         u16 req_id, dma_alias;
2768         u8 bus, devfn;
2769
2770         iommu = device_to_iommu(dev, &bus, &devfn);
2771         if (!iommu)
2772                 return NULL;
2773
2774         req_id = ((u16)bus << 8) | devfn;
2775
2776         if (dev_is_pci(dev)) {
2777                 struct pci_dev *pdev = to_pci_dev(dev);
2778
2779                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2780
2781                 /* register PCI DMA alias device */
2782                 if (req_id != dma_alias) {
2783                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2784                                         dma_alias & 0xff, NULL, domain);
2785
2786                         if (!tmp || tmp != domain)
2787                                 return tmp;
2788                 }
2789         }
2790
2791         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2792         if (!tmp || tmp != domain)
2793                 return tmp;
2794
2795         return domain;
2796 }
2797
2798 static int iommu_domain_identity_map(struct dmar_domain *domain,
2799                                      unsigned long long start,
2800                                      unsigned long long end)
2801 {
2802         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2803         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2804
2805         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2806                           dma_to_mm_pfn(last_vpfn))) {
2807                 pr_err("Reserving iova failed\n");
2808                 return -ENOMEM;
2809         }
2810
2811         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2812         /*
2813          * RMRR range might have overlap with physical memory range,
2814          * clear it first
2815          */
2816         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2817
2818         return __domain_mapping(domain, first_vpfn, NULL,
2819                                 first_vpfn, last_vpfn - first_vpfn + 1,
2820                                 DMA_PTE_READ|DMA_PTE_WRITE);
2821 }
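/*
 * Identity mapping here means IOVA == physical address: the range is
 * reserved in the domain's iovad so the allocator never hands it out,
 * any overlapping PTEs are cleared, and __domain_mapping() is then
 * called with phys_pfn equal to first_vpfn so every page maps to
 * itself with read/write permission.
 */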
2822
2823 static int domain_prepare_identity_map(struct device *dev,
2824                                        struct dmar_domain *domain,
2825                                        unsigned long long start,
2826                                        unsigned long long end)
2827 {
2828         /* For _hardware_ passthrough, don't bother. But for software
2829            passthrough, we do it anyway -- it may indicate a memory
2830             range which is reserved in E820 and therefore didn't get set
2831             up in si_domain to start with */
2832         if (domain == si_domain && hw_pass_through) {
2833                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2834                          start, end);
2835                 return 0;
2836         }
2837
2838         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2839
2840         if (end < start) {
2841                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2842                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2843                         dmi_get_system_info(DMI_BIOS_VENDOR),
2844                         dmi_get_system_info(DMI_BIOS_VERSION),
2845                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2846                 return -EIO;
2847         }
2848
2849         if (end >> agaw_to_width(domain->agaw)) {
2850                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2851                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2852                      agaw_to_width(domain->agaw),
2853                      dmi_get_system_info(DMI_BIOS_VENDOR),
2854                      dmi_get_system_info(DMI_BIOS_VERSION),
2855                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2856                 return -EIO;
2857         }
2858
2859         return iommu_domain_identity_map(domain, start, end);
2860 }
2861
2862 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2863
2864 static int __init si_domain_init(int hw)
2865 {
2866         struct dmar_rmrr_unit *rmrr;
2867         struct device *dev;
2868         int i, nid, ret;
2869
2870         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2871         if (!si_domain)
2872                 return -EFAULT;
2873
2874         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2875                 domain_exit(si_domain);
2876                 return -EFAULT;
2877         }
2878
2879         if (hw)
2880                 return 0;
2881
2882         for_each_online_node(nid) {
2883                 unsigned long start_pfn, end_pfn;
2884                 int i;
2885
2886                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2887                         ret = iommu_domain_identity_map(si_domain,
2888                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2889                         if (ret)
2890                                 return ret;
2891                 }
2892         }
2893
2894         /*
2895          * Identity map the RMRRs so that devices with RMRRs can also use
2896          * the si_domain.
2897          */
2898         for_each_rmrr_units(rmrr) {
2899                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2900                                           i, dev) {
2901                         unsigned long long start = rmrr->base_address;
2902                         unsigned long long end = rmrr->end_address;
2903
2904                         if (WARN_ON(end < start ||
2905                                     end >> agaw_to_width(si_domain->agaw)))
2906                                 continue;
2907
2908                         ret = iommu_domain_identity_map(si_domain, start, end);
2909                         if (ret)
2910                                 return ret;
2911                 }
2912         }
2913
2914         return 0;
2915 }
2916
2917 static int identity_mapping(struct device *dev)
2918 {
2919         struct device_domain_info *info;
2920
2921         info = dev->archdata.iommu;
2922         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2923                 return (info->domain == si_domain);
2924
2925         return 0;
2926 }
2927
2928 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2929 {
2930         struct dmar_domain *ndomain;
2931         struct intel_iommu *iommu;
2932         u8 bus, devfn;
2933
2934         iommu = device_to_iommu(dev, &bus, &devfn);
2935         if (!iommu)
2936                 return -ENODEV;
2937
2938         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2939         if (ndomain != domain)
2940                 return -EBUSY;
2941
2942         return 0;
2943 }
2944
2945 static bool device_has_rmrr(struct device *dev)
2946 {
2947         struct dmar_rmrr_unit *rmrr;
2948         struct device *tmp;
2949         int i;
2950
2951         rcu_read_lock();
2952         for_each_rmrr_units(rmrr) {
2953                 /*
2954                  * Return TRUE if this RMRR contains the device that
2955                  * is passed in.
2956                  */
2957                 for_each_active_dev_scope(rmrr->devices,
2958                                           rmrr->devices_cnt, i, tmp)
2959                         if (tmp == dev ||
2960                             is_downstream_to_pci_bridge(dev, tmp)) {
2961                                 rcu_read_unlock();
2962                                 return true;
2963                         }
2964         }
2965         rcu_read_unlock();
2966         return false;
2967 }
2968
2969 /**
2970  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2971  * is relaxable (i.e. it is allowed to be left unenforced under some conditions)
2972  * @dev: device handle
2973  *
2974  * We assume that PCI USB devices with RMRRs have them largely
2975  * for historical reasons and that the RMRR space is not actively used post
2976  * boot.  This exclusion may change if vendors begin to abuse it.
2977  *
2978  * The same exception is made for graphics devices, with the requirement that
2979  * any use of the RMRR regions will be torn down before assigning the device
2980  * to a guest.
2981  *
2982  * Return: true if the RMRR is relaxable, false otherwise
2983  */
2984 static bool device_rmrr_is_relaxable(struct device *dev)
2985 {
2986         struct pci_dev *pdev;
2987
2988         if (!dev_is_pci(dev))
2989                 return false;
2990
2991         pdev = to_pci_dev(dev);
2992         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2993                 return true;
2994         else
2995                 return false;
2996 }
2997
2998 /*
2999  * There are a couple of cases where we need to restrict the functionality of
3000  * devices associated with RMRRs.  The first is when evaluating a device for
3001  * identity mapping because problems exist when devices are moved in and out
3002  * of domains and their respective RMRR information is lost.  This means that
3003  * a device with associated RMRRs will never be in a "passthrough" domain.
3004  * The second is use of the device through the IOMMU API.  This interface
3005  * expects to have full control of the IOVA space for the device.  We cannot
3006  * satisfy both the requirement that RMRR access is maintained and have an
3007  * unencumbered IOVA space.  We also have no ability to quiesce the device's
3008  * use of the RMRR space or even inform the IOMMU API user of the restriction.
3009  * We therefore prevent devices associated with an RMRR from participating in
3010  * the IOMMU API, which eliminates them from device assignment.
3011  *
3012  * In both cases, devices which have relaxable RMRRs are not concerned by this
3013  * restriction. See device_rmrr_is_relaxable comment.
3014  */
3015 static bool device_is_rmrr_locked(struct device *dev)
3016 {
3017         if (!device_has_rmrr(dev))
3018                 return false;
3019
3020         if (device_rmrr_is_relaxable(dev))
3021                 return false;
3022
3023         return true;
3024 }
3025
3026 /*
3027  * Return the required default domain type for a specific device.
3028  *
3029  * @dev: the device in question
3031  *
3032  * Returns:
3033  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3034  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3035  *  - 0: both identity and dynamic domains work for this device
3036  */
3037 static int device_def_domain_type(struct device *dev)
3038 {
3039         if (dev_is_pci(dev)) {
3040                 struct pci_dev *pdev = to_pci_dev(dev);
3041
3042                 /*
3043                  * Prevent any device marked as untrusted from getting
3044                  * placed into the static identity mapping domain.
3045                  */
3046                 if (pdev->untrusted)
3047                         return IOMMU_DOMAIN_DMA;
3048
3049                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3050                         return IOMMU_DOMAIN_IDENTITY;
3051
3052                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3053                         return IOMMU_DOMAIN_IDENTITY;
3054
3055                 /*
3056                  * We want to start off with all devices in the 1:1 domain, and
3057                  * take them out later if we find they can't access all of memory.
3058                  *
3059                  * However, we can't do this for PCI devices behind bridges,
3060                  * because all PCI devices behind the same bridge will end up
3061                  * with the same source-id on their transactions.
3062                  *
3063                  * Practically speaking, we can't change things around for these
3064                  * devices at run-time, because we can't be sure there'll be no
3065                  * DMA transactions in flight for any of their siblings.
3066                  *
3067                  * So PCI devices (unless they're on the root bus) as well as
3068                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3069                  * the 1:1 domain, just in _case_ one of their siblings turns out
3070                  * not to be able to map all of memory.
3071                  */
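                     /*
                      * For example, a conventional-PCI device behind a
                      * PCIe-to-PCI bridge is seen by the IOMMU with the
                      * bridge's requester ID, so it cannot be told apart
                      * from its siblings and must stay in the DMA domain.
                      */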
3072                 if (!pci_is_pcie(pdev)) {
3073                         if (!pci_is_root_bus(pdev->bus))
3074                                 return IOMMU_DOMAIN_DMA;
3075                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3076                                 return IOMMU_DOMAIN_DMA;
3077                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3078                         return IOMMU_DOMAIN_DMA;
3079         }
3080
3081         return 0;
3082 }
3083
3084 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3085 {
3086         /*
3087          * Start from a sane iommu hardware state.
3088          * If queued invalidation was already initialized by us
3089          * (for example, while enabling interrupt remapping), then
3090          * things are already rolling from a sane state.
3091          */
3092         if (!iommu->qi) {
3093                 /*
3094                  * Clear any previous faults.
3095                  */
3096                 dmar_fault(-1, iommu);
3097                 /*
3098                  * Disable queued invalidation if supported and already enabled
3099                  * before OS handover.
3100                  */
3101                 dmar_disable_qi(iommu);
3102         }
3103
3104         if (dmar_enable_qi(iommu)) {
3105                 /*
3106                  * Queued Invalidate not enabled, use Register Based Invalidate
3107                  */
3108                 iommu->flush.flush_context = __iommu_flush_context;
3109                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3110                 pr_info("%s: Using Register based invalidation\n",
3111                         iommu->name);
3112         } else {
3113                 iommu->flush.flush_context = qi_flush_context;
3114                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3115                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3116         }
3117 }
3118
3119 static int copy_context_table(struct intel_iommu *iommu,
3120                               struct root_entry *old_re,
3121                               struct context_entry **tbl,
3122                               int bus, bool ext)
3123 {
3124         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3125         struct context_entry *new_ce = NULL, ce;
3126         struct context_entry *old_ce = NULL;
3127         struct root_entry re;
3128         phys_addr_t old_ce_phys;
3129
3130         tbl_idx = ext ? bus * 2 : bus;
3131         memcpy(&re, old_re, sizeof(re));
3132
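             /*
              * With the extended (ECS) format each context entry is twice
              * the size of a legacy one, so a bus needs two 4K context
              * tables: tbl[2 * bus] for devfn 0x00-0x7f (from the lower
              * context-table pointer) and tbl[2 * bus + 1] for devfn
              * 0x80-0xff (from the upper one). Hence the doubled tbl_idx
              * and idx below.
              */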
3133         for (devfn = 0; devfn < 256; devfn++) {
3134                 /* First calculate the correct index */
3135                 idx = (ext ? devfn * 2 : devfn) % 256;
3136
3137                 if (idx == 0) {
3138                         /* First save what we may have and clean up */
3139                         if (new_ce) {
3140                                 tbl[tbl_idx] = new_ce;
3141                                 __iommu_flush_cache(iommu, new_ce,
3142                                                     VTD_PAGE_SIZE);
3143                                 pos = 1;
3144                         }
3145
3146                         if (old_ce)
3147                                 memunmap(old_ce);
3148
3149                         ret = 0;
3150                         if (devfn < 0x80)
3151                                 old_ce_phys = root_entry_lctp(&re);
3152                         else
3153                                 old_ce_phys = root_entry_uctp(&re);
3154
3155                         if (!old_ce_phys) {
3156                                 if (ext && devfn == 0) {
3157                                         /* No LCTP, try UCTP */
3158                                         devfn = 0x7f;
3159                                         continue;
3160                                 } else {
3161                                         goto out;
3162                                 }
3163                         }
3164
3165                         ret = -ENOMEM;
3166                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3167                                         MEMREMAP_WB);
3168                         if (!old_ce)
3169                                 goto out;
3170
3171                         new_ce = alloc_pgtable_page(iommu->node);
3172                         if (!new_ce)
3173                                 goto out_unmap;
3174
3175                         ret = 0;
3176                 }
3177
3178                 /* Now copy the context entry */
3179                 memcpy(&ce, old_ce + idx, sizeof(ce));
3180
3181                 if (!__context_present(&ce))
3182                         continue;
3183
3184                 did = context_domain_id(&ce);
3185                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3186                         set_bit(did, iommu->domain_ids);
3187
3188                 /*
3189                  * We need a marker for copied context entries. This
3190                  * marker needs to work for the old format as well as
3191                  * for extended context entries.
3192                  *
3193                  * Bit 67 of the context entry is used. In the old
3194                  * format this bit is available to software, in the
3195                  * extended format it is the PGE bit, but PGE is ignored
3196                  * by HW if PASIDs are disabled (and thus still
3197                  * available).
3198                  *
3199                  * So disable PASIDs first and then mark the entry
3200                  * copied. This means that we don't copy PASID
3201                  * translations from the old kernel, but this is fine as
3202                  * faults there are not fatal.
3203                  */
3204                 context_clear_pasid_enable(&ce);
3205                 context_set_copied(&ce);
3206
3207                 new_ce[idx] = ce;
3208         }
3209
3210         tbl[tbl_idx + pos] = new_ce;
3211
3212         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3213
3214 out_unmap:
3215         memunmap(old_ce);
3216
3217 out:
3218         return ret;
3219 }
3220
3221 static int copy_translation_tables(struct intel_iommu *iommu)
3222 {
3223         struct context_entry **ctxt_tbls;
3224         struct root_entry *old_rt;
3225         phys_addr_t old_rt_phys;
3226         int ctxt_table_entries;
3227         unsigned long flags;
3228         u64 rtaddr_reg;
3229         int bus, ret;
3230         bool new_ext, ext;
3231
3232         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3233         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3234         new_ext    = !!ecap_ecs(iommu->ecap);
3235
3236         /*
3237          * The RTT bit can only be changed when translation is disabled,
3238          * but disabling translation means to open a window for data
3239          * corruption. So bail out and don't copy anything if we would
3240          * have to change the bit.
3241          */
3242         if (new_ext != ext)
3243                 return -EINVAL;
3244
3245         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3246         if (!old_rt_phys)
3247                 return -EINVAL;
3248
3249         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3250         if (!old_rt)
3251                 return -ENOMEM;
3252
3253         /* This is too big for the stack - allocate it from slab */
3254         ctxt_table_entries = ext ? 512 : 256;
3255         ret = -ENOMEM;
3256         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3257         if (!ctxt_tbls)
3258                 goto out_unmap;
3259
3260         for (bus = 0; bus < 256; bus++) {
3261                 ret = copy_context_table(iommu, &old_rt[bus],
3262                                          ctxt_tbls, bus, ext);
3263                 if (ret) {
3264                         pr_err("%s: Failed to copy context table for bus %d\n",
3265                                 iommu->name, bus);
3266                         continue;
3267                 }
3268         }
3269
3270         spin_lock_irqsave(&iommu->lock, flags);
3271
3272         /* Context tables are copied, now write them to the root_entry table */
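             /*
              * Bit 0 of the root entry's (lower/upper) context-table
              * pointer is the present bit, hence the "| 1" below.
              */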
3273         for (bus = 0; bus < 256; bus++) {
3274                 int idx = ext ? bus * 2 : bus;
3275                 u64 val;
3276
3277                 if (ctxt_tbls[idx]) {
3278                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3279                         iommu->root_entry[bus].lo = val;
3280                 }
3281
3282                 if (!ext || !ctxt_tbls[idx + 1])
3283                         continue;
3284
3285                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3286                 iommu->root_entry[bus].hi = val;
3287         }
3288
3289         spin_unlock_irqrestore(&iommu->lock, flags);
3290
3291         kfree(ctxt_tbls);
3292
3293         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3294
3295         ret = 0;
3296
3297 out_unmap:
3298         memunmap(old_rt);
3299
3300         return ret;
3301 }
3302
3303 static int __init init_dmars(void)
3304 {
3305         struct dmar_drhd_unit *drhd;
3306         struct intel_iommu *iommu;
3307         int ret;
3308
3309         /*
3310          * for each drhd
3311          *    allocate root
3312          *    initialize and program root entry to not present
3313          * endfor
3314          */
3315         for_each_drhd_unit(drhd) {
3316                 /*
3317                  * Lock not needed: this is only incremented in the
3318                  * single-threaded kernel __init code path; all other
3319                  * accesses are read-only.
3320                  */
3321                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3322                         g_num_of_iommus++;
3323                         continue;
3324                 }
3325                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3326         }
3327
3328         /* Preallocate enough resources for IOMMU hot-addition */
3329         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3330                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3331
3332         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3333                         GFP_KERNEL);
3334         if (!g_iommus) {
3335                 pr_err("Allocating global iommu array failed\n");
3336                 ret = -ENOMEM;
3337                 goto error;
3338         }
3339
3340         for_each_iommu(iommu, drhd) {
3341                 if (drhd->ignored) {
3342                         iommu_disable_translation(iommu);
3343                         continue;
3344                 }
3345
3346                 /*
3347                  * Find the max pasid size of all IOMMUs in the system.
3348                  * We need to ensure the system pasid table is no bigger
3349                  * than the smallest supported size.
3350                  */
3351                 if (pasid_supported(iommu)) {
3352                         u32 temp = 2 << ecap_pss(iommu->ecap);
3353
3354                         intel_pasid_max_id = min_t(u32, temp,
3355                                                    intel_pasid_max_id);
3356                 }
3357
3358                 g_iommus[iommu->seq_id] = iommu;
3359
3360                 intel_iommu_init_qi(iommu);
3361
3362                 ret = iommu_init_domains(iommu);
3363                 if (ret)
3364                         goto free_iommu;
3365
3366                 init_translation_status(iommu);
3367
3368                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3369                         iommu_disable_translation(iommu);
3370                         clear_translation_pre_enabled(iommu);
3371                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3372                                 iommu->name);
3373                 }
3374
3375                 /*
3376                  * TBD:
3377                  * we could share the same root & context tables
3378                  * among all IOMMUs. Need to split it later.
3379                  */
3380                 ret = iommu_alloc_root_entry(iommu);
3381                 if (ret)
3382                         goto free_iommu;
3383
3384                 if (translation_pre_enabled(iommu)) {
3385                         pr_info("Translation already enabled - trying to copy translation structures\n");
3386
3387                         ret = copy_translation_tables(iommu);
3388                         if (ret) {
3389                                 /*
3390                                  * We found the IOMMU with translation
3391                                  * enabled - but failed to copy over the
3392                                  * old root-entry table. Try to proceed
3393                                  * by disabling translation now and
3394                                  * allocating a clean root-entry table.
3395                                  * This might cause DMAR faults, but
3396                                  * probably the dump will still succeed.
3397                                  */
3398                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3399                                        iommu->name);
3400                                 iommu_disable_translation(iommu);
3401                                 clear_translation_pre_enabled(iommu);
3402                         } else {
3403                                 pr_info("Copied translation tables from previous kernel for %s\n",
3404                                         iommu->name);
3405                         }
3406                 }
3407
3408                 if (!ecap_pass_through(iommu->ecap))
3409                         hw_pass_through = 0;
3410                 intel_svm_check(iommu);
3411         }
3412
3413         /*
3414          * Now that qi is enabled on all iommus, set the root entry and flush
3415          * caches. This is required on some Intel X58 chipsets, otherwise the
3416          * flush_context function will loop forever and the boot hangs.
3417          */
3418         for_each_active_iommu(iommu, drhd) {
3419                 iommu_flush_write_buffer(iommu);
3420                 iommu_set_root_entry(iommu);
3421                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3422                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3423         }
3424
3425 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3426         dmar_map_gfx = 0;
3427 #endif
3428
3429         if (!dmar_map_gfx)
3430                 iommu_identity_mapping |= IDENTMAP_GFX;
3431
3432         check_tylersburg_isoch();
3433
3434         ret = si_domain_init(hw_pass_through);
3435         if (ret)
3436                 goto free_iommu;
3437
3438         /*
3439          * for each drhd
3440          *   enable fault log
3441          *   global invalidate context cache
3442          *   global invalidate iotlb
3443          *   enable translation
3444          */
3445         for_each_iommu(iommu, drhd) {
3446                 if (drhd->ignored) {
3447                         /*
3448                          * we always have to disable PMRs or DMA may fail on
3449                          * this device
3450                          */
3451                         if (force_on)
3452                                 iommu_disable_protect_mem_regions(iommu);
3453                         continue;
3454                 }
3455
3456                 iommu_flush_write_buffer(iommu);
3457
3458 #ifdef CONFIG_INTEL_IOMMU_SVM
3459                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3460                         /*
3461                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3462                          * could cause a lock race condition, so drop the lock
3463                          * around intel_svm_enable_prq().
3463                          */
3464                         up_write(&dmar_global_lock);
3465                         ret = intel_svm_enable_prq(iommu);
3466                         down_write(&dmar_global_lock);
3467                         if (ret)
3468                                 goto free_iommu;
3469                 }
3470 #endif
3471                 ret = dmar_set_interrupt(iommu);
3472                 if (ret)
3473                         goto free_iommu;
3474         }
3475
3476         return 0;
3477
3478 free_iommu:
3479         for_each_active_iommu(iommu, drhd) {
3480                 disable_dmar_iommu(iommu);
3481                 free_dmar_iommu(iommu);
3482         }
3483
3484         kfree(g_iommus);
3485
3486 error:
3487         return ret;
3488 }
3489
3490 /* This takes a number of _MM_ pages, not VTD pages */
3491 static unsigned long intel_alloc_iova(struct device *dev,
3492                                      struct dmar_domain *domain,
3493                                      unsigned long nrpages, uint64_t dma_mask)
3494 {
3495         unsigned long iova_pfn;
3496
3497         /*
3498          * Restrict dma_mask to the width that the iommu can handle.
3499          * First-level translation restricts the input-address to a
3500          * canonical address (i.e., address bits 63:N have the same
3501          * value as address bit [N-1], where N is 48 bits with 4-level
3502          * paging and 57 bits with 5-level paging). Hence, skip bit
3503          * [N-1].
3504          */
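             /*
              * E.g. with 4-level paging the input address is 48 bits wide,
              * so for a domain with gaw == 48 the mask is clamped to
              * DOMAIN_MAX_ADDR(47): all IOVAs stay below 2^47 and therefore
              * remain canonical.
              */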
3505         if (domain_use_first_level(domain))
3506                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3507                                  dma_mask);
3508         else
3509                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3510                                  dma_mask);
3511
3512         /* Ensure we reserve the whole size-aligned region */
3513         nrpages = __roundup_pow_of_two(nrpages);
3514
3515         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3516                 /*
3517                  * First try to allocate an io virtual address in
3518                  * DMA_BIT_MASK(32) and if that fails then try allocating
3519                  * from higher range
3520                  */
3521                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3522                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3523                 if (iova_pfn)
3524                         return iova_pfn;
3525         }
3526         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3527                                    IOVA_PFN(dma_mask), true);
3528         if (unlikely(!iova_pfn)) {
3529                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3530                              nrpages);
3531                 return 0;
3532         }
3533
3534         return iova_pfn;
3535 }
3536
3537 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3538 {
3539         struct dmar_domain *domain, *tmp;
3540         struct dmar_rmrr_unit *rmrr;
3541         struct device *i_dev;
3542         int i, ret;
3543
3544         /* Device shouldn't be attached to any domain. */
3545         domain = find_domain(dev);
3546         if (domain)
3547                 return NULL;
3548
3549         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3550         if (!domain)
3551                 goto out;
3552
3553         /* We have a new domain - setup possible RMRRs for the device */
3554         rcu_read_lock();
3555         for_each_rmrr_units(rmrr) {
3556                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3557                                           i, i_dev) {
3558                         if (i_dev != dev)
3559                                 continue;
3560
3561                         ret = domain_prepare_identity_map(dev, domain,
3562                                                           rmrr->base_address,
3563                                                           rmrr->end_address);
3564                         if (ret)
3565                                 dev_err(dev, "Mapping reserved region failed\n");
3566                 }
3567         }
3568         rcu_read_unlock();
3569
3570         tmp = set_domain_for_dev(dev, domain);
3571         if (!tmp || domain != tmp) {
3572                 domain_exit(domain);
3573                 domain = tmp;
3574         }
3575
3576 out:
3577         if (!domain)
3578                 dev_err(dev, "Allocating domain failed\n");
3579         else
3580                 domain->domain.type = IOMMU_DOMAIN_DMA;
3581
3582         return domain;
3583 }
3584
3585 /* Check if the dev needs to go through the non-identity map and unmap process. */
3586 static bool iommu_need_mapping(struct device *dev)
3587 {
3588         int ret;
3589
3590         if (iommu_dummy(dev))
3591                 return false;
3592
3593         ret = identity_mapping(dev);
3594         if (ret) {
3595                 u64 dma_mask = *dev->dma_mask;
3596
3597                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3598                         dma_mask = dev->coherent_dma_mask;
3599
3600                 if (dma_mask >= dma_direct_get_required_mask(dev))
3601                         return false;
3602
3603                 /*
3604                  * A 32-bit DMA-limited device is removed from si_domain
3605                  * and falls back to non-identity mapping.
3606                  */
3607                 dmar_remove_one_dev_info(dev);
3608                 ret = iommu_request_dma_domain_for_dev(dev);
3609                 if (ret) {
3610                         struct iommu_domain *domain;
3611                         struct dmar_domain *dmar_domain;
3612
3613                         domain = iommu_get_domain_for_dev(dev);
3614                         if (domain) {
3615                                 dmar_domain = to_dmar_domain(domain);
3616                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3617                         }
3618                         dmar_remove_one_dev_info(dev);
3619                         get_private_domain_for_dev(dev);
3620                 }
3621
3622                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3623         }
3624
3625         return true;
3626 }
3627
3628 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3629                                      size_t size, int dir, u64 dma_mask)
3630 {
3631         struct dmar_domain *domain;
3632         phys_addr_t start_paddr;
3633         unsigned long iova_pfn;
3634         int prot = 0;
3635         int ret;
3636         struct intel_iommu *iommu;
3637         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3638
3639         BUG_ON(dir == DMA_NONE);
3640
3641         domain = deferred_attach_domain(dev);
3642         if (!domain)
3643                 return DMA_MAPPING_ERROR;
3644
3645         iommu = domain_get_iommu(domain);
3646         size = aligned_nrpages(paddr, size);
3647
3648         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3649         if (!iova_pfn)
3650                 goto error;
3651
3652         /*
3653          * Check if DMAR supports zero-length reads on write-only
3654          * mappings.
3655          */
3656         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3657                         !cap_zlr(iommu->cap))
3658                 prot |= DMA_PTE_READ;
3659         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3660                 prot |= DMA_PTE_WRITE;
3661         /*
3662          * paddr .. (paddr + size) might cover only part of a page, but we
3663          * should map the whole page.  Note: if two parts of one page are
3664          * separately mapped, we might have two guest_addrs mapping to the
3665          * same host paddr, but this is not a big problem.
3666          */
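             /*
              * E.g. mapping 0x200 bytes at paddr 0x12340 still consumes one
              * full VT-d page of IOVA space; the sub-page offset of paddr is
              * added back to start_paddr below so the caller gets the exact
              * byte address.
              */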
3667         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3668                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3669         if (ret)
3670                 goto error;
3671
3672         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3673         start_paddr += paddr & ~PAGE_MASK;
3674
3675         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3676
3677         return start_paddr;
3678
3679 error:
3680         if (iova_pfn)
3681                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3682         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3683                 size, (unsigned long long)paddr, dir);
3684         return DMA_MAPPING_ERROR;
3685 }
3686
3687 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3688                                  unsigned long offset, size_t size,
3689                                  enum dma_data_direction dir,
3690                                  unsigned long attrs)
3691 {
3692         if (iommu_need_mapping(dev))
3693                 return __intel_map_single(dev, page_to_phys(page) + offset,
3694                                 size, dir, *dev->dma_mask);
3695         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3696 }
3697
3698 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3699                                      size_t size, enum dma_data_direction dir,
3700                                      unsigned long attrs)
3701 {
3702         if (iommu_need_mapping(dev))
3703                 return __intel_map_single(dev, phys_addr, size, dir,
3704                                 *dev->dma_mask);
3705         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3706 }
3707
3708 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3709 {
3710         struct dmar_domain *domain;
3711         unsigned long start_pfn, last_pfn;
3712         unsigned long nrpages;
3713         unsigned long iova_pfn;
3714         struct intel_iommu *iommu;
3715         struct page *freelist;
3716         struct pci_dev *pdev = NULL;
3717
3718         domain = find_domain(dev);
3719         BUG_ON(!domain);
3720
3721         iommu = domain_get_iommu(domain);
3722
3723         iova_pfn = IOVA_PFN(dev_addr);
3724
3725         nrpages = aligned_nrpages(dev_addr, size);
3726         start_pfn = mm_to_dma_pfn(iova_pfn);
3727         last_pfn = start_pfn + nrpages - 1;
3728
3729         if (dev_is_pci(dev))
3730                 pdev = to_pci_dev(dev);
3731
3732         freelist = domain_unmap(domain, start_pfn, last_pfn);
3733         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3734                         !has_iova_flush_queue(&domain->iovad)) {
3735                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3736                                       nrpages, !freelist, 0);
3737                 /* free iova */
3738                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3739                 dma_free_pagelist(freelist);
3740         } else {
3741                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3742                            (unsigned long)freelist);
3743                 /*
3744                  * queue up the release of the unmap to save the 1/6th of
3745                  * the CPU time used up by the iotlb flush operation...
3746                  */
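                     /*
                      * The freelist of page-table pages is stashed in the
                      * flush-queue entry; the pages are freed and the IOTLB
                      * flushed later, in batches, when the queue is drained.
                      */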
3747         }
3748
3749         trace_unmap_single(dev, dev_addr, size);
3750 }
3751
3752 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3753                              size_t size, enum dma_data_direction dir,
3754                              unsigned long attrs)
3755 {
3756         if (iommu_need_mapping(dev))
3757                 intel_unmap(dev, dev_addr, size);
3758         else
3759                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3760 }
3761
3762 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3763                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3764 {
3765         if (iommu_need_mapping(dev))
3766                 intel_unmap(dev, dev_addr, size);
3767 }
3768
3769 static void *intel_alloc_coherent(struct device *dev, size_t size,
3770                                   dma_addr_t *dma_handle, gfp_t flags,
3771                                   unsigned long attrs)
3772 {
3773         struct page *page = NULL;
3774         int order;
3775
3776         if (!iommu_need_mapping(dev))
3777                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3778
3779         size = PAGE_ALIGN(size);
3780         order = get_order(size);
3781
3782         if (gfpflags_allow_blocking(flags)) {
3783                 unsigned int count = size >> PAGE_SHIFT;
3784
3785                 page = dma_alloc_from_contiguous(dev, count, order,
3786                                                  flags & __GFP_NOWARN);
3787         }
3788
3789         if (!page)
3790                 page = alloc_pages(flags, order);
3791         if (!page)
3792                 return NULL;
3793         memset(page_address(page), 0, size);
3794
3795         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3796                                          DMA_BIDIRECTIONAL,
3797                                          dev->coherent_dma_mask);
3798         if (*dma_handle != DMA_MAPPING_ERROR)
3799                 return page_address(page);
3800         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3801                 __free_pages(page, order);
3802
3803         return NULL;
3804 }
3805
3806 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3807                                 dma_addr_t dma_handle, unsigned long attrs)
3808 {
3809         int order;
3810         struct page *page = virt_to_page(vaddr);
3811
3812         if (!iommu_need_mapping(dev))
3813                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3814
3815         size = PAGE_ALIGN(size);
3816         order = get_order(size);
3817
3818         intel_unmap(dev, dma_handle, size);
3819         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3820                 __free_pages(page, order);
3821 }
3822
3823 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3824                            int nelems, enum dma_data_direction dir,
3825                            unsigned long attrs)
3826 {
3827         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3828         unsigned long nrpages = 0;
3829         struct scatterlist *sg;
3830         int i;
3831
3832         if (!iommu_need_mapping(dev))
3833                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3834
3835         for_each_sg(sglist, sg, nelems, i) {
3836                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3837         }
3838
3839         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3840
3841         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3842 }
3843
3844 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3845                         enum dma_data_direction dir, unsigned long attrs)
3846 {
3847         int i;
3848         struct dmar_domain *domain;
3849         size_t size = 0;
3850         int prot = 0;
3851         unsigned long iova_pfn;
3852         int ret;
3853         struct scatterlist *sg;
3854         unsigned long start_vpfn;
3855         struct intel_iommu *iommu;
3856
3857         BUG_ON(dir == DMA_NONE);
3858         if (!iommu_need_mapping(dev))
3859                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3860
3861         domain = deferred_attach_domain(dev);
3862         if (!domain)
3863                 return 0;
3864
3865         iommu = domain_get_iommu(domain);
3866
3867         for_each_sg(sglist, sg, nelems, i)
3868                 size += aligned_nrpages(sg->offset, sg->length);
3869
3870         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3871                                 *dev->dma_mask);
3872         if (!iova_pfn) {
3873                 sglist->dma_length = 0;
3874                 return 0;
3875         }
3876
3877         /*
3878          * Check if DMAR supports zero-length reads on write-only
3879          * mappings.
3880          */
3881         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3882                         !cap_zlr(iommu->cap))
3883                 prot |= DMA_PTE_READ;
3884         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3885                 prot |= DMA_PTE_WRITE;
3886
3887         start_vpfn = mm_to_dma_pfn(iova_pfn);
3888
3889         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3890         if (unlikely(ret)) {
3891                 dma_pte_free_pagetable(domain, start_vpfn,
3892                                        start_vpfn + size - 1,
3893                                        agaw_to_level(domain->agaw) + 1);
3894                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3895                 return 0;
3896         }
3897
3898         for_each_sg(sglist, sg, nelems, i)
3899                 trace_map_sg(dev, i + 1, nelems, sg);
3900
3901         return nelems;
3902 }
3903
3904 static u64 intel_get_required_mask(struct device *dev)
3905 {
3906         if (!iommu_need_mapping(dev))
3907                 return dma_direct_get_required_mask(dev);
3908         return DMA_BIT_MASK(32);
3909 }
3910
3911 static const struct dma_map_ops intel_dma_ops = {
3912         .alloc = intel_alloc_coherent,
3913         .free = intel_free_coherent,
3914         .map_sg = intel_map_sg,
3915         .unmap_sg = intel_unmap_sg,
3916         .map_page = intel_map_page,
3917         .unmap_page = intel_unmap_page,
3918         .map_resource = intel_map_resource,
3919         .unmap_resource = intel_unmap_resource,
3920         .dma_supported = dma_direct_supported,
3921         .mmap = dma_common_mmap,
3922         .get_sgtable = dma_common_get_sgtable,
3923         .get_required_mask = intel_get_required_mask,
3924 };
3925
3926 static void
3927 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3928                    enum dma_data_direction dir, enum dma_sync_target target)
3929 {
3930         struct dmar_domain *domain;
3931         phys_addr_t tlb_addr;
3932
3933         domain = find_domain(dev);
3934         if (WARN_ON(!domain))
3935                 return;
3936
3937         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3938         if (is_swiotlb_buffer(tlb_addr))
3939                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3940 }
3941
3942 static dma_addr_t
3943 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3944                   enum dma_data_direction dir, unsigned long attrs,
3945                   u64 dma_mask)
3946 {
3947         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3948         struct dmar_domain *domain;
3949         struct intel_iommu *iommu;
3950         unsigned long iova_pfn;
3951         unsigned long nrpages;
3952         phys_addr_t tlb_addr;
3953         int prot = 0;
3954         int ret;
3955
3956         domain = deferred_attach_domain(dev);
3957         if (WARN_ON(dir == DMA_NONE || !domain))
3958                 return DMA_MAPPING_ERROR;
3959
3960         iommu = domain_get_iommu(domain);
3961         if (WARN_ON(!iommu))
3962                 return DMA_MAPPING_ERROR;
3963
3964         nrpages = aligned_nrpages(0, size);
3965         iova_pfn = intel_alloc_iova(dev, domain,
3966                                     dma_to_mm_pfn(nrpages), dma_mask);
3967         if (!iova_pfn)
3968                 return DMA_MAPPING_ERROR;
3969
3970         /*
3971          * Check if DMAR supports zero-length reads on write-only
3972          * mappings.
3973          */
3974         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3975                         !cap_zlr(iommu->cap))
3976                 prot |= DMA_PTE_READ;
3977         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3978                 prot |= DMA_PTE_WRITE;
3979
3980         /*
3981          * If both the physical buffer start address and size are
3982          * page aligned, we don't need to use a bounce page.
3983          */
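             /*
              * E.g. a 0x200-byte buffer in the middle of a page shares that
              * page with unrelated data, so it is bounced through a swiotlb
              * slot and the unused part of the VTD_PAGE_SIZE-aligned slot is
              * zeroed below, preventing a (possibly untrusted) device from
              * seeing stale neighbouring data.
              */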
3984         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3985                 tlb_addr = swiotlb_tbl_map_single(dev,
3986                                 __phys_to_dma(dev, io_tlb_start),
3987                                 paddr, size, aligned_size, dir, attrs);
3988                 if (tlb_addr == DMA_MAPPING_ERROR) {
3989                         goto swiotlb_error;
3990                 } else {
3991                         /* Clean up the padding area. */
3992                         void *padding_start = phys_to_virt(tlb_addr);
3993                         size_t padding_size = aligned_size;
3994
3995                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3996                             (dir == DMA_TO_DEVICE ||
3997                              dir == DMA_BIDIRECTIONAL)) {
3998                                 padding_start += size;
3999                                 padding_size -= size;
4000                         }
4001
4002                         memset(padding_start, 0, padding_size);
4003                 }
4004         } else {
4005                 tlb_addr = paddr;
4006         }
4007
4008         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4009                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4010         if (ret)
4011                 goto mapping_error;
4012
4013         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4014
4015         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4016
4017 mapping_error:
4018         if (is_swiotlb_buffer(tlb_addr))
4019                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4020                                          aligned_size, dir, attrs);
4021 swiotlb_error:
4022         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4023         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4024                 size, (unsigned long long)paddr, dir);
4025
4026         return DMA_MAPPING_ERROR;
4027 }
4028
4029 static void
4030 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4031                     enum dma_data_direction dir, unsigned long attrs)
4032 {
4033         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4034         struct dmar_domain *domain;
4035         phys_addr_t tlb_addr;
4036
4037         domain = find_domain(dev);
4038         if (WARN_ON(!domain))
4039                 return;
4040
4041         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4042         if (WARN_ON(!tlb_addr))
4043                 return;
4044
4045         intel_unmap(dev, dev_addr, size);
4046         if (is_swiotlb_buffer(tlb_addr))
4047                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4048                                          aligned_size, dir, attrs);
4049
4050         trace_bounce_unmap_single(dev, dev_addr, size);
4051 }
4052
4053 static dma_addr_t
4054 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4055                 size_t size, enum dma_data_direction dir, unsigned long attrs)
4056 {
4057         return bounce_map_single(dev, page_to_phys(page) + offset,
4058                                  size, dir, attrs, *dev->dma_mask);
4059 }
4060
4061 static dma_addr_t
4062 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4063                     enum dma_data_direction dir, unsigned long attrs)
4064 {
4065         return bounce_map_single(dev, phys_addr, size,
4066                                  dir, attrs, *dev->dma_mask);
4067 }
4068
4069 static void
4070 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4071                   enum dma_data_direction dir, unsigned long attrs)
4072 {
4073         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4074 }
4075
4076 static void
4077 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4078                       enum dma_data_direction dir, unsigned long attrs)
4079 {
4080         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4081 }
4082
4083 static void
4084 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4085                 enum dma_data_direction dir, unsigned long attrs)
4086 {
4087         struct scatterlist *sg;
4088         int i;
4089
4090         for_each_sg(sglist, sg, nelems, i)
4091                 bounce_unmap_page(dev, sg->dma_address,
4092                                   sg_dma_len(sg), dir, attrs);
4093 }
4094
4095 static int
4096 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4097               enum dma_data_direction dir, unsigned long attrs)
4098 {
4099         int i;
4100         struct scatterlist *sg;
4101
4102         for_each_sg(sglist, sg, nelems, i) {
4103                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4104                                                   sg->offset, sg->length,
4105                                                   dir, attrs);
4106                 if (sg->dma_address == DMA_MAPPING_ERROR)
4107                         goto out_unmap;
4108                 sg_dma_len(sg) = sg->length;
4109         }
4110
4111         for_each_sg(sglist, sg, nelems, i)
4112                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4113
4114         return nelems;
4115
4116 out_unmap:
4117         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4118         return 0;
4119 }
4120
4121 static void
4122 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4123                            size_t size, enum dma_data_direction dir)
4124 {
4125         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4126 }
4127
4128 static void
4129 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4130                               size_t size, enum dma_data_direction dir)
4131 {
4132         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4133 }
4134
4135 static void
4136 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4137                        int nelems, enum dma_data_direction dir)
4138 {
4139         struct scatterlist *sg;
4140         int i;
4141
4142         for_each_sg(sglist, sg, nelems, i)
4143                 bounce_sync_single(dev, sg_dma_address(sg),
4144                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4145 }
4146
4147 static void
4148 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4149                           int nelems, enum dma_data_direction dir)
4150 {
4151         struct scatterlist *sg;
4152         int i;
4153
4154         for_each_sg(sglist, sg, nelems, i)
4155                 bounce_sync_single(dev, sg_dma_address(sg),
4156                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4157 }
4158
4159 static const struct dma_map_ops bounce_dma_ops = {
4160         .alloc                  = intel_alloc_coherent,
4161         .free                   = intel_free_coherent,
4162         .map_sg                 = bounce_map_sg,
4163         .unmap_sg               = bounce_unmap_sg,
4164         .map_page               = bounce_map_page,
4165         .unmap_page             = bounce_unmap_page,
4166         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4167         .sync_single_for_device = bounce_sync_single_for_device,
4168         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4169         .sync_sg_for_device     = bounce_sync_sg_for_device,
4170         .map_resource           = bounce_map_resource,
4171         .unmap_resource         = bounce_unmap_resource,
4172         .dma_supported          = dma_direct_supported,
4173 };
4174
4175 static inline int iommu_domain_cache_init(void)
4176 {
4177         int ret = 0;
4178
4179         iommu_domain_cache = kmem_cache_create("iommu_domain",
4180                                          sizeof(struct dmar_domain),
4181                                          0,
4182                                          SLAB_HWCACHE_ALIGN,
4184                                          NULL);
4185         if (!iommu_domain_cache) {
4186                 pr_err("Couldn't create iommu_domain cache\n");
4187                 ret = -ENOMEM;
4188         }
4189
4190         return ret;
4191 }
4192
4193 static inline int iommu_devinfo_cache_init(void)
4194 {
4195         int ret = 0;
4196
4197         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4198                                          sizeof(struct device_domain_info),
4199                                          0,
4200                                          SLAB_HWCACHE_ALIGN,
4201                                          NULL);
4202         if (!iommu_devinfo_cache) {
4203                 pr_err("Couldn't create devinfo cache\n");
4204                 ret = -ENOMEM;
4205         }
4206
4207         return ret;
4208 }
4209
4210 static int __init iommu_init_mempool(void)
4211 {
4212         int ret;
4213         ret = iova_cache_get();
4214         if (ret)
4215                 return ret;
4216
4217         ret = iommu_domain_cache_init();
4218         if (ret)
4219                 goto domain_error;
4220
4221         ret = iommu_devinfo_cache_init();
4222         if (!ret)
4223                 return ret;
4224
4225         kmem_cache_destroy(iommu_domain_cache);
4226 domain_error:
4227         iova_cache_put();
4228
4229         return -ENOMEM;
4230 }
4231
4232 static void __init iommu_exit_mempool(void)
4233 {
4234         kmem_cache_destroy(iommu_devinfo_cache);
4235         kmem_cache_destroy(iommu_domain_cache);
4236         iova_cache_put();
4237 }
4238
4239 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4240 {
4241         struct dmar_drhd_unit *drhd;
4242         u32 vtbar;
4243         int rc;
4244
4245         /* We know that this device on this chipset has its own IOMMU.
4246          * If we find it under a different IOMMU, then the BIOS is lying
4247          * to us. Hope that the IOMMU for this device is actually
4248          * disabled, and it needs no translation...
4249          */
4250         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4251         if (rc) {
4252                 /* "can't" happen */
4253                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4254                 return;
4255         }
4256         vtbar &= 0xffff0000;
4257
4258         /* we know that this iommu should be at offset 0xa000 from vtbar */
4259         drhd = dmar_find_matched_drhd_unit(pdev);
4260         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4261                             TAINT_FIRMWARE_WORKAROUND,
4262                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4263                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4264 }
4265 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4266
4267 static void __init init_no_remapping_devices(void)
4268 {
4269         struct dmar_drhd_unit *drhd;
4270         struct device *dev;
4271         int i;
4272
4273         for_each_drhd_unit(drhd) {
4274                 if (!drhd->include_all) {
4275                         for_each_active_dev_scope(drhd->devices,
4276                                                   drhd->devices_cnt, i, dev)
4277                                 break;
4278                         /* ignore DMAR unit if no devices exist */
4279                         if (i == drhd->devices_cnt)
4280                                 drhd->ignored = 1;
4281                 }
4282         }
4283
4284         for_each_active_drhd_unit(drhd) {
4285                 if (drhd->include_all)
4286                         continue;
4287
4288                 for_each_active_dev_scope(drhd->devices,
4289                                           drhd->devices_cnt, i, dev)
4290                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4291                                 break;
4292                 if (i < drhd->devices_cnt)
4293                         continue;
4294
4295                 /* This IOMMU has *only* gfx devices. Either bypass it or
4296                    set the gfx_mapped flag, as appropriate */
4297                 if (!dmar_map_gfx) {
4298                         drhd->ignored = 1;
4299                         for_each_active_dev_scope(drhd->devices,
4300                                                   drhd->devices_cnt, i, dev)
4301                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4302                 }
4303         }
4304 }
4305
4306 #ifdef CONFIG_SUSPEND
4307 static int init_iommu_hw(void)
4308 {
4309         struct dmar_drhd_unit *drhd;
4310         struct intel_iommu *iommu = NULL;
4311
4312         for_each_active_iommu(iommu, drhd)
4313                 if (iommu->qi)
4314                         dmar_reenable_qi(iommu);
4315
4316         for_each_iommu(iommu, drhd) {
4317                 if (drhd->ignored) {
4318                         /*
4319                          * we always have to disable PMRs or DMA may fail on
4320                          * this device
4321                          */
4322                         if (force_on)
4323                                 iommu_disable_protect_mem_regions(iommu);
4324                         continue;
4325                 }
4326
4327                 iommu_flush_write_buffer(iommu);
4328
4329                 iommu_set_root_entry(iommu);
4330
4331                 iommu->flush.flush_context(iommu, 0, 0, 0,
4332                                            DMA_CCMD_GLOBAL_INVL);
4333                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4334                 iommu_enable_translation(iommu);
4335                 iommu_disable_protect_mem_regions(iommu);
4336         }
4337
4338         return 0;
4339 }
4340
4341 static void iommu_flush_all(void)
4342 {
4343         struct dmar_drhd_unit *drhd;
4344         struct intel_iommu *iommu;
4345
4346         for_each_active_iommu(iommu, drhd) {
4347                 iommu->flush.flush_context(iommu, 0, 0, 0,
4348                                            DMA_CCMD_GLOBAL_INVL);
4349                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4350                                          DMA_TLB_GLOBAL_FLUSH);
4351         }
4352 }
4353
4354 static int iommu_suspend(void)
4355 {
4356         struct dmar_drhd_unit *drhd;
4357         struct intel_iommu *iommu = NULL;
4358         unsigned long flag;
4359
4360         for_each_active_iommu(iommu, drhd) {
4361                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4362                                                  GFP_ATOMIC);
4363                 if (!iommu->iommu_state)
4364                         goto nomem;
4365         }
4366
4367         iommu_flush_all();
4368
4369         for_each_active_iommu(iommu, drhd) {
4370                 iommu_disable_translation(iommu);
4371
4372                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
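                     /*
                      * Save the fault-event interrupt programming (control,
                      * data, address and upper-address registers); everything
                      * else is re-established by init_iommu_hw() on resume.
                      */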
4373
4374                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4375                         readl(iommu->reg + DMAR_FECTL_REG);
4376                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4377                         readl(iommu->reg + DMAR_FEDATA_REG);
4378                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4379                         readl(iommu->reg + DMAR_FEADDR_REG);
4380                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4381                         readl(iommu->reg + DMAR_FEUADDR_REG);
4382
4383                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4384         }
4385         return 0;
4386
4387 nomem:
4388         for_each_active_iommu(iommu, drhd)
4389                 kfree(iommu->iommu_state);
4390
4391         return -ENOMEM;
4392 }
4393
4394 static void iommu_resume(void)
4395 {
4396         struct dmar_drhd_unit *drhd;
4397         struct intel_iommu *iommu = NULL;
4398         unsigned long flag;
4399
4400         if (init_iommu_hw()) {
4401                 if (force_on)
4402                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4403                 else
4404                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4405                 return;
4406         }
4407
4408         for_each_active_iommu(iommu, drhd) {
4409
4410                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4411
4412                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4413                         iommu->reg + DMAR_FECTL_REG);
4414                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4415                         iommu->reg + DMAR_FEDATA_REG);
4416                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4417                         iommu->reg + DMAR_FEADDR_REG);
4418                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4419                         iommu->reg + DMAR_FEUADDR_REG);
4420
4421                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4422         }
4423
4424         for_each_active_iommu(iommu, drhd)
4425                 kfree(iommu->iommu_state);
4426 }
4427
4428 static struct syscore_ops iommu_syscore_ops = {
4429         .resume         = iommu_resume,
4430         .suspend        = iommu_suspend,
4431 };
4432
4433 static void __init init_iommu_pm_ops(void)
4434 {
4435         register_syscore_ops(&iommu_syscore_ops);
4436 }
4437
4438 #else
4439 static inline void init_iommu_pm_ops(void) {}
4440 #endif  /* CONFIG_SUSPEND */
4441
4442 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4443 {
4444         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4445             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4446             rmrr->end_address <= rmrr->base_address ||
4447             arch_rmrr_sanity_check(rmrr))
4448                 return -EINVAL;
4449
4450         return 0;
4451 }
4452
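     /*
      * Parse one ACPI RMRR (Reserved Memory Region Reporting) structure:
      * warn and taint the kernel if the firmware reports a malformed
      * region, then record the unit and its device scope on
      * dmar_rmrr_units.
      */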
4453 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4454 {
4455         struct acpi_dmar_reserved_memory *rmrr;
4456         struct dmar_rmrr_unit *rmrru;
4457
4458         rmrr = (struct acpi_dmar_reserved_memory *)header;
4459         if (rmrr_sanity_check(rmrr))
4460                 WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
4461                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4462                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4463                            rmrr->base_address, rmrr->end_address,
4464                            dmi_get_system_info(DMI_BIOS_VENDOR),
4465                            dmi_get_system_info(DMI_BIOS_VERSION),
4466                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4467
4468         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4469         if (!rmrru)
4470                 goto out;
4471
4472         rmrru->hdr = header;
4473
4474         rmrru->base_address = rmrr->base_address;
4475         rmrru->end_address = rmrr->end_address;
4476
4477         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4478                                 ((void *)rmrr) + rmrr->header.length,
4479                                 &rmrru->devices_cnt);
4480         if (rmrru->devices_cnt && rmrru->devices == NULL)
4481                 goto free_rmrru;
4482
4483         list_add(&rmrru->list, &dmar_rmrr_units);
4484
4485         return 0;
4486 free_rmrru:
4487         kfree(rmrru);
4488 out:
4489         return -ENOMEM;
4490 }
4491
4492 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4493 {
4494         struct dmar_atsr_unit *atsru;
4495         struct acpi_dmar_atsr *tmp;
4496
4497         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4498                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4499                 if (atsr->segment != tmp->segment)
4500                         continue;
4501                 if (atsr->header.length != tmp->header.length)
4502                         continue;
4503                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4504                         return atsru;
4505         }
4506
4507         return NULL;
4508 }
4509
4510 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4511 {
4512         struct acpi_dmar_atsr *atsr;
4513         struct dmar_atsr_unit *atsru;
4514
4515         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4516                 return 0;
4517
4518         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4519         atsru = dmar_find_atsr(atsr);
4520         if (atsru)
4521                 return 0;
4522
4523         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4524         if (!atsru)
4525                 return -ENOMEM;
4526
4527         /*
4528          * If memory is allocated from slab by ACPI _DSM method, we need to
4529          * copy the memory content because the memory buffer will be freed
4530          * on return.
4531          */
4532         atsru->hdr = (void *)(atsru + 1);
4533         memcpy(atsru->hdr, hdr, hdr->length);
4534         atsru->include_all = atsr->flags & 0x1;
4535         if (!atsru->include_all) {
4536                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4537                                 (void *)atsr + atsr->header.length,
4538                                 &atsru->devices_cnt);
4539                 if (atsru->devices_cnt && atsru->devices == NULL) {
4540                         kfree(atsru);
4541                         return -ENOMEM;
4542                 }
4543         }
4544
4545         list_add_rcu(&atsru->list, &dmar_atsr_units);
4546
4547         return 0;
4548 }
4549
4550 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4551 {
4552         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4553         kfree(atsru);
4554 }
4555
4556 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4557 {
4558         struct acpi_dmar_atsr *atsr;
4559         struct dmar_atsr_unit *atsru;
4560
4561         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4562         atsru = dmar_find_atsr(atsr);
4563         if (atsru) {
4564                 list_del_rcu(&atsru->list);
4565                 synchronize_rcu();
4566                 intel_iommu_free_atsr(atsru);
4567         }
4568
4569         return 0;
4570 }
4571
4572 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4573 {
4574         int i;
4575         struct device *dev;
4576         struct acpi_dmar_atsr *atsr;
4577         struct dmar_atsr_unit *atsru;
4578
4579         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4580         atsru = dmar_find_atsr(atsr);
4581         if (!atsru)
4582                 return 0;
4583
4584         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4585                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4586                                           i, dev)
4587                         return -EBUSY;
4588         }
4589
4590         return 0;
4591 }
4592
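     /*
      * Bring up a hot-added DMAR unit: check its capabilities against the
      * running configuration, allocate domain IDs and a root entry, set
      * up queued invalidation and the fault interrupt, and finally enable
      * translation.
      */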
4593 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4594 {
4595         int sp, ret;
4596         struct intel_iommu *iommu = dmaru->iommu;
4597
4598         if (g_iommus[iommu->seq_id])
4599                 return 0;
4600
4601         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4602                 pr_warn("%s: Doesn't support hardware pass through.\n",
4603                         iommu->name);
4604                 return -ENXIO;
4605         }
4606         if (!ecap_sc_support(iommu->ecap) &&
4607             domain_update_iommu_snooping(iommu)) {
4608                 pr_warn("%s: Doesn't support snooping.\n",
4609                         iommu->name);
4610                 return -ENXIO;
4611         }
4612         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4613         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4614                 pr_warn("%s: Doesn't support large page.\n",
4615                         iommu->name);
4616                 return -ENXIO;
4617         }
4618
4619         /*
4620          * Disable translation if already enabled prior to OS handover.
4621          */
4622         if (iommu->gcmd & DMA_GCMD_TE)
4623                 iommu_disable_translation(iommu);
4624
4625         g_iommus[iommu->seq_id] = iommu;
4626         ret = iommu_init_domains(iommu);
4627         if (ret == 0)
4628                 ret = iommu_alloc_root_entry(iommu);
4629         if (ret)
4630                 goto out;
4631
4632         intel_svm_check(iommu);
4633
4634         if (dmaru->ignored) {
4635                 /*
4636                  * We always have to disable PMRs, or DMA may fail on this device.
4637                  */
4638                 if (force_on)
4639                         iommu_disable_protect_mem_regions(iommu);
4640                 return 0;
4641         }
4642
4643         intel_iommu_init_qi(iommu);
4644         iommu_flush_write_buffer(iommu);
4645
4646 #ifdef CONFIG_INTEL_IOMMU_SVM
4647         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4648                 ret = intel_svm_enable_prq(iommu);
4649                 if (ret)
4650                         goto disable_iommu;
4651         }
4652 #endif
4653         ret = dmar_set_interrupt(iommu);
4654         if (ret)
4655                 goto disable_iommu;
4656
4657         iommu_set_root_entry(iommu);
4658         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4659         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4660         iommu_enable_translation(iommu);
4661
4662         iommu_disable_protect_mem_regions(iommu);
4663         return 0;
4664
4665 disable_iommu:
4666         disable_dmar_iommu(iommu);
4667 out:
4668         free_dmar_iommu(iommu);
4669         return ret;
4670 }
4671
4672 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4673 {
4674         int ret = 0;
4675         struct intel_iommu *iommu = dmaru->iommu;
4676
4677         if (!intel_iommu_enabled)
4678                 return 0;
4679         if (iommu == NULL)
4680                 return -EINVAL;
4681
4682         if (insert) {
4683                 ret = intel_iommu_add(dmaru);
4684         } else {
4685                 disable_dmar_iommu(iommu);
4686                 free_dmar_iommu(iommu);
4687         }
4688
4689         return ret;
4690 }
4691
4692 static void intel_iommu_free_dmars(void)
4693 {
4694         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4695         struct dmar_atsr_unit *atsru, *atsr_n;
4696
4697         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4698                 list_del(&rmrru->list);
4699                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4700                 kfree(rmrru);
4701         }
4702
4703         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4704                 list_del(&atsru->list);
4705                 intel_iommu_free_atsr(atsru);
4706         }
4707 }
4708
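     /*
      * Return 1 if ATS may be used for @dev: either the device is
      * integrated (no upstream root port) or its root port is covered by
      * an ATSR unit on the device's PCI segment.  Return 0 otherwise.
      */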
4709 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4710 {
4711         int i, ret = 1;
4712         struct pci_bus *bus;
4713         struct pci_dev *bridge = NULL;
4714         struct device *tmp;
4715         struct acpi_dmar_atsr *atsr;
4716         struct dmar_atsr_unit *atsru;
4717
4718         dev = pci_physfn(dev);
4719         for (bus = dev->bus; bus; bus = bus->parent) {
4720                 bridge = bus->self;
4721                 /* If it's an integrated device, allow ATS */
4722                 if (!bridge)
4723                         return 1;
4724                 /* Connected via non-PCIe: no ATS */
4725                 if (!pci_is_pcie(bridge) ||
4726                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4727                         return 0;
4728                 /* If we found the root port, look it up in the ATSR */
4729                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4730                         break;
4731         }
4732
4733         rcu_read_lock();
4734         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4735                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4736                 if (atsr->segment != pci_domain_nr(dev->bus))
4737                         continue;
4738
4739                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4740                         if (tmp == &bridge->dev)
4741                                 goto out;
4742
4743                 if (atsru->include_all)
4744                         goto out;
4745         }
4746         ret = 0;
4747 out:
4748         rcu_read_unlock();
4749
4750         return ret;
4751 }
4752
4753 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4754 {
4755         int ret;
4756         struct dmar_rmrr_unit *rmrru;
4757         struct dmar_atsr_unit *atsru;
4758         struct acpi_dmar_atsr *atsr;
4759         struct acpi_dmar_reserved_memory *rmrr;
4760
4761         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4762                 return 0;
4763
4764         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4765                 rmrr = container_of(rmrru->hdr,
4766                                     struct acpi_dmar_reserved_memory, header);
4767                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4768                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4769                                 ((void *)rmrr) + rmrr->header.length,
4770                                 rmrr->segment, rmrru->devices,
4771                                 rmrru->devices_cnt);
4772                         if (ret < 0)
4773                                 return ret;
4774                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4775                         dmar_remove_dev_scope(info, rmrr->segment,
4776                                 rmrru->devices, rmrru->devices_cnt);
4777                 }
4778         }
4779
4780         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4781                 if (atsru->include_all)
4782                         continue;
4783
4784                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4785                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4786                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4787                                         (void *)atsr + atsr->header.length,
4788                                         atsr->segment, atsru->devices,
4789                                         atsru->devices_cnt);
4790                         if (ret > 0)
4791                                 break;
4792                         else if (ret < 0)
4793                                 return ret;
4794                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4795                         if (dmar_remove_dev_scope(info, atsr->segment,
4796                                         atsru->devices, atsru->devices_cnt))
4797                                 break;
4798                 }
4799         }
4800
4801         return 0;
4802 }
4803
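     /*
      * Memory hotplug notifier: extend the static identity map
      * (si_domain) when a memory block goes online, and tear down the
      * corresponding IOVA ranges, page tables and IOTLB entries when it
      * goes offline again.
      */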
4804 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4805                                        unsigned long val, void *v)
4806 {
4807         struct memory_notify *mhp = v;
4808         unsigned long long start, end;
4809         unsigned long start_vpfn, last_vpfn;
4810
4811         switch (val) {
4812         case MEM_GOING_ONLINE:
4813                 start = mhp->start_pfn << PAGE_SHIFT;
4814                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4815                 if (iommu_domain_identity_map(si_domain, start, end)) {
4816                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4817                                 start, end);
4818                         return NOTIFY_BAD;
4819                 }
4820                 break;
4821
4822         case MEM_OFFLINE:
4823         case MEM_CANCEL_ONLINE:
4824                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4825                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4826                 while (start_vpfn <= last_vpfn) {
4827                         struct iova *iova;
4828                         struct dmar_drhd_unit *drhd;
4829                         struct intel_iommu *iommu;
4830                         struct page *freelist;
4831
4832                         iova = find_iova(&si_domain->iovad, start_vpfn);
4833                         if (iova == NULL) {
4834                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4835                                          start_vpfn);
4836                                 break;
4837                         }
4838
4839                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4840                                                      start_vpfn, last_vpfn);
4841                         if (iova == NULL) {
4842                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4843                                         start_vpfn, last_vpfn);
4844                                 return NOTIFY_BAD;
4845                         }
4846
4847                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4848                                                iova->pfn_hi);
4849
4850                         rcu_read_lock();
4851                         for_each_active_iommu(iommu, drhd)
4852                                 iommu_flush_iotlb_psi(iommu, si_domain,
4853                                         iova->pfn_lo, iova_size(iova),
4854                                         !freelist, 0);
4855                         rcu_read_unlock();
4856                         dma_free_pagelist(freelist);
4857
4858                         start_vpfn = iova->pfn_hi + 1;
4859                         free_iova_mem(iova);
4860                 }
4861                 break;
4862         }
4863
4864         return NOTIFY_OK;
4865 }
4866
4867 static struct notifier_block intel_iommu_memory_nb = {
4868         .notifier_call = intel_iommu_memory_notifier,
4869         .priority = 0
4870 };
4871
4872 static void free_all_cpu_cached_iovas(unsigned int cpu)
4873 {
4874         int i;
4875
4876         for (i = 0; i < g_num_of_iommus; i++) {
4877                 struct intel_iommu *iommu = g_iommus[i];
4878                 struct dmar_domain *domain;
4879                 int did;
4880
4881                 if (!iommu)
4882                         continue;
4883
4884                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4885                         domain = get_iommu_domain(iommu, (u16)did);
4886
4887                         if (!domain)
4888                                 continue;
4889                         free_cpu_cached_iovas(cpu, &domain->iovad);
4890                 }
4891         }
4892 }
4893
4894 static int intel_iommu_cpu_dead(unsigned int cpu)
4895 {
4896         free_all_cpu_cached_iovas(cpu);
4897         return 0;
4898 }
4899
4900 static void intel_disable_iommus(void)
4901 {
4902         struct intel_iommu *iommu = NULL;
4903         struct dmar_drhd_unit *drhd;
4904
4905         for_each_iommu(iommu, drhd)
4906                 iommu_disable_translation(iommu);
4907 }
4908
4909 void intel_iommu_shutdown(void)
4910 {
4911         struct dmar_drhd_unit *drhd;
4912         struct intel_iommu *iommu = NULL;
4913
4914         if (no_iommu || dmar_disabled)
4915                 return;
4916
4917         down_write(&dmar_global_lock);
4918
4919         /* Disable PMRs explicitly here. */
4920         for_each_iommu(iommu, drhd)
4921                 iommu_disable_protect_mem_regions(iommu);
4922
4923         /* Make sure the IOMMUs are switched off */
4924         intel_disable_iommus();
4925
4926         up_write(&dmar_global_lock);
4927 }
4928
4929 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4930 {
4931         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4932
4933         return container_of(iommu_dev, struct intel_iommu, iommu);
4934 }
4935
4936 static ssize_t intel_iommu_show_version(struct device *dev,
4937                                         struct device_attribute *attr,
4938                                         char *buf)
4939 {
4940         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4941         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4942         return sprintf(buf, "%d:%d\n",
4943                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4944 }
4945 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4946
4947 static ssize_t intel_iommu_show_address(struct device *dev,
4948                                         struct device_attribute *attr,
4949                                         char *buf)
4950 {
4951         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4952         return sprintf(buf, "%llx\n", iommu->reg_phys);
4953 }
4954 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4955
4956 static ssize_t intel_iommu_show_cap(struct device *dev,
4957                                     struct device_attribute *attr,
4958                                     char *buf)
4959 {
4960         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4961         return sprintf(buf, "%llx\n", iommu->cap);
4962 }
4963 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4964
4965 static ssize_t intel_iommu_show_ecap(struct device *dev,
4966                                     struct device_attribute *attr,
4967                                     char *buf)
4968 {
4969         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4970         return sprintf(buf, "%llx\n", iommu->ecap);
4971 }
4972 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4973
4974 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4975                                       struct device_attribute *attr,
4976                                       char *buf)
4977 {
4978         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4979         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4980 }
4981 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4982
4983 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4984                                            struct device_attribute *attr,
4985                                            char *buf)
4986 {
4987         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4988         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4989                                                   cap_ndoms(iommu->cap)));
4990 }
4991 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4992
4993 static struct attribute *intel_iommu_attrs[] = {
4994         &dev_attr_version.attr,
4995         &dev_attr_address.attr,
4996         &dev_attr_cap.attr,
4997         &dev_attr_ecap.attr,
4998         &dev_attr_domains_supported.attr,
4999         &dev_attr_domains_used.attr,
5000         NULL,
5001 };
5002
5003 static struct attribute_group intel_iommu_group = {
5004         .name = "intel-iommu",
5005         .attrs = intel_iommu_attrs,
5006 };
5007
5008 const struct attribute_group *intel_iommu_groups[] = {
5009         &intel_iommu_group,
5010         NULL,
5011 };
5012
5013 static inline bool has_untrusted_dev(void)
5014 {
5015         struct pci_dev *pdev = NULL;
5016
5017         for_each_pci_dev(pdev)
5018                 if (pdev->untrusted)
5019                         return true;
5020
5021         return false;
5022 }
5023
5024 static int __init platform_optin_force_iommu(void)
5025 {
5026         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5027                 return 0;
5028
5029         if (no_iommu || dmar_disabled)
5030                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5031
5032         /*
5033          * If Intel-IOMMU is disabled by default, we will apply identity
5034          * map for all devices except those marked as being untrusted.
5035          */
5036         if (dmar_disabled)
5037                 iommu_set_default_passthrough(false);
5038
5039         dmar_disabled = 0;
5040         no_iommu = 0;
5041
5042         return 1;
5043 }
5044
5045 static int __init probe_acpi_namespace_devices(void)
5046 {
5047         struct dmar_drhd_unit *drhd;
5048         /* To avoid a -Wunused-but-set-variable warning. */
5049         struct intel_iommu *iommu __maybe_unused;
5050         struct device *dev;
5051         int i, ret = 0;
5052
5053         for_each_active_iommu(iommu, drhd) {
5054                 for_each_active_dev_scope(drhd->devices,
5055                                           drhd->devices_cnt, i, dev) {
5056                         struct acpi_device_physical_node *pn;
5057                         struct iommu_group *group;
5058                         struct acpi_device *adev;
5059
5060                         if (dev->bus != &acpi_bus_type)
5061                                 continue;
5062
5063                         adev = to_acpi_device(dev);
5064                         mutex_lock(&adev->physical_node_lock);
5065                         list_for_each_entry(pn,
5066                                             &adev->physical_node_list, node) {
5067                                 group = iommu_group_get(pn->dev);
5068                                 if (group) {
5069                                         iommu_group_put(group);
5070                                         continue;
5071                                 }
5072
5073                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5074                                 ret = iommu_probe_device(pn->dev);
5075                                 if (ret)
5076                                         break;
5077                         }
5078                         mutex_unlock(&adev->physical_node_lock);
5079
5080                         if (ret)
5081                                 return ret;
5082                 }
5083         }
5084
5085         return 0;
5086 }
5087
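     /*
      * Main initialization entry point: parse the DMAR tables, set up the
      * remapping hardware via init_dmars(), register sysfs attributes,
      * notifiers and the IOMMU ops, then enable translation on all
      * non-ignored units.
      */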
5088 int __init intel_iommu_init(void)
5089 {
5090         int ret = -ENODEV;
5091         struct dmar_drhd_unit *drhd;
5092         struct intel_iommu *iommu;
5093
5094         /*
5095          * Intel IOMMU is required for a TXT/tboot launch or platform
5096          * opt in, so enforce that.
5097          */
5098         force_on = tboot_force_iommu() || platform_optin_force_iommu();
5099
5100         if (iommu_init_mempool()) {
5101                 if (force_on)
5102                         panic("tboot: Failed to initialize iommu memory\n");
5103                 return -ENOMEM;
5104         }
5105
5106         down_write(&dmar_global_lock);
5107         if (dmar_table_init()) {
5108                 if (force_on)
5109                         panic("tboot: Failed to initialize DMAR table\n");
5110                 goto out_free_dmar;
5111         }
5112
5113         if (dmar_dev_scope_init() < 0) {
5114                 if (force_on)
5115                         panic("tboot: Failed to initialize DMAR device scope\n");
5116                 goto out_free_dmar;
5117         }
5118
5119         up_write(&dmar_global_lock);
5120
5121         /*
5122          * The bus notifier takes the dmar_global_lock, so lockdep will
5123          * complain later when we register it under the lock.
5124          */
5125         dmar_register_bus_notifier();
5126
5127         down_write(&dmar_global_lock);
5128
5129         if (no_iommu || dmar_disabled) {
5130                 /*
5131                  * We exit the function here to ensure the IOMMU's remapping and
5132                  * mempool aren't set up, which means that the IOMMU's PMRs
5133                  * won't be disabled via the call to init_dmars(). So disable
5134                  * them explicitly here. The PMRs were set up by tboot prior to
5135                  * calling SENTER, but the kernel is expected to reset/tear
5136                  * down the PMRs.
5137                  */
5138                 if (intel_iommu_tboot_noforce) {
5139                         for_each_iommu(iommu, drhd)
5140                                 iommu_disable_protect_mem_regions(iommu);
5141                 }
5142
5143                 /*
5144                  * Make sure the IOMMUs are switched off, even when we
5145                  * boot into a kexec kernel and the previous kernel left
5146                  * them enabled
5147                  */
5148                 intel_disable_iommus();
5149                 goto out_free_dmar;
5150         }
5151
5152         if (list_empty(&dmar_rmrr_units))
5153                 pr_info("No RMRR found\n");
5154
5155         if (list_empty(&dmar_atsr_units))
5156                 pr_info("No ATSR found\n");
5157
5158         if (dmar_init_reserved_ranges()) {
5159                 if (force_on)
5160                         panic("tboot: Failed to reserve iommu ranges\n");
5161                 goto out_free_reserved_range;
5162         }
5163
5164         if (dmar_map_gfx)
5165                 intel_iommu_gfx_mapped = 1;
5166
5167         init_no_remapping_devices();
5168
5169         ret = init_dmars();
5170         if (ret) {
5171                 if (force_on)
5172                         panic("tboot: Failed to initialize DMARs\n");
5173                 pr_err("Initialization failed\n");
5174                 goto out_free_reserved_range;
5175         }
5176         up_write(&dmar_global_lock);
5177
5178 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5179         /*
5180          * If the system has no untrusted device or the user has decided
5181          * to disable the bounce page mechanisms, we don't need swiotlb.
5182          * Mark this and the pre-allocated bounce pages will be released
5183          * later.
5184          */
5185         if (!has_untrusted_dev() || intel_no_bounce)
5186                 swiotlb = 0;
5187 #endif
5188         dma_ops = &intel_dma_ops;
5189
5190         init_iommu_pm_ops();
5191
5192         for_each_active_iommu(iommu, drhd) {
5193                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5194                                        intel_iommu_groups,
5195                                        "%s", iommu->name);
5196                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5197                 iommu_device_register(&iommu->iommu);
5198         }
5199
5200         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5201         if (si_domain && !hw_pass_through)
5202                 register_memory_notifier(&intel_iommu_memory_nb);
5203         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5204                           intel_iommu_cpu_dead);
5205
5206         down_read(&dmar_global_lock);
5207         if (probe_acpi_namespace_devices())
5208                 pr_warn("ACPI name space devices didn't probe correctly\n");
5209         up_read(&dmar_global_lock);
5210
5211         /* Finally, we enable the DMA remapping hardware. */
5212         for_each_iommu(iommu, drhd) {
5213                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5214                         iommu_enable_translation(iommu);
5215
5216                 iommu_disable_protect_mem_regions(iommu);
5217         }
5218         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5219
5220         intel_iommu_enabled = 1;
5221         intel_iommu_debugfs_init();
5222
5223         return 0;
5224
5225 out_free_reserved_range:
5226         put_iova_domain(&reserved_iova_list);
5227 out_free_dmar:
5228         intel_iommu_free_dmars();
5229         up_write(&dmar_global_lock);
5230         iommu_exit_mempool();
5231         return ret;
5232 }
5233
5234 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5235 {
5236         struct intel_iommu *iommu = opaque;
5237
5238         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5239         return 0;
5240 }
5241
5242 /*
5243  * NB - intel-iommu lacks any sort of reference counting for the users of
5244  * dependent devices.  If multiple endpoints have intersecting dependent
5245  * devices, unbinding the driver from any one of them will possibly leave
5246  * the others unable to operate.
5247  */
5248 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5249 {
5250         if (!iommu || !dev || !dev_is_pci(dev))
5251                 return;
5252
5253         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5254 }
5255
5256 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5257 {
5258         struct dmar_domain *domain;
5259         struct intel_iommu *iommu;
5260         unsigned long flags;
5261
5262         assert_spin_locked(&device_domain_lock);
5263
5264         if (WARN_ON(!info))
5265                 return;
5266
5267         iommu = info->iommu;
5268         domain = info->domain;
5269
5270         if (info->dev) {
5271                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5272                         intel_pasid_tear_down_entry(iommu, info->dev,
5273                                         PASID_RID2PASID);
5274
5275                 iommu_disable_dev_iotlb(info);
5276                 domain_context_clear(iommu, info->dev);
5277                 intel_pasid_free_table(info->dev);
5278         }
5279
5280         unlink_domain_info(info);
5281
5282         spin_lock_irqsave(&iommu->lock, flags);
5283         domain_detach_iommu(domain, iommu);
5284         spin_unlock_irqrestore(&iommu->lock, flags);
5285
5286         /* free the private domain */
5287         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5288             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5289             list_empty(&domain->devices))
5290                 domain_exit(info->domain);
5291
5292         free_devinfo_mem(info);
5293 }
5294
5295 static void dmar_remove_one_dev_info(struct device *dev)
5296 {
5297         struct device_domain_info *info;
5298         unsigned long flags;
5299
5300         spin_lock_irqsave(&device_domain_lock, flags);
5301         info = dev->archdata.iommu;
5302         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5303             && info != DUMMY_DEVICE_DOMAIN_INFO)
5304                 __dmar_remove_one_dev_info(info);
5305         spin_unlock_irqrestore(&device_domain_lock, flags);
5306 }
5307
5308 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5309 {
5310         int adjust_width;
5311
5312         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5313         domain_reserve_special_ranges(domain);
5314
5315         /* calculate AGAW */
5316         domain->gaw = guest_width;
5317         adjust_width = guestwidth_to_adjustwidth(guest_width);
5318         domain->agaw = width_to_agaw(adjust_width);
5319
5320         domain->iommu_coherency = 0;
5321         domain->iommu_snooping = 0;
5322         domain->iommu_superpage = 0;
5323         domain->max_addr = 0;
5324
5325         /* always allocate the top pgd */
5326         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5327         if (!domain->pgd)
5328                 return -ENOMEM;
5329         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5330         return 0;
5331 }
5332
5333 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5334 {
5335         struct dmar_domain *dmar_domain;
5336         struct iommu_domain *domain;
5337         int ret;
5338
5339         switch (type) {
5340         case IOMMU_DOMAIN_DMA:
5341         /* fallthrough */
5342         case IOMMU_DOMAIN_UNMANAGED:
5343                 dmar_domain = alloc_domain(0);
5344                 if (!dmar_domain) {
5345                         pr_err("Can't allocate dmar_domain\n");
5346                         return NULL;
5347                 }
5348                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5349                         pr_err("Domain initialization failed\n");
5350                         domain_exit(dmar_domain);
5351                         return NULL;
5352                 }
5353
5354                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5355                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5356                                                     iommu_flush_iova,
5357                                                     iova_entry_free);
5358                         if (ret)
5359                                 pr_info("iova flush queue initialization failed\n");
5360                 }
5361
5362                 domain_update_iommu_cap(dmar_domain);
5363
5364                 domain = &dmar_domain->domain;
5365                 domain->geometry.aperture_start = 0;
5366                 domain->geometry.aperture_end   =
5367                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5368                 domain->geometry.force_aperture = true;
5369
5370                 return domain;
5371         case IOMMU_DOMAIN_IDENTITY:
5372                 return &si_domain->domain;
5373         default:
5374                 return NULL;
5375         }
5376
5377         return NULL;
5378 }
5379
5380 static void intel_iommu_domain_free(struct iommu_domain *domain)
5381 {
5382         if (domain != &si_domain->domain)
5383                 domain_exit(to_dmar_domain(domain));
5384 }
5385
5386 /*
5387  * Check whether a @domain could be attached to the @dev through the
5388  * aux-domain attach/detach APIs.
5389  */
5390 static inline bool
5391 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5392 {
5393         struct device_domain_info *info = dev->archdata.iommu;
5394
5395         return info && info->auxd_enabled &&
5396                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5397 }
5398
5399 static void auxiliary_link_device(struct dmar_domain *domain,
5400                                   struct device *dev)
5401 {
5402         struct device_domain_info *info = dev->archdata.iommu;
5403
5404         assert_spin_locked(&device_domain_lock);
5405         if (WARN_ON(!info))
5406                 return;
5407
5408         domain->auxd_refcnt++;
5409         list_add(&domain->auxd, &info->auxiliary_domains);
5410 }
5411
5412 static void auxiliary_unlink_device(struct dmar_domain *domain,
5413                                     struct device *dev)
5414 {
5415         struct device_domain_info *info = dev->archdata.iommu;
5416
5417         assert_spin_locked(&device_domain_lock);
5418         if (WARN_ON(!info))
5419                 return;
5420
5421         list_del(&domain->auxd);
5422         domain->auxd_refcnt--;
5423
5424         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5425                 ioasid_free(domain->default_pasid);
5426 }
5427
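     /*
      * Attach @domain to @dev as an auxiliary domain: allocate a default
      * PASID for the domain if it does not have one yet and install a
      * first- or second-level PASID table entry for that PASID.
      */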
5428 static int aux_domain_add_dev(struct dmar_domain *domain,
5429                               struct device *dev)
5430 {
5431         int ret;
5432         u8 bus, devfn;
5433         unsigned long flags;
5434         struct intel_iommu *iommu;
5435
5436         iommu = device_to_iommu(dev, &bus, &devfn);
5437         if (!iommu)
5438                 return -ENODEV;
5439
5440         if (domain->default_pasid <= 0) {
5441                 int pasid;
5442
5443                 /* No private data needed for the default pasid */
5444                 pasid = ioasid_alloc(NULL, PASID_MIN,
5445                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5446                                      NULL);
5447                 if (pasid == INVALID_IOASID) {
5448                         pr_err("Can't allocate default pasid\n");
5449                         return -ENODEV;
5450                 }
5451                 domain->default_pasid = pasid;
5452         }
5453
5454         spin_lock_irqsave(&device_domain_lock, flags);
5455         /*
5456          * iommu->lock must be held to attach domain to iommu and set up the
5457          * pasid entry for second level translation.
5458          */
5459         spin_lock(&iommu->lock);
5460         ret = domain_attach_iommu(domain, iommu);
5461         if (ret)
5462                 goto attach_failed;
5463
5464         /* Set up the PASID entry for mediated devices: */
5465         if (domain_use_first_level(domain))
5466                 ret = domain_setup_first_level(iommu, domain, dev,
5467                                                domain->default_pasid);
5468         else
5469                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5470                                                      domain->default_pasid);
5471         if (ret)
5472                 goto table_failed;
5473         spin_unlock(&iommu->lock);
5474
5475         auxiliary_link_device(domain, dev);
5476
5477         spin_unlock_irqrestore(&device_domain_lock, flags);
5478
5479         return 0;
5480
5481 table_failed:
5482         domain_detach_iommu(domain, iommu);
5483 attach_failed:
5484         spin_unlock(&iommu->lock);
5485         spin_unlock_irqrestore(&device_domain_lock, flags);
5486         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5487                 ioasid_free(domain->default_pasid);
5488
5489         return ret;
5490 }
5491
5492 static void aux_domain_remove_dev(struct dmar_domain *domain,
5493                                   struct device *dev)
5494 {
5495         struct device_domain_info *info;
5496         struct intel_iommu *iommu;
5497         unsigned long flags;
5498
5499         if (!is_aux_domain(dev, &domain->domain))
5500                 return;
5501
5502         spin_lock_irqsave(&device_domain_lock, flags);
5503         info = dev->archdata.iommu;
5504         iommu = info->iommu;
5505
5506         auxiliary_unlink_device(domain, dev);
5507
5508         spin_lock(&iommu->lock);
5509         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5510         domain_detach_iommu(domain, iommu);
5511         spin_unlock(&iommu->lock);
5512
5513         spin_unlock_irqrestore(&device_domain_lock, flags);
5514 }
5515
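     /*
      * Make sure the IOMMU serving @dev can address everything already
      * mapped in the domain, and trim the domain's page table depth to
      * the AGAW the IOMMU supports before the device is attached.
      */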
5516 static int prepare_domain_attach_device(struct iommu_domain *domain,
5517                                         struct device *dev)
5518 {
5519         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5520         struct intel_iommu *iommu;
5521         int addr_width;
5522         u8 bus, devfn;
5523
5524         iommu = device_to_iommu(dev, &bus, &devfn);
5525         if (!iommu)
5526                 return -ENODEV;
5527
5528         /* check if this iommu agaw is sufficient for max mapped address */
5529         addr_width = agaw_to_width(iommu->agaw);
5530         if (addr_width > cap_mgaw(iommu->cap))
5531                 addr_width = cap_mgaw(iommu->cap);
5532
5533         if (dmar_domain->max_addr > (1LL << addr_width)) {
5534                 dev_err(dev, "%s: iommu width (%d) is not "
5535                         "sufficient for the mapped address (%llx)\n",
5536                         __func__, addr_width, dmar_domain->max_addr);
5537                 return -EFAULT;
5538         }
5539         dmar_domain->gaw = addr_width;
5540
5541         /*
5542          * Knock out extra levels of page tables if necessary
5543          */
5544         while (iommu->agaw < dmar_domain->agaw) {
5545                 struct dma_pte *pte;
5546
5547                 pte = dmar_domain->pgd;
5548                 if (dma_pte_present(pte)) {
5549                         dmar_domain->pgd = (struct dma_pte *)
5550                                 phys_to_virt(dma_pte_addr(pte));
5551                         free_pgtable_page(pte);
5552                 }
5553                 dmar_domain->agaw--;
5554         }
5555
5556         return 0;
5557 }
5558
5559 static int intel_iommu_attach_device(struct iommu_domain *domain,
5560                                      struct device *dev)
5561 {
5562         int ret;
5563
5564         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5565             device_is_rmrr_locked(dev)) {
5566                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5567                 return -EPERM;
5568         }
5569
5570         if (is_aux_domain(dev, domain))
5571                 return -EPERM;
5572
5573         /* normally dev is not mapped */
5574         if (unlikely(domain_context_mapped(dev))) {
5575                 struct dmar_domain *old_domain;
5576
5577                 old_domain = find_domain(dev);
5578                 if (old_domain)
5579                         dmar_remove_one_dev_info(dev);
5580         }
5581
5582         ret = prepare_domain_attach_device(domain, dev);
5583         if (ret)
5584                 return ret;
5585
5586         return domain_add_dev_info(to_dmar_domain(domain), dev);
5587 }
5588
5589 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5590                                          struct device *dev)
5591 {
5592         int ret;
5593
5594         if (!is_aux_domain(dev, domain))
5595                 return -EPERM;
5596
5597         ret = prepare_domain_attach_device(domain, dev);
5598         if (ret)
5599                 return ret;
5600
5601         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5602 }
5603
5604 static void intel_iommu_detach_device(struct iommu_domain *domain,
5605                                       struct device *dev)
5606 {
5607         dmar_remove_one_dev_info(dev);
5608 }
5609
5610 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5611                                           struct device *dev)
5612 {
5613         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5614 }
5615
5616 static int intel_iommu_map(struct iommu_domain *domain,
5617                            unsigned long iova, phys_addr_t hpa,
5618                            size_t size, int iommu_prot, gfp_t gfp)
5619 {
5620         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5621         u64 max_addr;
5622         int prot = 0;
5623         int ret;
5624
5625         if (iommu_prot & IOMMU_READ)
5626                 prot |= DMA_PTE_READ;
5627         if (iommu_prot & IOMMU_WRITE)
5628                 prot |= DMA_PTE_WRITE;
5629         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5630                 prot |= DMA_PTE_SNP;
5631
5632         max_addr = iova + size;
5633         if (dmar_domain->max_addr < max_addr) {
5634                 u64 end;
5635
5636                 /* check if minimum agaw is sufficient for mapped address */
5637                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5638                 if (end < max_addr) {
5639                         pr_err("%s: iommu width (%d) is not "
5640                                "sufficient for the mapped address (%llx)\n",
5641                                __func__, dmar_domain->gaw, max_addr);
5642                         return -EFAULT;
5643                 }
5644                 dmar_domain->max_addr = max_addr;
5645         }
5646         /* Round up size to next multiple of PAGE_SIZE, if it and
5647            the low bits of hpa would take us onto the next page */
5648         size = aligned_nrpages(hpa, size);
5649         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5650                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5651         return ret;
5652 }
5653
5654 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5655                                 unsigned long iova, size_t size,
5656                                 struct iommu_iotlb_gather *gather)
5657 {
5658         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5659         struct page *freelist = NULL;
5660         unsigned long start_pfn, last_pfn;
5661         unsigned int npages;
5662         int iommu_id, level = 0;
5663
5664         /* Cope with horrid API which requires us to unmap more than the
5665            size argument if it happens to be a large-page mapping. */
5666         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5667
5668         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5669                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5670
5671         start_pfn = iova >> VTD_PAGE_SHIFT;
5672         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5673
5674         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5675
5676         npages = last_pfn - start_pfn + 1;
5677
5678         for_each_domain_iommu(iommu_id, dmar_domain)
5679                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5680                                       start_pfn, npages, !freelist, 0);
5681
5682         dma_free_pagelist(freelist);
5683
5684         if (dmar_domain->max_addr == iova + size)
5685                 dmar_domain->max_addr = iova;
5686
5687         return size;
5688 }
5689
5690 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5691                                             dma_addr_t iova)
5692 {
5693         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5694         struct dma_pte *pte;
5695         int level = 0;
5696         u64 phys = 0;
5697
5698         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5699         if (pte)
5700                 phys = dma_pte_addr(pte);
5701
5702         return phys;
5703 }
5704
5705 static inline bool scalable_mode_support(void)
5706 {
5707         struct dmar_drhd_unit *drhd;
5708         struct intel_iommu *iommu;
5709         bool ret = true;
5710
5711         rcu_read_lock();
5712         for_each_active_iommu(iommu, drhd) {
5713                 if (!sm_supported(iommu)) {
5714                         ret = false;
5715                         break;
5716                 }
5717         }
5718         rcu_read_unlock();
5719
5720         return ret;
5721 }
5722
5723 static inline bool iommu_pasid_support(void)
5724 {
5725         struct dmar_drhd_unit *drhd;
5726         struct intel_iommu *iommu;
5727         bool ret = true;
5728
5729         rcu_read_lock();
5730         for_each_active_iommu(iommu, drhd) {
5731                 if (!pasid_supported(iommu)) {
5732                         ret = false;
5733                         break;
5734                 }
5735         }
5736         rcu_read_unlock();
5737
5738         return ret;
5739 }
5740
5741 static inline bool nested_mode_support(void)
5742 {
5743         struct dmar_drhd_unit *drhd;
5744         struct intel_iommu *iommu;
5745         bool ret = true;
5746
5747         rcu_read_lock();
5748         for_each_active_iommu(iommu, drhd) {
5749                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5750                         ret = false;
5751                         break;
5752                 }
5753         }
5754         rcu_read_unlock();
5755
5756         return ret;
5757 }
5758
5759 static bool intel_iommu_capable(enum iommu_cap cap)
5760 {
5761         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5762                 return domain_update_iommu_snooping(NULL) == 1;
5763         if (cap == IOMMU_CAP_INTR_REMAP)
5764                 return irq_remapping_enabled == 1;
5765
5766         return false;
5767 }
5768
5769 static int intel_iommu_add_device(struct device *dev)
5770 {
5771         struct dmar_domain *dmar_domain;
5772         struct iommu_domain *domain;
5773         struct intel_iommu *iommu;
5774         struct iommu_group *group;
5775         u8 bus, devfn;
5776         int ret;
5777
5778         iommu = device_to_iommu(dev, &bus, &devfn);
5779         if (!iommu)
5780                 return -ENODEV;
5781
5782         iommu_device_link(&iommu->iommu, dev);
5783
5784         if (translation_pre_enabled(iommu))
5785                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5786
5787         group = iommu_group_get_for_dev(dev);
5788
5789         if (IS_ERR(group)) {
5790                 ret = PTR_ERR(group);
5791                 goto unlink;
5792         }
5793
5794         iommu_group_put(group);
5795
5796         domain = iommu_get_domain_for_dev(dev);
5797         dmar_domain = to_dmar_domain(domain);
5798         if (domain->type == IOMMU_DOMAIN_DMA) {
5799                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5800                         ret = iommu_request_dm_for_dev(dev);
5801                         if (ret) {
5802                                 dmar_remove_one_dev_info(dev);
5803                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5804                                 domain_add_dev_info(si_domain, dev);
5805                                 dev_info(dev,
5806                                          "Device uses a private identity domain.\n");
5807                         }
5808                 }
5809         } else {
5810                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5811                         ret = iommu_request_dma_domain_for_dev(dev);
5812                         if (ret) {
5813                                 dmar_remove_one_dev_info(dev);
5814                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5815                                 if (!get_private_domain_for_dev(dev)) {
5816                                         dev_warn(dev,
5817                                                  "Failed to get a private domain.\n");
5818                                         ret = -ENOMEM;
5819                                         goto unlink;
5820                                 }
5821
5822                                 dev_info(dev,
5823                                          "Device uses a private dma domain.\n");
5824                         }
5825                 }
5826         }
5827
5828         if (device_needs_bounce(dev)) {
5829                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5830                 set_dma_ops(dev, &bounce_dma_ops);
5831         }
5832
5833         return 0;
5834
5835 unlink:
5836         iommu_device_unlink(&iommu->iommu, dev);
5837         return ret;
5838 }
5839
5840 static void intel_iommu_remove_device(struct device *dev)
5841 {
5842         struct intel_iommu *iommu;
5843         u8 bus, devfn;
5844
5845         iommu = device_to_iommu(dev, &bus, &devfn);
5846         if (!iommu)
5847                 return;
5848
5849         dmar_remove_one_dev_info(dev);
5850
5851         iommu_group_remove_device(dev);
5852
5853         iommu_device_unlink(&iommu->iommu, dev);
5854
5855         if (device_needs_bounce(dev))
5856                 set_dma_ops(dev, NULL);
5857 }
5858
5859 static void intel_iommu_get_resv_regions(struct device *device,
5860                                          struct list_head *head)
5861 {
5862         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5863         struct iommu_resv_region *reg;
5864         struct dmar_rmrr_unit *rmrr;
5865         struct device *i_dev;
5866         int i;
5867
5868         down_read(&dmar_global_lock);
5869         for_each_rmrr_units(rmrr) {
5870                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5871                                           i, i_dev) {
5872                         struct iommu_resv_region *resv;
5873                         enum iommu_resv_type type;
5874                         size_t length;
5875
5876                         if (i_dev != device &&
5877                             !is_downstream_to_pci_bridge(device, i_dev))
5878                                 continue;
5879
5880                         length = rmrr->end_address - rmrr->base_address + 1;
5881
5882                         type = device_rmrr_is_relaxable(device) ?
5883                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5884
5885                         resv = iommu_alloc_resv_region(rmrr->base_address,
5886                                                        length, prot, type);
5887                         if (!resv)
5888                                 break;
5889
5890                         list_add_tail(&resv->list, head);
5891                 }
5892         }
5893         up_read(&dmar_global_lock);
5894
5895 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5896         if (dev_is_pci(device)) {
5897                 struct pci_dev *pdev = to_pci_dev(device);
5898
5899                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5900                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5901                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5902                         if (reg)
5903                                 list_add_tail(&reg->list, head);
5904                 }
5905         }
5906 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5907
5908         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5909                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5910                                       0, IOMMU_RESV_MSI);
5911         if (!reg)
5912                 return;
5913         list_add_tail(&reg->list, head);
5914 }
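
/*
 * Editor's illustrative sketch, not part of the driver: a consumer of the
 * regions built by intel_iommu_get_resv_regions() above goes through the
 * generic IOMMU API, which fills a caller-provided list and later releases
 * it via the driver's .put_resv_regions (generic_iommu_put_resv_regions()
 * here).  "dump_resv_regions" is a hypothetical helper name used only for
 * this example.
 */
static void __maybe_unused dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(resv_regions);

        iommu_get_resv_regions(dev, &resv_regions);
        list_for_each_entry(region, &resv_regions, list)
                dev_info(dev, "reserved region 0x%llx-0x%llx (type %d)\n",
                         (unsigned long long)region->start,
                         (unsigned long long)(region->start + region->length - 1),
                         region->type);
        iommu_put_resv_regions(dev, &resv_regions);
}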
5915
5916 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5917 {
5918         struct device_domain_info *info;
5919         struct context_entry *context;
5920         struct dmar_domain *domain;
5921         unsigned long flags;
5922         u64 ctx_lo;
5923         int ret;
5924
5925         domain = find_domain(dev);
5926         if (!domain)
5927                 return -EINVAL;
5928
5929         spin_lock_irqsave(&device_domain_lock, flags);
5930         spin_lock(&iommu->lock);
5931
5932         ret = -EINVAL;
5933         info = dev->archdata.iommu;
5934         if (!info || !info->pasid_supported)
5935                 goto out;
5936
5937         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5938         if (WARN_ON(!context))
5939                 goto out;
5940
5941         ctx_lo = context[0].lo;
5942
5943         if (!(ctx_lo & CONTEXT_PASIDE)) {
5944                 ctx_lo |= CONTEXT_PASIDE;
5945                 context[0].lo = ctx_lo;
5946                 wmb();
5947                 iommu->flush.flush_context(iommu,
5948                                            domain->iommu_did[iommu->seq_id],
5949                                            PCI_DEVID(info->bus, info->devfn),
5950                                            DMA_CCMD_MASK_NOBIT,
5951                                            DMA_CCMD_DEVICE_INVL);
5952         }
5953
5954         /* Enable PASID support in the device, if it wasn't already */
5955         if (!info->pasid_enabled)
5956                 iommu_enable_dev_iotlb(info);
5957
5958         ret = 0;
5959
5960  out:
5961         spin_unlock(&iommu->lock);
5962         spin_unlock_irqrestore(&device_domain_lock, flags);
5963
5964         return ret;
5965 }
5966
5967 static void intel_iommu_apply_resv_region(struct device *dev,
5968                                           struct iommu_domain *domain,
5969                                           struct iommu_resv_region *region)
5970 {
5971         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5972         unsigned long start, end;
5973
5974         start = IOVA_PFN(region->start);
5975         end   = IOVA_PFN(region->start + region->length - 1);
5976
5977         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5978 }
5979
5980 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5981 {
5982         if (dev_is_pci(dev))
5983                 return pci_device_group(dev);
5984         return generic_device_group(dev);
5985 }
5986
5987 #ifdef CONFIG_INTEL_IOMMU_SVM
5988 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5989 {
5990         struct intel_iommu *iommu;
5991         u8 bus, devfn;
5992
5993         if (iommu_dummy(dev)) {
5994                 dev_warn(dev,
5995                          "No IOMMU translation for device; cannot enable SVM\n");
5996                 return NULL;
5997         }
5998
5999         iommu = device_to_iommu(dev, &bus, &devfn);
6000         if (!iommu) {
6001                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
6002                 return NULL;
6003         }
6004
6005         return iommu;
6006 }
6007 #endif /* CONFIG_INTEL_IOMMU_SVM */
6008
6009 static int intel_iommu_enable_auxd(struct device *dev)
6010 {
6011         struct device_domain_info *info;
6012         struct intel_iommu *iommu;
6013         unsigned long flags;
6014         u8 bus, devfn;
6015         int ret;
6016
6017         iommu = device_to_iommu(dev, &bus, &devfn);
6018         if (!iommu || dmar_disabled)
6019                 return -EINVAL;
6020
6021         if (!sm_supported(iommu) || !pasid_supported(iommu))
6022                 return -EINVAL;
6023
6024         ret = intel_iommu_enable_pasid(iommu, dev);
6025         if (ret)
6026                 return -ENODEV;
6027
6028         spin_lock_irqsave(&device_domain_lock, flags);
6029         info = dev->archdata.iommu;
6030         info->auxd_enabled = 1;
6031         spin_unlock_irqrestore(&device_domain_lock, flags);
6032
6033         return 0;
6034 }
6035
6036 static int intel_iommu_disable_auxd(struct device *dev)
6037 {
6038         struct device_domain_info *info;
6039         unsigned long flags;
6040
6041         spin_lock_irqsave(&device_domain_lock, flags);
6042         info = dev->archdata.iommu;
6043         if (!WARN_ON(!info))
6044                 info->auxd_enabled = 0;
6045         spin_unlock_irqrestore(&device_domain_lock, flags);
6046
6047         return 0;
6048 }
6049
6050 /*
6051  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6052  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6053  * spec so that system software and tools can detect endpoint devices that
6054  * support Intel Scalable I/O Virtualization without a host driver
6055  * dependency.
6056  *
6057  * Returns the config space offset of the matching extended capability
6058  * structure, or 0 if the device does not support it.
6059  */
6060 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6061 {
6062         int pos;
6063         u16 vendor, id;
6064
6065         pos = pci_find_next_ext_capability(pdev, 0, 0x23); /* 0x23: DVSEC */
6066         while (pos) {
6067                 pci_read_config_word(pdev, pos + 4, &vendor); /* DVSEC header 1 */
6068                 pci_read_config_word(pdev, pos + 8, &id);     /* DVSEC header 2 */
6069                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5) /* 5: Scalable IOV */
6070                         return pos;
6071
6072                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6073         }
6074
6075         return 0;
6076 }
6077
6078 static bool
6079 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6080 {
6081         if (feat == IOMMU_DEV_FEAT_AUX) {
6082                 int ret;
6083
6084                 if (!dev_is_pci(dev) || dmar_disabled ||
6085                     !scalable_mode_support() || !iommu_pasid_support())
6086                         return false;
6087
6088                 ret = pci_pasid_features(to_pci_dev(dev));
6089                 if (ret < 0)
6090                         return false;
6091
6092                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6093         }
6094
6095         return false;
6096 }
6097
6098 static int
6099 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6100 {
6101         if (feat == IOMMU_DEV_FEAT_AUX)
6102                 return intel_iommu_enable_auxd(dev);
6103
6104         return -ENODEV;
6105 }
6106
6107 static int
6108 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6109 {
6110         if (feat == IOMMU_DEV_FEAT_AUX)
6111                 return intel_iommu_disable_auxd(dev);
6112
6113         return -ENODEV;
6114 }
6115
6116 static bool
6117 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6118 {
6119         struct device_domain_info *info = dev->archdata.iommu;
6120
6121         if (feat == IOMMU_DEV_FEAT_AUX)
6122                 return scalable_mode_support() && info && info->auxd_enabled;
6123
6124         return false;
6125 }
6126
6127 static int
6128 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6129 {
6130         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6131
6132         return dmar_domain->default_pasid > 0 ?
6133                         dmar_domain->default_pasid : -EINVAL;
6134 }
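
/*
 * Editor's illustrative sketch, not part of the driver: how a device driver
 * (e.g. one creating mediated devices) might consume the auxiliary-domain
 * feature and PASID plumbing above through the generic IOMMU API of this
 * kernel generation.  "my_enable_aux_domain" is a hypothetical helper name
 * used only for this example.
 */
static int __maybe_unused my_enable_aux_domain(struct device *dev)
{
        struct iommu_domain *domain;
        int pasid, ret;

        if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
                return -ENODEV;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
        if (ret)
                return ret;

        /* Each auxiliary address space is backed by an unmanaged domain. */
        domain = iommu_domain_alloc(dev->bus);
        if (!domain) {
                ret = -ENOMEM;
                goto err_disable;
        }

        ret = iommu_aux_attach_device(domain, dev);
        if (ret)
                goto err_free;

        /* The domain's default PASID tags this device's DMA for it. */
        pasid = iommu_aux_get_pasid(domain, dev);
        if (pasid < 0) {
                ret = pasid;
                iommu_aux_detach_device(domain, dev);
                goto err_free;
        }

        /* Program "pasid" into the device-specific context here. */
        return 0;

err_free:
        iommu_domain_free(domain);
err_disable:
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
        return ret;
}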
6135
6136 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6137                                            struct device *dev)
6138 {
6139         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
6140 }
6141
6142 static int
6143 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6144                             enum iommu_attr attr, void *data)
6145 {
6146         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6147         unsigned long flags;
6148         int ret = 0;
6149
6150         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6151                 return -EINVAL;
6152
6153         switch (attr) {
6154         case DOMAIN_ATTR_NESTING:
6155                 spin_lock_irqsave(&device_domain_lock, flags);
6156                 if (nested_mode_support() &&
6157                     list_empty(&dmar_domain->devices)) {
6158                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6159                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6160                 } else {
6161                         ret = -ENODEV;
6162                 }
6163                 spin_unlock_irqrestore(&device_domain_lock, flags);
6164                 break;
6165         default:
6166                 ret = -EINVAL;
6167                 break;
6168         }
6169
6170         return ret;
6171 }
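
/*
 * Editor's illustrative sketch, not part of the driver: DOMAIN_ATTR_NESTING
 * has to be set on an unmanaged domain *before* any device is attached,
 * since the handler above rejects it once dmar_domain->devices is non-empty.
 * "my_alloc_nested_domain" is a hypothetical helper name used only for this
 * example.
 */
static struct iommu_domain *__maybe_unused
my_alloc_nested_domain(struct device *dev)
{
        struct iommu_domain *domain;
        int nesting = 1;

        domain = iommu_domain_alloc(dev->bus);
        if (!domain)
                return NULL;

        /* Request nesting mode while the domain is still empty. */
        if (iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &nesting)) {
                iommu_domain_free(domain);
                return NULL;
        }

        if (iommu_attach_device(domain, dev)) {
                iommu_domain_free(domain);
                return NULL;
        }

        return domain;
}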
6172
6173 const struct iommu_ops intel_iommu_ops = {
6174         .capable                = intel_iommu_capable,
6175         .domain_alloc           = intel_iommu_domain_alloc,
6176         .domain_free            = intel_iommu_domain_free,
6177         .domain_set_attr        = intel_iommu_domain_set_attr,
6178         .attach_dev             = intel_iommu_attach_device,
6179         .detach_dev             = intel_iommu_detach_device,
6180         .aux_attach_dev         = intel_iommu_aux_attach_device,
6181         .aux_detach_dev         = intel_iommu_aux_detach_device,
6182         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6183         .map                    = intel_iommu_map,
6184         .unmap                  = intel_iommu_unmap,
6185         .iova_to_phys           = intel_iommu_iova_to_phys,
6186         .add_device             = intel_iommu_add_device,
6187         .remove_device          = intel_iommu_remove_device,
6188         .get_resv_regions       = intel_iommu_get_resv_regions,
6189         .put_resv_regions       = generic_iommu_put_resv_regions,
6190         .apply_resv_region      = intel_iommu_apply_resv_region,
6191         .device_group           = intel_iommu_device_group,
6192         .dev_has_feat           = intel_iommu_dev_has_feat,
6193         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6194         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6195         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6196         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6197         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6198 };
6199
6200 static void quirk_iommu_igfx(struct pci_dev *dev)
6201 {
6202         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6203         dmar_map_gfx = 0;
6204 }
6205
6206 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6208 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6209 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6210 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6211 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6212 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6214
6215 /* Broadwell igfx malfunctions with dmar */
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6218 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6237 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6238 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6239 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6240
6241 static void quirk_iommu_rwbf(struct pci_dev *dev)
6242 {
6243         /*
6244          * Mobile 4 Series Chipset neglects to set RWBF capability,
6245          * but needs it. Same seems to hold for the desktop versions.
6246          */
6247         pci_info(dev, "Forcing write-buffer flush capability\n");
6248         rwbf_quirk = 1;
6249 }
6250
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6255 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6256 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6257 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6258
6259 #define GGC 0x52
6260 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6261 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6262 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6263 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6264 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6265 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6266 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6267 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
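
/*
 * Editor's illustrative sketch, not part of the driver: decoding the GGC
 * size field with the masks above.  The quirk below only consults
 * GGC_MEMORY_VT_ENABLED; "ggc_vt_gtt_size" is a hypothetical helper name
 * used only for this example.
 */
static unsigned int __maybe_unused ggc_vt_gtt_size(unsigned short ggc)
{
        switch (ggc & GGC_MEMORY_SIZE_MASK) {
        case GGC_MEMORY_SIZE_2M_VT:
                return 2;
        case GGC_MEMORY_SIZE_3M_VT:
                return 3;
        case GGC_MEMORY_SIZE_4M_VT:
                return 4;
        default:
                return 0;       /* no VT-enabled GTT space allocated */
        }
}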
6268
6269 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6270 {
6271         unsigned short ggc;
6272
6273         if (pci_read_config_word(dev, GGC, &ggc))
6274                 return;
6275
6276         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6277                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6278                 dmar_map_gfx = 0;
6279         } else if (dmar_map_gfx) {
6280                 /* we have to ensure the gfx device is idle before we flush */
6281                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6282                 intel_iommu_strict = 1;
6283         }
6284 }
6285 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6286 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6287 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6288 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6289
6290 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6291    ISOCH DMAR unit for the Azalia sound device, but not give it any
6292    TLB entries, which causes it to deadlock. Check for that.  We do
6293    this in a function called from init_dmars(), instead of in a PCI
6294    quirk, because we don't want to print the obnoxious "BIOS broken"
6295    message if VT-d is actually disabled.
6296 */
6297 static void __init check_tylersburg_isoch(void)
6298 {
6299         struct pci_dev *pdev;
6300         uint32_t vtisochctrl;
6301
6302         /* If there's no Azalia in the system anyway, forget it. */
6303         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6304         if (!pdev)
6305                 return;
6306         pci_dev_put(pdev);
6307
6308         /* System Management Registers. Might be hidden, in which case
6309            we can't do the sanity check. But that's OK, because the
6310            known-broken BIOSes _don't_ actually hide it, so far. */
6311         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6312         if (!pdev)
6313                 return;
6314
6315         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6316                 pci_dev_put(pdev);
6317                 return;
6318         }
6319
6320         pci_dev_put(pdev);
6321
6322         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6323         if (vtisochctrl & 1)
6324                 return;
6325
6326         /* Drop all bits other than the number of TLB entries */
6327         vtisochctrl &= 0x1c;
6328
6329         /* If we have the recommended number of TLB entries (16), fine. */
6330         if (vtisochctrl == 0x10)
6331                 return;
6332
6333         /* Zero TLB entries? You get to ride the short bus to school. */
6334         if (!vtisochctrl) {
6335                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6336                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6337                      dmi_get_system_info(DMI_BIOS_VENDOR),
6338                      dmi_get_system_info(DMI_BIOS_VERSION),
6339                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6340                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6341                 return;
6342         }
6343
6344         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
6345                vtisochctrl);
6346 }