1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size is a power-of-two multiple of 4KiB and that the
96  * mapping has natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
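
/*
 * Illustrative note (assuming VTD_PAGE_SHIFT == 12): ~0xFFFUL sets every
 * bit from bit 12 upward, so 4KiB, 8KiB, 16KiB, ... are all advertised
 * as supported and the IOMMU core may hand us any naturally aligned
 * power-of-two multiple of 4KiB, which we then break into 4KiB (or
 * superpage) PTEs ourselves.
 */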
105
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
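
/*
 * Worked example for the AGAW helpers above (9-bit stride, 4KiB pages):
 *
 *	width_to_agaw(39) == 1  ->  agaw_to_level(1) == 3 (3-level table)
 *	width_to_agaw(48) == 2  ->  agaw_to_level(2) == 4 (4-level table)
 *	width_to_agaw(57) == 3  ->  agaw_to_level(3) == 5 (5-level table)
 *
 * Each AGAW step adds one LEVEL_STRIDE (9 bits) of address width on top
 * of the 30-bit base.
 */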
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
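
/*
 * Worked example for the level helpers above: level_size(1) == 1 PFN
 * (4KiB), level_size(2) == 512 PFNs (2MiB) and level_size(3) == 512 * 512
 * PFNs (1GiB), matching the 2MiB/1GiB superpage sizes the capability
 * register can advertise.
 */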
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
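
/*
 * Note: on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT are 12, so the two
 * conversions above are no-ops there; the shifts only matter when the MM
 * page size is larger than the 4KiB VT-d page size.
 */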
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * set to 1 to panic kernel if can't successfully enable VT-d
179  * (used when kernel is launched w/ TXT)
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
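
/*
 * Field layout implied by the accessors above (legacy-mode context entry):
 *
 *	lo: bit 0	present
 *	    bit 1	fault processing disable (cleared by
 *			context_set_fault_enable())
 *	    bits 2-3	translation type
 *	    bit 11	PASID enable
 *	    bits 12-63	second-level page table address
 *	hi: bits 0-2	address width (AGAW)
 *	    bit 3	software "copied" marker (tables inherited from a
 *			previous kernel)
 *	    bits 8-23	domain id
 */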
289
290 /*
291  * This domain is a static identity mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 /*
311  * When VT-d works in the scalable mode, it allows DMA translation to
312  * happen through either first level or second level page table. This
313  * bit marks that the DMA translation for the domain goes through the
314  * first level page table, otherwise, it goes through the second level.
315  */
316 #define DOMAIN_FLAG_USE_FIRST_LEVEL             BIT(2)
317
318 /*
319  * Domain represents a virtual machine which demands iommu nested
320  * translation mode support.
321  */
322 #define DOMAIN_FLAG_NESTING_MODE                BIT(3)
323
324 #define for_each_domain_iommu(idx, domain)                      \
325         for (idx = 0; idx < g_num_of_iommus; idx++)             \
326                 if (domain->iommu_refcnt[idx])
327
328 struct dmar_rmrr_unit {
329         struct list_head list;          /* list of rmrr units   */
330         struct acpi_dmar_header *hdr;   /* ACPI header          */
331         u64     base_address;           /* reserved base address*/
332         u64     end_address;            /* reserved end address */
333         struct dmar_dev_scope *devices; /* target devices */
334         int     devices_cnt;            /* target device count */
335 };
336
337 struct dmar_atsr_unit {
338         struct list_head list;          /* list of ATSR units */
339         struct acpi_dmar_header *hdr;   /* ACPI header */
340         struct dmar_dev_scope *devices; /* target devices */
341         int devices_cnt;                /* target device count */
342         u8 include_all:1;               /* include all ports */
343 };
344
345 static LIST_HEAD(dmar_atsr_units);
346 static LIST_HEAD(dmar_rmrr_units);
347
348 #define for_each_rmrr_units(rmrr) \
349         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
350
351 /* number of IOMMUs, used to size and index g_iommus */
352 static int g_num_of_iommus;
353
354 static void domain_exit(struct dmar_domain *domain);
355 static void domain_remove_dev_info(struct dmar_domain *domain);
356 static void dmar_remove_one_dev_info(struct device *dev);
357 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
358 static void domain_context_clear(struct intel_iommu *iommu,
359                                  struct device *dev);
360 static int domain_detach_iommu(struct dmar_domain *domain,
361                                struct intel_iommu *iommu);
362 static bool device_is_rmrr_locked(struct device *dev);
363 static int intel_iommu_attach_device(struct iommu_domain *domain,
364                                      struct device *dev);
365 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
366                                             dma_addr_t iova);
367
368 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
369 int dmar_disabled = 0;
370 #else
371 int dmar_disabled = 1;
372 #endif /* CONFIG_INTEL_IOMMU_DEFAULT_ON */
373
374 #ifdef CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON
375 int intel_iommu_sm = 1;
376 #else
377 int intel_iommu_sm;
378 #endif /* CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON */
379
380 int intel_iommu_enabled = 0;
381 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
382
383 static int dmar_map_gfx = 1;
384 static int dmar_forcedac;
385 static int intel_iommu_strict;
386 static int intel_iommu_superpage = 1;
387 static int iommu_identity_mapping;
388 static int intel_no_bounce;
389
390 #define IDENTMAP_GFX            2
391 #define IDENTMAP_AZALIA         4
392
393 int intel_iommu_gfx_mapped;
394 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
395
396 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
397 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
398 DEFINE_SPINLOCK(device_domain_lock);
399 static LIST_HEAD(device_domain_list);
400
401 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
402                                 to_pci_dev(d)->untrusted)
403
404 /*
405  * Iterate over elements in device_domain_list and call the specified
406  * callback @fn against each element.
407  */
408 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
409                                      void *data), void *data)
410 {
411         int ret = 0;
412         unsigned long flags;
413         struct device_domain_info *info;
414
415         spin_lock_irqsave(&device_domain_lock, flags);
416         list_for_each_entry(info, &device_domain_list, global) {
417                 ret = fn(info, data);
418                 if (ret) {
419                         spin_unlock_irqrestore(&device_domain_lock, flags);
420                         return ret;
421                 }
422         }
423         spin_unlock_irqrestore(&device_domain_lock, flags);
424
425         return 0;
426 }
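
/*
 * Minimal usage sketch (the callback below is hypothetical, not part of
 * this file); a non-zero return from the callback stops the walk and is
 * propagated back to the caller:
 *
 *	static int count_infos(struct device_domain_info *info, void *data)
 *	{
 *		(*(int *)data)++;
 *		return 0;
 *	}
 *
 *	int n = 0;
 *	for_each_device_domain(count_infos, &n);
 */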
427
428 const struct iommu_ops intel_iommu_ops;
429
430 static bool translation_pre_enabled(struct intel_iommu *iommu)
431 {
432         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
433 }
434
435 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
436 {
437         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
438 }
439
440 static void init_translation_status(struct intel_iommu *iommu)
441 {
442         u32 gsts;
443
444         gsts = readl(iommu->reg + DMAR_GSTS_REG);
445         if (gsts & DMA_GSTS_TES)
446                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
447 }
448
449 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
450 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
451 {
452         return container_of(dom, struct dmar_domain, domain);
453 }
454
455 static int __init intel_iommu_setup(char *str)
456 {
457         if (!str)
458                 return -EINVAL;
459         while (*str) {
460                 if (!strncmp(str, "on", 2)) {
461                         dmar_disabled = 0;
462                         pr_info("IOMMU enabled\n");
463                 } else if (!strncmp(str, "off", 3)) {
464                         dmar_disabled = 1;
465                         no_platform_optin = 1;
466                         pr_info("IOMMU disabled\n");
467                 } else if (!strncmp(str, "igfx_off", 8)) {
468                         dmar_map_gfx = 0;
469                         pr_info("Disable GFX device mapping\n");
470                 } else if (!strncmp(str, "forcedac", 8)) {
471                         pr_info("Forcing DAC for PCI devices\n");
472                         dmar_forcedac = 1;
473                 } else if (!strncmp(str, "strict", 6)) {
474                         pr_info("Disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         pr_info("Disable supported super page\n");
478                         intel_iommu_superpage = 0;
479                 } else if (!strncmp(str, "sm_on", 5)) {
480                         pr_info("Intel-IOMMU: scalable mode supported\n");
481                         intel_iommu_sm = 1;
482                 } else if (!strncmp(str, "tboot_noforce", 13)) {
483                         printk(KERN_INFO
484                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
485                         intel_iommu_tboot_noforce = 1;
486                 } else if (!strncmp(str, "nobounce", 8)) {
487                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
488                         intel_no_bounce = 1;
489                 }
490
491                 str += strcspn(str, ",");
492                 while (*str == ',')
493                         str++;
494         }
495         return 0;
496 }
497 __setup("intel_iommu=", intel_iommu_setup);
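
/*
 * Example (boot command line, options are comma separated):
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables DMA remapping, turns on scalable mode and disables batched
 * IOTLB flushing, all handled by the parsing loop above.
 */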
498
499 static struct kmem_cache *iommu_domain_cache;
500 static struct kmem_cache *iommu_devinfo_cache;
501
502 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
503 {
504         struct dmar_domain **domains;
505         int idx = did >> 8;
506
507         domains = iommu->domains[idx];
508         if (!domains)
509                 return NULL;
510
511         return domains[did & 0xff];
512 }
513
514 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
515                              struct dmar_domain *domain)
516 {
517         struct dmar_domain **domains;
518         int idx = did >> 8;
519
520         if (!iommu->domains[idx]) {
521                 size_t size = 256 * sizeof(struct dmar_domain *);
522                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
523         }
524
525         domains = iommu->domains[idx];
526         if (WARN_ON(!domains))
527                 return;
528         else
529                 domains[did & 0xff] = domain;
530 }
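
/*
 * The two helpers above implement a small two-level radix: the 16-bit
 * domain id is split into a top-level index (did >> 8) into
 * iommu->domains[] and a slot (did & 0xff) in a lazily allocated array
 * of 256 pointers, so e.g. did 0x0123 lands in iommu->domains[0x01][0x23].
 */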
531
532 void *alloc_pgtable_page(int node)
533 {
534         struct page *page;
535         void *vaddr = NULL;
536
537         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
538         if (page)
539                 vaddr = page_address(page);
540         return vaddr;
541 }
542
543 void free_pgtable_page(void *vaddr)
544 {
545         free_page((unsigned long)vaddr);
546 }
547
548 static inline void *alloc_domain_mem(void)
549 {
550         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
551 }
552
553 static void free_domain_mem(void *vaddr)
554 {
555         kmem_cache_free(iommu_domain_cache, vaddr);
556 }
557
558 static inline void * alloc_devinfo_mem(void)
559 {
560         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
561 }
562
563 static inline void free_devinfo_mem(void *vaddr)
564 {
565         kmem_cache_free(iommu_devinfo_cache, vaddr);
566 }
567
568 static inline int domain_type_is_si(struct dmar_domain *domain)
569 {
570         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
571 }
572
573 static inline bool domain_use_first_level(struct dmar_domain *domain)
574 {
575         return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
576 }
577
578 static inline int domain_pfn_supported(struct dmar_domain *domain,
579                                        unsigned long pfn)
580 {
581         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
582
583         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
584 }
585
586 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
587 {
588         unsigned long sagaw;
589         int agaw = -1;
590
591         sagaw = cap_sagaw(iommu->cap);
592         for (agaw = width_to_agaw(max_gaw);
593              agaw >= 0; agaw--) {
594                 if (test_bit(agaw, &sagaw))
595                         break;
596         }
597
598         return agaw;
599 }
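
/*
 * Example: if cap_sagaw() reports only bit 2 set (4-level, 48-bit in the
 * VT-d SAGAW encoding), __iommu_calculate_agaw(iommu, 57) starts at
 * agaw 3 and walks down until it finds the set bit, returning 2; a
 * return value of -1 means no supported AGAW at or below the requested
 * width.
 */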
600
601 /*
602  * Calculate max SAGAW for each iommu.
603  */
604 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
605 {
606         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
607 }
608
609 /*
610  * Calculate agaw for each iommu.
611  * "SAGAW" may be different across iommus; use a default agaw and fall
612  * back to a smaller supported agaw for iommus that don't support the default.
613  */
614 int iommu_calculate_agaw(struct intel_iommu *iommu)
615 {
616         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
617 }
618
619 /* This function only returns a single iommu in a domain */
620 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
621 {
622         int iommu_id;
623
624         /* si_domain and vm domain should not get here. */
625         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
626                 return NULL;
627
628         for_each_domain_iommu(iommu_id, domain)
629                 break;
630
631         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
632                 return NULL;
633
634         return g_iommus[iommu_id];
635 }
636
637 static void domain_update_iommu_coherency(struct dmar_domain *domain)
638 {
639         struct dmar_drhd_unit *drhd;
640         struct intel_iommu *iommu;
641         bool found = false;
642         int i;
643
644         domain->iommu_coherency = 1;
645
646         for_each_domain_iommu(i, domain) {
647                 found = true;
648                 if (!ecap_coherent(g_iommus[i]->ecap)) {
649                         domain->iommu_coherency = 0;
650                         break;
651                 }
652         }
653         if (found)
654                 return;
655
656         /* No hardware attached; use lowest common denominator */
657         rcu_read_lock();
658         for_each_active_iommu(iommu, drhd) {
659                 if (!ecap_coherent(iommu->ecap)) {
660                         domain->iommu_coherency = 0;
661                         break;
662                 }
663         }
664         rcu_read_unlock();
665 }
666
667 static int domain_update_iommu_snooping(struct intel_iommu *skip)
668 {
669         struct dmar_drhd_unit *drhd;
670         struct intel_iommu *iommu;
671         int ret = 1;
672
673         rcu_read_lock();
674         for_each_active_iommu(iommu, drhd) {
675                 if (iommu != skip) {
676                         if (!ecap_sc_support(iommu->ecap)) {
677                                 ret = 0;
678                                 break;
679                         }
680                 }
681         }
682         rcu_read_unlock();
683
684         return ret;
685 }
686
687 static int domain_update_iommu_superpage(struct dmar_domain *domain,
688                                          struct intel_iommu *skip)
689 {
690         struct dmar_drhd_unit *drhd;
691         struct intel_iommu *iommu;
692         int mask = 0x3;
693
694         if (!intel_iommu_superpage) {
695                 return 0;
696         }
697
698         /* set iommu_superpage to the smallest common denominator */
699         rcu_read_lock();
700         for_each_active_iommu(iommu, drhd) {
701                 if (iommu != skip) {
702                         if (domain && domain_use_first_level(domain)) {
703                                 if (!cap_fl1gp_support(iommu->cap))
704                                         mask = 0x1;
705                         } else {
706                                 mask &= cap_super_page_val(iommu->cap);
707                         }
708
709                         if (!mask)
710                                 break;
711                 }
712         }
713         rcu_read_unlock();
714
715         return fls(mask);
716 }
717
718 /* Some capabilities may be different across iommus */
719 static void domain_update_iommu_cap(struct dmar_domain *domain)
720 {
721         domain_update_iommu_coherency(domain);
722         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
723         domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
724 }
725
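/*
 * Layout note for iommu_context_addr() below: in legacy mode one 4KiB
 * context table holds 256 16-byte entries indexed directly by devfn via
 * root->lo.  In scalable mode each entry is twice as wide, so a table
 * covers only 128 device functions: devfn 0-127 hang off root->lo,
 * devfn 128-255 off root->hi, and the remaining devfn is doubled to
 * index the wider entries (hence the "devfn *= 2").
 */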
726 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
727                                          u8 devfn, int alloc)
728 {
729         struct root_entry *root = &iommu->root_entry[bus];
730         struct context_entry *context;
731         u64 *entry;
732
733         entry = &root->lo;
734         if (sm_supported(iommu)) {
735                 if (devfn >= 0x80) {
736                         devfn -= 0x80;
737                         entry = &root->hi;
738                 }
739                 devfn *= 2;
740         }
741         if (*entry & 1)
742                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
743         else {
744                 unsigned long phy_addr;
745                 if (!alloc)
746                         return NULL;
747
748                 context = alloc_pgtable_page(iommu->node);
749                 if (!context)
750                         return NULL;
751
752                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
753                 phy_addr = virt_to_phys((void *)context);
754                 *entry = phy_addr | 1;
755                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
756         }
757         return &context[devfn];
758 }
759
760 static int iommu_dummy(struct device *dev)
761 {
762         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
763 }
764
765 /**
766  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
767  *                               sub-hierarchy of a candidate PCI-PCI bridge
768  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
769  * @bridge: the candidate PCI-PCI bridge
770  *
771  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
772  */
773 static bool
774 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
775 {
776         struct pci_dev *pdev, *pbridge;
777
778         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
779                 return false;
780
781         pdev = to_pci_dev(dev);
782         pbridge = to_pci_dev(bridge);
783
784         if (pbridge->subordinate &&
785             pbridge->subordinate->number <= pdev->bus->number &&
786             pbridge->subordinate->busn_res.end >= pdev->bus->number)
787                 return true;
788
789         return false;
790 }
791
792 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
793 {
794         struct dmar_drhd_unit *drhd = NULL;
795         struct intel_iommu *iommu;
796         struct device *tmp;
797         struct pci_dev *pdev = NULL;
798         u16 segment = 0;
799         int i;
800
801         if (iommu_dummy(dev))
802                 return NULL;
803
804         if (dev_is_pci(dev)) {
805                 struct pci_dev *pf_pdev;
806
807                 pdev = pci_real_dma_dev(to_pci_dev(dev));
808
809                 /* VFs aren't listed in scope tables; we need to look up
810                  * the PF instead to find the IOMMU. */
811                 pf_pdev = pci_physfn(pdev);
812                 dev = &pf_pdev->dev;
813                 segment = pci_domain_nr(pdev->bus);
814         } else if (has_acpi_companion(dev))
815                 dev = &ACPI_COMPANION(dev)->dev;
816
817         rcu_read_lock();
818         for_each_active_iommu(iommu, drhd) {
819                 if (pdev && segment != drhd->segment)
820                         continue;
821
822                 for_each_active_dev_scope(drhd->devices,
823                                           drhd->devices_cnt, i, tmp) {
824                         if (tmp == dev) {
825                                 /* For a VF use its original BDF# not that of the PF
826                                  * which we used for the IOMMU lookup. Strictly speaking
827                                  * we could do this for all PCI devices; we only need to
828                                  * get the BDF# from the scope table for ACPI matches. */
829                                 if (pdev && pdev->is_virtfn)
830                                         goto got_pdev;
831
832                                 *bus = drhd->devices[i].bus;
833                                 *devfn = drhd->devices[i].devfn;
834                                 goto out;
835                         }
836
837                         if (is_downstream_to_pci_bridge(dev, tmp))
838                                 goto got_pdev;
839                 }
840
841                 if (pdev && drhd->include_all) {
842                 got_pdev:
843                         *bus = pdev->bus->number;
844                         *devfn = pdev->devfn;
845                         goto out;
846                 }
847         }
848         iommu = NULL;
849  out:
850         rcu_read_unlock();
851
852         return iommu;
853 }
854
855 static void domain_flush_cache(struct dmar_domain *domain,
856                                void *addr, int size)
857 {
858         if (!domain->iommu_coherency)
859                 clflush_cache_range(addr, size);
860 }
861
862 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
863 {
864         struct context_entry *context;
865         int ret = 0;
866         unsigned long flags;
867
868         spin_lock_irqsave(&iommu->lock, flags);
869         context = iommu_context_addr(iommu, bus, devfn, 0);
870         if (context)
871                 ret = context_present(context);
872         spin_unlock_irqrestore(&iommu->lock, flags);
873         return ret;
874 }
875
876 static void free_context_table(struct intel_iommu *iommu)
877 {
878         int i;
879         unsigned long flags;
880         struct context_entry *context;
881
882         spin_lock_irqsave(&iommu->lock, flags);
883         if (!iommu->root_entry) {
884                 goto out;
885         }
886         for (i = 0; i < ROOT_ENTRY_NR; i++) {
887                 context = iommu_context_addr(iommu, i, 0, 0);
888                 if (context)
889                         free_pgtable_page(context);
890
891                 if (!sm_supported(iommu))
892                         continue;
893
894                 context = iommu_context_addr(iommu, i, 0x80, 0);
895                 if (context)
896                         free_pgtable_page(context);
897
898         }
899         free_pgtable_page(iommu->root_entry);
900         iommu->root_entry = NULL;
901 out:
902         spin_unlock_irqrestore(&iommu->lock, flags);
903 }
904
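/*
 * Descriptive note for pfn_to_dma_pte() below: *target_level is the
 * level the caller wants a PTE for; 0 means "just find the deepest
 * existing entry, stopping at holes and superpages, without allocating".
 * On return it holds the level actually reached.  When a specific level
 * is requested, missing intermediate tables are allocated on demand and
 * installed with cmpxchg64() so concurrent mappers don't race.
 */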
905 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
906                                       unsigned long pfn, int *target_level)
907 {
908         struct dma_pte *parent, *pte;
909         int level = agaw_to_level(domain->agaw);
910         int offset;
911
912         BUG_ON(!domain->pgd);
913
914         if (!domain_pfn_supported(domain, pfn))
915                 /* Address beyond IOMMU's addressing capabilities. */
916                 return NULL;
917
918         parent = domain->pgd;
919
920         while (1) {
921                 void *tmp_page;
922
923                 offset = pfn_level_offset(pfn, level);
924                 pte = &parent[offset];
925                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
926                         break;
927                 if (level == *target_level)
928                         break;
929
930                 if (!dma_pte_present(pte)) {
931                         uint64_t pteval;
932
933                         tmp_page = alloc_pgtable_page(domain->nid);
934
935                         if (!tmp_page)
936                                 return NULL;
937
938                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
939                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
940                         if (domain_use_first_level(domain))
941                                 pteval |= DMA_FL_PTE_XD;
942                         if (cmpxchg64(&pte->val, 0ULL, pteval))
943                                 /* Someone else set it while we were thinking; use theirs. */
944                                 free_pgtable_page(tmp_page);
945                         else
946                                 domain_flush_cache(domain, pte, sizeof(*pte));
947                 }
948                 if (level == 1)
949                         break;
950
951                 parent = phys_to_virt(dma_pte_addr(pte));
952                 level--;
953         }
954
955         if (!*target_level)
956                 *target_level = level;
957
958         return pte;
959 }
960
961 /* return address's pte at specific level */
962 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
963                                          unsigned long pfn,
964                                          int level, int *large_page)
965 {
966         struct dma_pte *parent, *pte;
967         int total = agaw_to_level(domain->agaw);
968         int offset;
969
970         parent = domain->pgd;
971         while (level <= total) {
972                 offset = pfn_level_offset(pfn, total);
973                 pte = &parent[offset];
974                 if (level == total)
975                         return pte;
976
977                 if (!dma_pte_present(pte)) {
978                         *large_page = total;
979                         break;
980                 }
981
982                 if (dma_pte_superpage(pte)) {
983                         *large_page = total;
984                         return pte;
985                 }
986
987                 parent = phys_to_virt(dma_pte_addr(pte));
988                 total--;
989         }
990         return NULL;
991 }
992
993 /* clear last level pte; a tlb flush should follow */
994 static void dma_pte_clear_range(struct dmar_domain *domain,
995                                 unsigned long start_pfn,
996                                 unsigned long last_pfn)
997 {
998         unsigned int large_page;
999         struct dma_pte *first_pte, *pte;
1000
1001         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1002         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1003         BUG_ON(start_pfn > last_pfn);
1004
1005         /* we don't need lock here; nobody else touches the iova range */
1006         do {
1007                 large_page = 1;
1008                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1009                 if (!pte) {
1010                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1011                         continue;
1012                 }
1013                 do {
1014                         dma_clear_pte(pte);
1015                         start_pfn += lvl_to_nr_pages(large_page);
1016                         pte++;
1017                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1018
1019                 domain_flush_cache(domain, first_pte,
1020                                    (void *)pte - (void *)first_pte);
1021
1022         } while (start_pfn && start_pfn <= last_pfn);
1023 }
1024
1025 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1026                                int retain_level, struct dma_pte *pte,
1027                                unsigned long pfn, unsigned long start_pfn,
1028                                unsigned long last_pfn)
1029 {
1030         pfn = max(start_pfn, pfn);
1031         pte = &pte[pfn_level_offset(pfn, level)];
1032
1033         do {
1034                 unsigned long level_pfn;
1035                 struct dma_pte *level_pte;
1036
1037                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1038                         goto next;
1039
1040                 level_pfn = pfn & level_mask(level);
1041                 level_pte = phys_to_virt(dma_pte_addr(pte));
1042
1043                 if (level > 2) {
1044                         dma_pte_free_level(domain, level - 1, retain_level,
1045                                            level_pte, level_pfn, start_pfn,
1046                                            last_pfn);
1047                 }
1048
1049                 /*
1050                  * Free the page table if we're below the level we want to
1051                  * retain and the range covers the entire table.
1052                  */
1053                 if (level < retain_level && !(start_pfn > level_pfn ||
1054                       last_pfn < level_pfn + level_size(level) - 1)) {
1055                         dma_clear_pte(pte);
1056                         domain_flush_cache(domain, pte, sizeof(*pte));
1057                         free_pgtable_page(level_pte);
1058                 }
1059 next:
1060                 pfn += level_size(level);
1061         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1062 }
1063
1064 /*
1065  * clear last level (leaf) ptes and free page table pages below the
1066  * level we wish to keep intact.
1067  */
1068 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1069                                    unsigned long start_pfn,
1070                                    unsigned long last_pfn,
1071                                    int retain_level)
1072 {
1073         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1074         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1075         BUG_ON(start_pfn > last_pfn);
1076
1077         dma_pte_clear_range(domain, start_pfn, last_pfn);
1078
1079         /* We don't need lock here; nobody else touches the iova range */
1080         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1081                            domain->pgd, 0, start_pfn, last_pfn);
1082
1083         /* free pgd */
1084         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1085                 free_pgtable_page(domain->pgd);
1086                 domain->pgd = NULL;
1087         }
1088 }
1089
1090 /* When a page at a given level is being unlinked from its parent, we don't
1091    need to *modify* it at all. All we need to do is make a list of all the
1092    pages which can be freed just as soon as we've flushed the IOTLB and we
1093    know the hardware page-walk will no longer touch them.
1094    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1095    be freed. */
1096 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1097                                             int level, struct dma_pte *pte,
1098                                             struct page *freelist)
1099 {
1100         struct page *pg;
1101
1102         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1103         pg->freelist = freelist;
1104         freelist = pg;
1105
1106         if (level == 1)
1107                 return freelist;
1108
1109         pte = page_address(pg);
1110         do {
1111                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1112                         freelist = dma_pte_list_pagetables(domain, level - 1,
1113                                                            pte, freelist);
1114                 pte++;
1115         } while (!first_pte_in_page(pte));
1116
1117         return freelist;
1118 }
1119
1120 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1121                                         struct dma_pte *pte, unsigned long pfn,
1122                                         unsigned long start_pfn,
1123                                         unsigned long last_pfn,
1124                                         struct page *freelist)
1125 {
1126         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1127
1128         pfn = max(start_pfn, pfn);
1129         pte = &pte[pfn_level_offset(pfn, level)];
1130
1131         do {
1132                 unsigned long level_pfn;
1133
1134                 if (!dma_pte_present(pte))
1135                         goto next;
1136
1137                 level_pfn = pfn & level_mask(level);
1138
1139                 /* If range covers entire pagetable, free it */
1140                 if (start_pfn <= level_pfn &&
1141                     last_pfn >= level_pfn + level_size(level) - 1) {
1142                         /* These subordinate page tables are going away entirely. Don't
1143                            bother to clear them; we're just going to *free* them. */
1144                         if (level > 1 && !dma_pte_superpage(pte))
1145                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1146
1147                         dma_clear_pte(pte);
1148                         if (!first_pte)
1149                                 first_pte = pte;
1150                         last_pte = pte;
1151                 } else if (level > 1) {
1152                         /* Recurse down into a level that isn't *entirely* obsolete */
1153                         freelist = dma_pte_clear_level(domain, level - 1,
1154                                                        phys_to_virt(dma_pte_addr(pte)),
1155                                                        level_pfn, start_pfn, last_pfn,
1156                                                        freelist);
1157                 }
1158 next:
1159                 pfn += level_size(level);
1160         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1161
1162         if (first_pte)
1163                 domain_flush_cache(domain, first_pte,
1164                                    (void *)++last_pte - (void *)first_pte);
1165
1166         return freelist;
1167 }
1168
1169 /* We can't just free the pages because the IOMMU may still be walking
1170    the page tables, and may have cached the intermediate levels. The
1171    pages can only be freed after the IOTLB flush has been done. */
1172 static struct page *domain_unmap(struct dmar_domain *domain,
1173                                  unsigned long start_pfn,
1174                                  unsigned long last_pfn)
1175 {
1176         struct page *freelist;
1177
1178         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1179         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1180         BUG_ON(start_pfn > last_pfn);
1181
1182         /* we don't need lock here; nobody else touches the iova range */
1183         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1184                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1185
1186         /* free pgd */
1187         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1188                 struct page *pgd_page = virt_to_page(domain->pgd);
1189                 pgd_page->freelist = freelist;
1190                 freelist = pgd_page;
1191
1192                 domain->pgd = NULL;
1193         }
1194
1195         return freelist;
1196 }
1197
1198 static void dma_free_pagelist(struct page *freelist)
1199 {
1200         struct page *pg;
1201
1202         while ((pg = freelist)) {
1203                 freelist = pg->freelist;
1204                 free_pgtable_page(page_address(pg));
1205         }
1206 }
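
/*
 * Typical unmap sequence in this driver: collect the obsolete tables
 * with domain_unmap(), flush the IOTLB (e.g. iommu_flush_iotlb_psi())
 * so the hardware can no longer walk them, and only then release the
 * pages with dma_free_pagelist(); see the comment above domain_unmap().
 */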
1207
1208 static void iova_entry_free(unsigned long data)
1209 {
1210         struct page *freelist = (struct page *)data;
1211
1212         dma_free_pagelist(freelist);
1213 }
1214
1215 /* iommu handling */
1216 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1217 {
1218         struct root_entry *root;
1219         unsigned long flags;
1220
1221         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1222         if (!root) {
1223                 pr_err("Allocating root entry for %s failed\n",
1224                         iommu->name);
1225                 return -ENOMEM;
1226         }
1227
1228         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1229
1230         spin_lock_irqsave(&iommu->lock, flags);
1231         iommu->root_entry = root;
1232         spin_unlock_irqrestore(&iommu->lock, flags);
1233
1234         return 0;
1235 }
1236
1237 static void iommu_set_root_entry(struct intel_iommu *iommu)
1238 {
1239         u64 addr;
1240         u32 sts;
1241         unsigned long flag;
1242
1243         addr = virt_to_phys(iommu->root_entry);
1244         if (sm_supported(iommu))
1245                 addr |= DMA_RTADDR_SMT;
1246
1247         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1248         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1249
1250         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1251
1252         /* Make sure the hardware completes it */
1253         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1254                       readl, (sts & DMA_GSTS_RTPS), sts);
1255
1256         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1257 }
1258
1259 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1260 {
1261         u32 val;
1262         unsigned long flag;
1263
1264         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1265                 return;
1266
1267         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1268         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1269
1270         /* Make sure the hardware completes it */
1271         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1272                       readl, (!(val & DMA_GSTS_WBFS)), val);
1273
1274         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1275 }
1276
1277 /* return value determines if we need a write buffer flush */
1278 static void __iommu_flush_context(struct intel_iommu *iommu,
1279                                   u16 did, u16 source_id, u8 function_mask,
1280                                   u64 type)
1281 {
1282         u64 val = 0;
1283         unsigned long flag;
1284
1285         switch (type) {
1286         case DMA_CCMD_GLOBAL_INVL:
1287                 val = DMA_CCMD_GLOBAL_INVL;
1288                 break;
1289         case DMA_CCMD_DOMAIN_INVL:
1290                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1291                 break;
1292         case DMA_CCMD_DEVICE_INVL:
1293                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1294                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1295                 break;
1296         default:
1297                 BUG();
1298         }
1299         val |= DMA_CCMD_ICC;
1300
1301         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1303
1304         /* Make sure the hardware completes it */
1305         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1306                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1307
1308         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1309 }
1310
1311 /* return value determines if we need a write buffer flush */
1312 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1313                                 u64 addr, unsigned int size_order, u64 type)
1314 {
1315         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1316         u64 val = 0, val_iva = 0;
1317         unsigned long flag;
1318
1319         switch (type) {
1320         case DMA_TLB_GLOBAL_FLUSH:
1321                 /* a global flush doesn't need to set IVA_REG */
1322                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1323                 break;
1324         case DMA_TLB_DSI_FLUSH:
1325                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1326                 break;
1327         case DMA_TLB_PSI_FLUSH:
1328                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1329                 /* IH bit is passed in as part of address */
1330                 val_iva = size_order | addr;
1331                 break;
1332         default:
1333                 BUG();
1334         }
1335         /* Note: set drain read/write */
1336 #if 0
1337         /*
1338          * This is probably only needed to be extra safe; it looks
1339          * like we can ignore it without any impact.
1340          */
1341         if (cap_read_drain(iommu->cap))
1342                 val |= DMA_TLB_READ_DRAIN;
1343 #endif
1344         if (cap_write_drain(iommu->cap))
1345                 val |= DMA_TLB_WRITE_DRAIN;
1346
1347         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1348         /* Note: Only uses first TLB reg currently */
1349         if (val_iva)
1350                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1351         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1352
1353         /* Make sure the hardware completes it */
1354         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1355                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1356
1357         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1358
1359         /* check IOTLB invalidation granularity */
1360         if (DMA_TLB_IAIG(val) == 0)
1361                 pr_err("Flush IOTLB failed\n");
1362         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1363                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1364                         (unsigned long long)DMA_TLB_IIRG(type),
1365                         (unsigned long long)DMA_TLB_IAIG(val));
1366 }
1367
1368 static struct device_domain_info *
1369 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1370                          u8 bus, u8 devfn)
1371 {
1372         struct device_domain_info *info;
1373
1374         assert_spin_locked(&device_domain_lock);
1375
1376         if (!iommu->qi)
1377                 return NULL;
1378
1379         list_for_each_entry(info, &domain->devices, link)
1380                 if (info->iommu == iommu && info->bus == bus &&
1381                     info->devfn == devfn) {
1382                         if (info->ats_supported && info->dev)
1383                                 return info;
1384                         break;
1385                 }
1386
1387         return NULL;
1388 }
1389
1390 static void domain_update_iotlb(struct dmar_domain *domain)
1391 {
1392         struct device_domain_info *info;
1393         bool has_iotlb_device = false;
1394
1395         assert_spin_locked(&device_domain_lock);
1396
1397         list_for_each_entry(info, &domain->devices, link) {
1398                 struct pci_dev *pdev;
1399
1400                 if (!info->dev || !dev_is_pci(info->dev))
1401                         continue;
1402
1403                 pdev = to_pci_dev(info->dev);
1404                 if (pdev->ats_enabled) {
1405                         has_iotlb_device = true;
1406                         break;
1407                 }
1408         }
1409
1410         domain->has_iotlb_device = has_iotlb_device;
1411 }
1412
1413 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1414 {
1415         struct pci_dev *pdev;
1416
1417         assert_spin_locked(&device_domain_lock);
1418
1419         if (!info || !dev_is_pci(info->dev))
1420                 return;
1421
1422         pdev = to_pci_dev(info->dev);
1423         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1424          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1425          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1426          * reserved, which should be set to 0.
1427          */
1428         if (!ecap_dit(info->iommu->ecap))
1429                 info->pfsid = 0;
1430         else {
1431                 struct pci_dev *pf_pdev;
1432
1433                 /* pdev will be returned if device is not a vf */
1434                 pf_pdev = pci_physfn(pdev);
1435                 info->pfsid = pci_dev_id(pf_pdev);
1436         }
1437
1438 #ifdef CONFIG_INTEL_IOMMU_SVM
1439         /* The PCIe spec, in its wisdom, declares that the behaviour of
1440            the device if you enable PASID support after ATS support is
1441            undefined. So always enable PASID support on devices which
1442            have it, even if we can't yet know if we're ever going to
1443            use it. */
1444         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1445                 info->pasid_enabled = 1;
1446
1447         if (info->pri_supported &&
1448             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1449             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1450                 info->pri_enabled = 1;
1451 #endif
1452         if (!pdev->untrusted && info->ats_supported &&
1453             pci_ats_page_aligned(pdev) &&
1454             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1455                 info->ats_enabled = 1;
1456                 domain_update_iotlb(info->domain);
1457                 info->ats_qdep = pci_ats_queue_depth(pdev);
1458         }
1459 }
1460
1461 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1462 {
1463         struct pci_dev *pdev;
1464
1465         assert_spin_locked(&device_domain_lock);
1466
1467         if (!dev_is_pci(info->dev))
1468                 return;
1469
1470         pdev = to_pci_dev(info->dev);
1471
1472         if (info->ats_enabled) {
1473                 pci_disable_ats(pdev);
1474                 info->ats_enabled = 0;
1475                 domain_update_iotlb(info->domain);
1476         }
1477 #ifdef CONFIG_INTEL_IOMMU_SVM
1478         if (info->pri_enabled) {
1479                 pci_disable_pri(pdev);
1480                 info->pri_enabled = 0;
1481         }
1482         if (info->pasid_enabled) {
1483                 pci_disable_pasid(pdev);
1484                 info->pasid_enabled = 0;
1485         }
1486 #endif
1487 }
1488
1489 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1490                                   u64 addr, unsigned mask)
1491 {
1492         u16 sid, qdep;
1493         unsigned long flags;
1494         struct device_domain_info *info;
1495
1496         if (!domain->has_iotlb_device)
1497                 return;
1498
1499         spin_lock_irqsave(&device_domain_lock, flags);
1500         list_for_each_entry(info, &domain->devices, link) {
1501                 if (!info->ats_enabled)
1502                         continue;
1503
1504                 sid = info->bus << 8 | info->devfn;
1505                 qdep = info->ats_qdep;
1506                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1507                                 qdep, addr, mask);
1508         }
1509         spin_unlock_irqrestore(&device_domain_lock, flags);
1510 }
1511
1512 static void domain_flush_piotlb(struct intel_iommu *iommu,
1513                                 struct dmar_domain *domain,
1514                                 u64 addr, unsigned long npages, bool ih)
1515 {
1516         u16 did = domain->iommu_did[iommu->seq_id];
1517
1518         if (domain->default_pasid)
1519                 qi_flush_piotlb(iommu, did, domain->default_pasid,
1520                                 addr, npages, ih);
1521
1522         if (!list_empty(&domain->devices))
1523                 qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, npages, ih);
1524 }
1525
1526 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1527                                   struct dmar_domain *domain,
1528                                   unsigned long pfn, unsigned int pages,
1529                                   int ih, int map)
1530 {
1531         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1532         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1533         u16 did = domain->iommu_did[iommu->seq_id];
1534
1535         BUG_ON(pages == 0);
1536
1537         if (ih)
1538                 ih = 1 << 6;
1539
1540         if (domain_use_first_level(domain)) {
1541                 domain_flush_piotlb(iommu, domain, addr, pages, ih);
1542         } else {
1543                 /*
1544                  * Fall back to domain-selective flush if there is no PSI support
1545                  * or the size is too big. PSI requires the page size to be 2^x
1546                  * and the base address to be naturally aligned to that size.
1547                  */
1548                 if (!cap_pgsel_inv(iommu->cap) ||
1549                     mask > cap_max_amask_val(iommu->cap))
1550                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1551                                                         DMA_TLB_DSI_FLUSH);
1552                 else
1553                         iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1554                                                         DMA_TLB_PSI_FLUSH);
1555         }
1556
1557         /*
1558          * In caching mode, changes of pages from non-present to present require
1559          * a flush. However, the device IOTLB does not need to be flushed in this case.
1560          */
1561         if (!cap_caching_mode(iommu->cap) || !map)
1562                 iommu_flush_dev_iotlb(domain, addr, mask);
1563 }
1564
1565 /* Notification for newly created mappings */
1566 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1567                                         struct dmar_domain *domain,
1568                                         unsigned long pfn, unsigned int pages)
1569 {
1570         /*
1571          * It's a non-present to present mapping. Only flush in caching mode
1572          * and when second-level translation is in use.
1573          */
1574         if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1575                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1576         else
1577                 iommu_flush_write_buffer(iommu);
1578 }
1579
1580 static void iommu_flush_iova(struct iova_domain *iovad)
1581 {
1582         struct dmar_domain *domain;
1583         int idx;
1584
1585         domain = container_of(iovad, struct dmar_domain, iovad);
1586
1587         for_each_domain_iommu(idx, domain) {
1588                 struct intel_iommu *iommu = g_iommus[idx];
1589                 u16 did = domain->iommu_did[iommu->seq_id];
1590
1591                 if (domain_use_first_level(domain))
1592                         domain_flush_piotlb(iommu, domain, 0, -1, 0);
1593                 else
1594                         iommu->flush.flush_iotlb(iommu, did, 0, 0,
1595                                                  DMA_TLB_DSI_FLUSH);
1596
1597                 if (!cap_caching_mode(iommu->cap))
1598                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1599                                               0, MAX_AGAW_PFN_WIDTH);
1600         }
1601 }
1602
1603 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1604 {
1605         u32 pmen;
1606         unsigned long flags;
1607
1608         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1609                 return;
1610
1611         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1612         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1613         pmen &= ~DMA_PMEN_EPM;
1614         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1615
1616         /* wait for the protected region status bit to clear */
1617         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1618                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1619
1620         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1621 }
1622
1623 static void iommu_enable_translation(struct intel_iommu *iommu)
1624 {
1625         u32 sts;
1626         unsigned long flags;
1627
1628         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1629         iommu->gcmd |= DMA_GCMD_TE;
1630         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1631
1632         /* Make sure hardware completes it */
1633         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1634                       readl, (sts & DMA_GSTS_TES), sts);
1635
1636         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1637 }
1638
1639 static void iommu_disable_translation(struct intel_iommu *iommu)
1640 {
1641         u32 sts;
1642         unsigned long flag;
1643
1644         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1645         iommu->gcmd &= ~DMA_GCMD_TE;
1646         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1647
1648         /* Make sure hardware completes it */
1649         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1650                       readl, (!(sts & DMA_GSTS_TES)), sts);
1651
1652         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1653 }
1654
1655 static int iommu_init_domains(struct intel_iommu *iommu)
1656 {
1657         u32 ndomains, nlongs;
1658         size_t size;
1659
1660         ndomains = cap_ndoms(iommu->cap);
1661         pr_debug("%s: Number of Domains supported <%d>\n",
1662                  iommu->name, ndomains);
1663         nlongs = BITS_TO_LONGS(ndomains);
1664
1665         spin_lock_init(&iommu->lock);
1666
1667         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1668         if (!iommu->domain_ids) {
1669                 pr_err("%s: Allocating domain id array failed\n",
1670                        iommu->name);
1671                 return -ENOMEM;
1672         }
1673
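        /*
         * iommu->domains is a two-level array: the top level holds one
         * pointer per group of 256 domain-ids, and only the group for
         * ids 0-255 is pre-allocated here; further groups are expected
         * to be allocated on demand.
         */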
1674         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1675         iommu->domains = kzalloc(size, GFP_KERNEL);
1676
1677         if (iommu->domains) {
1678                 size = 256 * sizeof(struct dmar_domain *);
1679                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1680         }
1681
1682         if (!iommu->domains || !iommu->domains[0]) {
1683                 pr_err("%s: Allocating domain array failed\n",
1684                        iommu->name);
1685                 kfree(iommu->domain_ids);
1686                 kfree(iommu->domains);
1687                 iommu->domain_ids = NULL;
1688                 iommu->domains    = NULL;
1689                 return -ENOMEM;
1690         }
1691
1692         /*
1693          * If Caching mode is set, then invalid translations are tagged
1694          * with domain-id 0, hence we need to pre-allocate it. We also
1695          * use domain-id 0 as a marker for non-allocated domain-id, so
1696          * make sure it is not used for a real domain.
1697          */
1698         set_bit(0, iommu->domain_ids);
1699
1700         /*
1701          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1702          * entry for first-level or pass-through translation modes should
1703          * be programmed with a domain id different from those used for
1704          * second-level or nested translation. We reserve a domain id for
1705          * this purpose.
1706          */
1707         if (sm_supported(iommu))
1708                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1709
1710         return 0;
1711 }
1712
1713 static void disable_dmar_iommu(struct intel_iommu *iommu)
1714 {
1715         struct device_domain_info *info, *tmp;
1716         unsigned long flags;
1717
1718         if (!iommu->domains || !iommu->domain_ids)
1719                 return;
1720
1721         spin_lock_irqsave(&device_domain_lock, flags);
1722         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1723                 if (info->iommu != iommu)
1724                         continue;
1725
1726                 if (!info->dev || !info->domain)
1727                         continue;
1728
1729                 __dmar_remove_one_dev_info(info);
1730         }
1731         spin_unlock_irqrestore(&device_domain_lock, flags);
1732
1733         if (iommu->gcmd & DMA_GCMD_TE)
1734                 iommu_disable_translation(iommu);
1735 }
1736
1737 static void free_dmar_iommu(struct intel_iommu *iommu)
1738 {
1739         if ((iommu->domains) && (iommu->domain_ids)) {
1740                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1741                 int i;
1742
1743                 for (i = 0; i < elems; i++)
1744                         kfree(iommu->domains[i]);
1745                 kfree(iommu->domains);
1746                 kfree(iommu->domain_ids);
1747                 iommu->domains = NULL;
1748                 iommu->domain_ids = NULL;
1749         }
1750
1751         g_iommus[iommu->seq_id] = NULL;
1752
1753         /* free context mapping */
1754         free_context_table(iommu);
1755
1756 #ifdef CONFIG_INTEL_IOMMU_SVM
1757         if (pasid_supported(iommu)) {
1758                 if (ecap_prs(iommu->ecap))
1759                         intel_svm_finish_prq(iommu);
1760         }
1761 #endif
1762 }
1763
1764 /*
1765  * Check and return whether first level is used by default for
1766  * DMA translation.
1767  */
1768 static bool first_level_by_default(void)
1769 {
1770         struct dmar_drhd_unit *drhd;
1771         struct intel_iommu *iommu;
1772         static int first_level_support = -1;
1773
1774         if (likely(first_level_support != -1))
1775                 return first_level_support;
1776
1777         first_level_support = 1;
1778
1779         rcu_read_lock();
1780         for_each_active_iommu(iommu, drhd) {
1781                 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap)) {
1782                         first_level_support = 0;
1783                         break;
1784                 }
1785         }
1786         rcu_read_unlock();
1787
1788         return first_level_support;
1789 }
1790
1791 static struct dmar_domain *alloc_domain(int flags)
1792 {
1793         struct dmar_domain *domain;
1794
1795         domain = alloc_domain_mem();
1796         if (!domain)
1797                 return NULL;
1798
1799         memset(domain, 0, sizeof(*domain));
1800         domain->nid = NUMA_NO_NODE;
1801         domain->flags = flags;
1802         if (first_level_by_default())
1803                 domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1804         domain->has_iotlb_device = false;
1805         INIT_LIST_HEAD(&domain->devices);
1806
1807         return domain;
1808 }
1809
1810 /* Must be called with device_domain_lock and iommu->lock held */
1811 static int domain_attach_iommu(struct dmar_domain *domain,
1812                                struct intel_iommu *iommu)
1813 {
1814         unsigned long ndomains;
1815         int num;
1816
1817         assert_spin_locked(&device_domain_lock);
1818         assert_spin_locked(&iommu->lock);
1819
1820         domain->iommu_refcnt[iommu->seq_id] += 1;
1821         domain->iommu_count += 1;
1822         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1823                 ndomains = cap_ndoms(iommu->cap);
1824                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1825
1826                 if (num >= ndomains) {
1827                         pr_err("%s: No free domain ids\n", iommu->name);
1828                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1829                         domain->iommu_count -= 1;
1830                         return -ENOSPC;
1831                 }
1832
1833                 set_bit(num, iommu->domain_ids);
1834                 set_iommu_domain(iommu, num, domain);
1835
1836                 domain->iommu_did[iommu->seq_id] = num;
1837                 domain->nid                      = iommu->node;
1838
1839                 domain_update_iommu_cap(domain);
1840         }
1841
1842         return 0;
1843 }
1844
1845 static int domain_detach_iommu(struct dmar_domain *domain,
1846                                struct intel_iommu *iommu)
1847 {
1848         int num, count;
1849
1850         assert_spin_locked(&device_domain_lock);
1851         assert_spin_locked(&iommu->lock);
1852
1853         domain->iommu_refcnt[iommu->seq_id] -= 1;
1854         count = --domain->iommu_count;
1855         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1856                 num = domain->iommu_did[iommu->seq_id];
1857                 clear_bit(num, iommu->domain_ids);
1858                 set_iommu_domain(iommu, num, NULL);
1859
1860                 domain_update_iommu_cap(domain);
1861                 domain->iommu_did[iommu->seq_id] = 0;
1862         }
1863
1864         return count;
1865 }
1866
1867 static struct iova_domain reserved_iova_list;
1868 static struct lock_class_key reserved_rbtree_key;
1869
1870 static int dmar_init_reserved_ranges(void)
1871 {
1872         struct pci_dev *pdev = NULL;
1873         struct iova *iova;
1874         int i;
1875
1876         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1877
1878         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1879                 &reserved_rbtree_key);
1880
1881         /* IOAPIC ranges shouldn't be accessed by DMA */
1882         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1883                 IOVA_PFN(IOAPIC_RANGE_END));
1884         if (!iova) {
1885                 pr_err("Reserve IOAPIC range failed\n");
1886                 return -ENODEV;
1887         }
1888
1889         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1890         for_each_pci_dev(pdev) {
1891                 struct resource *r;
1892
1893                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1894                         r = &pdev->resource[i];
1895                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1896                                 continue;
1897                         iova = reserve_iova(&reserved_iova_list,
1898                                             IOVA_PFN(r->start),
1899                                             IOVA_PFN(r->end));
1900                         if (!iova) {
1901                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1902                                 return -ENODEV;
1903                         }
1904                 }
1905         }
1906         return 0;
1907 }
1908
1909 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1910 {
1911         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1912 }
1913
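/*
 * Illustration: the adjusted width is gaw rounded up to the next page-table
 * level boundary (12-bit offset plus a multiple of 9 bits), capped at 64;
 * e.g. 39 and 48 stay as they are, while 40 becomes 48.
 */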
1914 static inline int guestwidth_to_adjustwidth(int gaw)
1915 {
1916         int agaw;
1917         int r = (gaw - 12) % 9;
1918
1919         if (r == 0)
1920                 agaw = gaw;
1921         else
1922                 agaw = gaw + 9 - r;
1923         if (agaw > 64)
1924                 agaw = 64;
1925         return agaw;
1926 }
1927
1928 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1929                        int guest_width)
1930 {
1931         int adjust_width, agaw;
1932         unsigned long sagaw;
1933         int ret;
1934
1935         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1936
1937         if (!intel_iommu_strict) {
1938                 ret = init_iova_flush_queue(&domain->iovad,
1939                                             iommu_flush_iova, iova_entry_free);
1940                 if (ret)
1941                         pr_info("iova flush queue initialization failed\n");
1942         }
1943
1944         domain_reserve_special_ranges(domain);
1945
1946         /* calculate AGAW */
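        /*
         * Illustration (assuming the usual 30 + 9 * agaw width encoding): a
         * 48-bit adjusted width corresponds to agaw 2, i.e. 4-level tables;
         * if SAGAW lacks that bit, the next larger supported agaw is used.
         */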
1947         if (guest_width > cap_mgaw(iommu->cap))
1948                 guest_width = cap_mgaw(iommu->cap);
1949         domain->gaw = guest_width;
1950         adjust_width = guestwidth_to_adjustwidth(guest_width);
1951         agaw = width_to_agaw(adjust_width);
1952         sagaw = cap_sagaw(iommu->cap);
1953         if (!test_bit(agaw, &sagaw)) {
1954                 /* hardware doesn't support it, choose a bigger one */
1955                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1956                 agaw = find_next_bit(&sagaw, 5, agaw);
1957                 if (agaw >= 5)
1958                         return -ENODEV;
1959         }
1960         domain->agaw = agaw;
1961
1962         if (ecap_coherent(iommu->ecap))
1963                 domain->iommu_coherency = 1;
1964         else
1965                 domain->iommu_coherency = 0;
1966
1967         if (ecap_sc_support(iommu->ecap))
1968                 domain->iommu_snooping = 1;
1969         else
1970                 domain->iommu_snooping = 0;
1971
1972         if (intel_iommu_superpage)
1973                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1974         else
1975                 domain->iommu_superpage = 0;
1976
1977         domain->nid = iommu->node;
1978
1979         /* always allocate the top pgd */
1980         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1981         if (!domain->pgd)
1982                 return -ENOMEM;
1983         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1984         return 0;
1985 }
1986
1987 static void domain_exit(struct dmar_domain *domain)
1988 {
1989
1990         /* Remove associated devices and clear attached or cached domains */
1991         domain_remove_dev_info(domain);
1992
1993         /* destroy iovas */
1994         put_iova_domain(&domain->iovad);
1995
1996         if (domain->pgd) {
1997                 struct page *freelist;
1998
1999                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
2000                 dma_free_pagelist(freelist);
2001         }
2002
2003         free_domain_mem(domain);
2004 }
2005
2006 /*
2007  * Get the PASID directory size for scalable mode context entry.
2008  * Value of X in the PDTS field of a scalable mode context entry
2009  * indicates PASID directory with 2^(X + 7) entries.
2010  */
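/*
 * Worked example (illustrative, assuming PASID_PDE_SHIFT is 6): with
 * max_pasid == 0x10000 the directory needs 0x10000 >> 6 == 1024 entries,
 * find_first_bit() on 1024 returns 10, so pds == 10 - 7 == 3, and the PDTS
 * coding decodes back to 2^(3 + 7) == 1024 directory entries.
 */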
2011 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
2012 {
2013         int pds, max_pde;
2014
2015         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
2016         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
2017         if (pds < 7)
2018                 return 0;
2019
2020         return pds - 7;
2021 }
2022
2023 /*
2024  * Set the RID_PASID field of a scalable mode context entry. The
2025  * IOMMU hardware will use the PASID value set in this field for
2026  * DMA translations of DMA requests without PASID.
2027  */
2028 static inline void
2029 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
2030 {
2031         context->hi |= pasid & ((1 << 20) - 1);
2032         context->hi |= (1 << 20);
2033 }
2034
2035 /*
2036  * Set the DTE(Device-TLB Enable) field of a scalable mode context
2037  * entry.
2038  */
2039 static inline void context_set_sm_dte(struct context_entry *context)
2040 {
2041         context->lo |= (1 << 2);
2042 }
2043
2044 /*
2045  * Set the PRE(Page Request Enable) field of a scalable mode context
2046  * entry.
2047  */
2048 static inline void context_set_sm_pre(struct context_entry *context)
2049 {
2050         context->lo |= (1 << 4);
2051 }
2052
2053 /* Convert value to context PASID directory size field coding. */
2054 #define context_pdts(pds)       (((pds) & 0x7) << 9)
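/*
 * Illustration: a pds value of 3 (1024 directory entries, see
 * context_get_sm_pds() above) is placed in bits 11:9 of the low
 * context-entry qword.
 */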
2055
2056 static int domain_context_mapping_one(struct dmar_domain *domain,
2057                                       struct intel_iommu *iommu,
2058                                       struct pasid_table *table,
2059                                       u8 bus, u8 devfn)
2060 {
2061         u16 did = domain->iommu_did[iommu->seq_id];
2062         int translation = CONTEXT_TT_MULTI_LEVEL;
2063         struct device_domain_info *info = NULL;
2064         struct context_entry *context;
2065         unsigned long flags;
2066         int ret;
2067
2068         WARN_ON(did == 0);
2069
2070         if (hw_pass_through && domain_type_is_si(domain))
2071                 translation = CONTEXT_TT_PASS_THROUGH;
2072
2073         pr_debug("Set context mapping for %02x:%02x.%d\n",
2074                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
2075
2076         BUG_ON(!domain->pgd);
2077
2078         spin_lock_irqsave(&device_domain_lock, flags);
2079         spin_lock(&iommu->lock);
2080
2081         ret = -ENOMEM;
2082         context = iommu_context_addr(iommu, bus, devfn, 1);
2083         if (!context)
2084                 goto out_unlock;
2085
2086         ret = 0;
2087         if (context_present(context))
2088                 goto out_unlock;
2089
2090         /*
2091          * For kdump cases, old valid entries may be cached due to the
2092          * in-flight DMA and copied pgtable, but there is no unmapping
2093          * behaviour for them, thus we need an explicit cache flush for
2094          * the newly-mapped device. For kdump, at this point, the device
2095          * is supposed to finish reset at its driver probe stage, so no
2096          * is supposed to have finished reset at its driver probe stage, so
2097          * no in-flight DMA will exist and we don't need to worry about it
2098          * hereafter.
2099         if (context_copied(context)) {
2100                 u16 did_old = context_domain_id(context);
2101
2102                 if (did_old < cap_ndoms(iommu->cap)) {
2103                         iommu->flush.flush_context(iommu, did_old,
2104                                                    (((u16)bus) << 8) | devfn,
2105                                                    DMA_CCMD_MASK_NOBIT,
2106                                                    DMA_CCMD_DEVICE_INVL);
2107                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2108                                                  DMA_TLB_DSI_FLUSH);
2109                 }
2110         }
2111
2112         context_clear_entry(context);
2113
2114         if (sm_supported(iommu)) {
2115                 unsigned long pds;
2116
2117                 WARN_ON(!table);
2118
2119                 /* Setup the PASID DIR pointer: */
2120                 pds = context_get_sm_pds(table);
2121                 context->lo = (u64)virt_to_phys(table->table) |
2122                                 context_pdts(pds);
2123
2124                 /* Setup the RID_PASID field: */
2125                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2126
2127                 /*
2128                  * Setup the Device-TLB enable bit and Page request
2129                  * Enable bit:
2130                  */
2131                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2132                 if (info && info->ats_supported)
2133                         context_set_sm_dte(context);
2134                 if (info && info->pri_supported)
2135                         context_set_sm_pre(context);
2136         } else {
2137                 struct dma_pte *pgd = domain->pgd;
2138                 int agaw;
2139
2140                 context_set_domain_id(context, did);
2141
2142                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2143                         /*
2144                          * Skip top levels of page tables for an IOMMU which has
2145                          * a smaller agaw than the default. Unnecessary for PT mode.
2146                          */
2147                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2148                                 ret = -ENOMEM;
2149                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2150                                 if (!dma_pte_present(pgd))
2151                                         goto out_unlock;
2152                         }
2153
2154                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2155                         if (info && info->ats_supported)
2156                                 translation = CONTEXT_TT_DEV_IOTLB;
2157                         else
2158                                 translation = CONTEXT_TT_MULTI_LEVEL;
2159
2160                         context_set_address_root(context, virt_to_phys(pgd));
2161                         context_set_address_width(context, agaw);
2162                 } else {
2163                         /*
2164                          * In pass through mode, AW must be programmed to
2165                          * indicate the largest AGAW value supported by
2166                          * hardware. And ASR is ignored by hardware.
2167                          */
2168                         context_set_address_width(context, iommu->msagaw);
2169                 }
2170
2171                 context_set_translation_type(context, translation);
2172         }
2173
2174         context_set_fault_enable(context);
2175         context_set_present(context);
2176         domain_flush_cache(domain, context, sizeof(*context));
2177
2178         /*
2179          * It's a non-present to present mapping. If hardware doesn't cache
2180          * non-present entries we only need to flush the write-buffer. If it
2181          * _does_ cache non-present entries, then it does so in the special
2182          * domain #0, which we have to flush:
2183          */
2184         if (cap_caching_mode(iommu->cap)) {
2185                 iommu->flush.flush_context(iommu, 0,
2186                                            (((u16)bus) << 8) | devfn,
2187                                            DMA_CCMD_MASK_NOBIT,
2188                                            DMA_CCMD_DEVICE_INVL);
2189                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2190         } else {
2191                 iommu_flush_write_buffer(iommu);
2192         }
2193         iommu_enable_dev_iotlb(info);
2194
2195         ret = 0;
2196
2197 out_unlock:
2198         spin_unlock(&iommu->lock);
2199         spin_unlock_irqrestore(&device_domain_lock, flags);
2200
2201         return ret;
2202 }
2203
2204 struct domain_context_mapping_data {
2205         struct dmar_domain *domain;
2206         struct intel_iommu *iommu;
2207         struct pasid_table *table;
2208 };
2209
2210 static int domain_context_mapping_cb(struct pci_dev *pdev,
2211                                      u16 alias, void *opaque)
2212 {
2213         struct domain_context_mapping_data *data = opaque;
2214
2215         return domain_context_mapping_one(data->domain, data->iommu,
2216                                           data->table, PCI_BUS_NUM(alias),
2217                                           alias & 0xff);
2218 }
2219
2220 static int
2221 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2222 {
2223         struct domain_context_mapping_data data;
2224         struct pasid_table *table;
2225         struct intel_iommu *iommu;
2226         u8 bus, devfn;
2227
2228         iommu = device_to_iommu(dev, &bus, &devfn);
2229         if (!iommu)
2230                 return -ENODEV;
2231
2232         table = intel_pasid_get_table(dev);
2233
2234         if (!dev_is_pci(dev))
2235                 return domain_context_mapping_one(domain, iommu, table,
2236                                                   bus, devfn);
2237
2238         data.domain = domain;
2239         data.iommu = iommu;
2240         data.table = table;
2241
2242         return pci_for_each_dma_alias(to_pci_dev(dev),
2243                                       &domain_context_mapping_cb, &data);
2244 }
2245
2246 static int domain_context_mapped_cb(struct pci_dev *pdev,
2247                                     u16 alias, void *opaque)
2248 {
2249         struct intel_iommu *iommu = opaque;
2250
2251         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2252 }
2253
2254 static int domain_context_mapped(struct device *dev)
2255 {
2256         struct intel_iommu *iommu;
2257         u8 bus, devfn;
2258
2259         iommu = device_to_iommu(dev, &bus, &devfn);
2260         if (!iommu)
2261                 return -ENODEV;
2262
2263         if (!dev_is_pci(dev))
2264                 return device_context_mapped(iommu, bus, devfn);
2265
2266         return !pci_for_each_dma_alias(to_pci_dev(dev),
2267                                        domain_context_mapped_cb, iommu);
2268 }
2269
2270 /* Returns the number of VT-d pages, but aligned to the MM page size */
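/*
 * Illustration (assuming 4KiB MM and VT-d pages): a buffer at page offset
 * 0x800 with size 0x1000 spans two pages, and PAGE_ALIGN(0x800 + 0x1000)
 * >> VTD_PAGE_SHIFT is indeed 2.
 */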
2271 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2272                                             size_t size)
2273 {
2274         host_addr &= ~PAGE_MASK;
2275         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2276 }
2277
2278 /* Return largest possible superpage level for a given mapping */
2279 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2280                                           unsigned long iov_pfn,
2281                                           unsigned long phy_pfn,
2282                                           unsigned long pages)
2283 {
2284         int support, level = 1;
2285         unsigned long pfnmerge;
2286
2287         support = domain->iommu_superpage;
2288
2289         /* To use a large page, the virtual *and* physical addresses
2290            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2291            of them will mean we have to use smaller pages. So just
2292            merge them and check both at once. */
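        /*
         * Illustration: with a 9-bit stride, a 2MiB superpage (level 2)
         * needs the low 9 bits of both PFNs to be zero and at least 512
         * pages left to map; each further level needs 9 more aligned bits
         * and 512 times more pages.
         */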
2293         pfnmerge = iov_pfn | phy_pfn;
2294
2295         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2296                 pages >>= VTD_STRIDE_SHIFT;
2297                 if (!pages)
2298                         break;
2299                 pfnmerge >>= VTD_STRIDE_SHIFT;
2300                 level++;
2301                 support--;
2302         }
2303         return level;
2304 }
2305
2306 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2307                             struct scatterlist *sg, unsigned long phys_pfn,
2308                             unsigned long nr_pages, int prot)
2309 {
2310         struct dma_pte *first_pte = NULL, *pte = NULL;
2311         phys_addr_t uninitialized_var(pteval);
2312         unsigned long sg_res = 0;
2313         unsigned int largepage_lvl = 0;
2314         unsigned long lvl_pages = 0;
2315         u64 attr;
2316
2317         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2318
2319         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2320                 return -EINVAL;
2321
2322         attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2323         if (domain_use_first_level(domain))
2324                 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_XD;
2325
2326         if (!sg) {
2327                 sg_res = nr_pages;
2328                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2329         }
2330
2331         while (nr_pages > 0) {
2332                 uint64_t tmp;
2333
2334                 if (!sg_res) {
2335                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2336
2337                         sg_res = aligned_nrpages(sg->offset, sg->length);
2338                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2339                         sg->dma_length = sg->length;
2340                         pteval = (sg_phys(sg) - pgoff) | attr;
2341                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2342                 }
2343
2344                 if (!pte) {
2345                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2346
2347                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2348                         if (!pte)
2349                                 return -ENOMEM;
2350                         /* It is a large page */
2351                         if (largepage_lvl > 1) {
2352                                 unsigned long nr_superpages, end_pfn;
2353
2354                                 pteval |= DMA_PTE_LARGE_PAGE;
2355                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2356
2357                                 nr_superpages = sg_res / lvl_pages;
2358                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2359
2360                                 /*
2361                                  * Ensure that old small page tables are
2362                                  * removed to make room for superpage(s).
2363                                  * We're adding new large pages, so make sure
2364                                  * we don't remove their parent tables.
2365                                  */
2366                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2367                                                        largepage_lvl + 1);
2368                         } else {
2369                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2370                         }
2371
2372                 }
2373                 /* We don't need a lock here; nobody else
2374                  * touches this iova range.
2375                  */
2376                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2377                 if (tmp) {
2378                         static int dumps = 5;
2379                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2380                                 iov_pfn, tmp, (unsigned long long)pteval);
2381                         if (dumps) {
2382                                 dumps--;
2383                                 debug_dma_dump_mappings(NULL);
2384                         }
2385                         WARN_ON(1);
2386                 }
2387
2388                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2389
2390                 BUG_ON(nr_pages < lvl_pages);
2391                 BUG_ON(sg_res < lvl_pages);
2392
2393                 nr_pages -= lvl_pages;
2394                 iov_pfn += lvl_pages;
2395                 phys_pfn += lvl_pages;
2396                 pteval += lvl_pages * VTD_PAGE_SIZE;
2397                 sg_res -= lvl_pages;
2398
2399                 /* If the next PTE would be the first in a new page, then we
2400                    need to flush the cache on the entries we've just written.
2401                    And then we'll need to recalculate 'pte', so clear it and
2402                    let it get set again in the if (!pte) block above.
2403
2404                    If we're done (!nr_pages) we need to flush the cache too.
2405
2406                    Also if we've been setting superpages, we may need to
2407                    recalculate 'pte' and switch back to smaller pages for the
2408                    end of the mapping, if the trailing size is not enough to
2409                    use another superpage (i.e. sg_res < lvl_pages). */
2410                 pte++;
2411                 if (!nr_pages || first_pte_in_page(pte) ||
2412                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2413                         domain_flush_cache(domain, first_pte,
2414                                            (void *)pte - (void *)first_pte);
2415                         pte = NULL;
2416                 }
2417
2418                 if (!sg_res && nr_pages)
2419                         sg = sg_next(sg);
2420         }
2421         return 0;
2422 }
2423
2424 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2425                           struct scatterlist *sg, unsigned long phys_pfn,
2426                           unsigned long nr_pages, int prot)
2427 {
2428         int iommu_id, ret;
2429         struct intel_iommu *iommu;
2430
2431         /* Do the real mapping first */
2432         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2433         if (ret)
2434                 return ret;
2435
2436         for_each_domain_iommu(iommu_id, domain) {
2437                 iommu = g_iommus[iommu_id];
2438                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2439         }
2440
2441         return 0;
2442 }
2443
2444 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2445                                     struct scatterlist *sg, unsigned long nr_pages,
2446                                     int prot)
2447 {
2448         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2449 }
2450
2451 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2452                                      unsigned long phys_pfn, unsigned long nr_pages,
2453                                      int prot)
2454 {
2455         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2456 }
2457
2458 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2459 {
2460         unsigned long flags;
2461         struct context_entry *context;
2462         u16 did_old;
2463
2464         if (!iommu)
2465                 return;
2466
2467         spin_lock_irqsave(&iommu->lock, flags);
2468         context = iommu_context_addr(iommu, bus, devfn, 0);
2469         if (!context) {
2470                 spin_unlock_irqrestore(&iommu->lock, flags);
2471                 return;
2472         }
2473         did_old = context_domain_id(context);
2474         context_clear_entry(context);
2475         __iommu_flush_cache(iommu, context, sizeof(*context));
2476         spin_unlock_irqrestore(&iommu->lock, flags);
2477         iommu->flush.flush_context(iommu,
2478                                    did_old,
2479                                    (((u16)bus) << 8) | devfn,
2480                                    DMA_CCMD_MASK_NOBIT,
2481                                    DMA_CCMD_DEVICE_INVL);
2482         iommu->flush.flush_iotlb(iommu,
2483                                  did_old,
2484                                  0,
2485                                  0,
2486                                  DMA_TLB_DSI_FLUSH);
2487 }
2488
2489 static inline void unlink_domain_info(struct device_domain_info *info)
2490 {
2491         assert_spin_locked(&device_domain_lock);
2492         list_del(&info->link);
2493         list_del(&info->global);
2494         if (info->dev)
2495                 info->dev->archdata.iommu = NULL;
2496 }
2497
2498 static void domain_remove_dev_info(struct dmar_domain *domain)
2499 {
2500         struct device_domain_info *info, *tmp;
2501         unsigned long flags;
2502
2503         spin_lock_irqsave(&device_domain_lock, flags);
2504         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2505                 __dmar_remove_one_dev_info(info);
2506         spin_unlock_irqrestore(&device_domain_lock, flags);
2507 }
2508
2509 struct dmar_domain *find_domain(struct device *dev)
2510 {
2511         struct device_domain_info *info;
2512
2513         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO ||
2514                      dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO))
2515                 return NULL;
2516
2517         if (dev_is_pci(dev))
2518                 dev = &pci_real_dma_dev(to_pci_dev(dev))->dev;
2519
2520         /* No lock here, assumes no domain exit in normal case */
2521         info = dev->archdata.iommu;
2522         if (likely(info))
2523                 return info->domain;
2524
2525         return NULL;
2526 }
2527
2528 static struct dmar_domain *deferred_attach_domain(struct device *dev)
2529 {
2530         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2531                 struct iommu_domain *domain;
2532
2533                 dev->archdata.iommu = NULL;
2534                 domain = iommu_get_domain_for_dev(dev);
2535                 if (domain)
2536                         intel_iommu_attach_device(domain, dev);
2537         }
2538
2539         return find_domain(dev);
2540 }
2541
2542 static inline struct device_domain_info *
2543 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2544 {
2545         struct device_domain_info *info;
2546
2547         list_for_each_entry(info, &device_domain_list, global)
2548                 if (info->iommu->segment == segment && info->bus == bus &&
2549                     info->devfn == devfn)
2550                         return info;
2551
2552         return NULL;
2553 }
2554
2555 static int domain_setup_first_level(struct intel_iommu *iommu,
2556                                     struct dmar_domain *domain,
2557                                     struct device *dev,
2558                                     int pasid)
2559 {
2560         int flags = PASID_FLAG_SUPERVISOR_MODE;
2561         struct dma_pte *pgd = domain->pgd;
2562         int agaw, level;
2563
2564         /*
2565          * Skip top levels of page tables for an IOMMU which has
2566          * a smaller agaw than the default. Unnecessary for PT mode.
2567          */
2568         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2569                 pgd = phys_to_virt(dma_pte_addr(pgd));
2570                 if (!dma_pte_present(pgd))
2571                         return -ENOMEM;
2572         }
2573
2574         level = agaw_to_level(agaw);
2575         if (level != 4 && level != 5)
2576                 return -EINVAL;
2577
2578         flags |= (level == 5) ? PASID_FLAG_FL5LP : 0;
2579
2580         return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2581                                              domain->iommu_did[iommu->seq_id],
2582                                              flags);
2583 }
2584
2585 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2586                                                     int bus, int devfn,
2587                                                     struct device *dev,
2588                                                     struct dmar_domain *domain)
2589 {
2590         struct dmar_domain *found = NULL;
2591         struct device_domain_info *info;
2592         unsigned long flags;
2593         int ret;
2594
2595         info = alloc_devinfo_mem();
2596         if (!info)
2597                 return NULL;
2598
2599         info->bus = bus;
2600         info->devfn = devfn;
2601         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2602         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2603         info->ats_qdep = 0;
2604         info->dev = dev;
2605         info->domain = domain;
2606         info->iommu = iommu;
2607         info->pasid_table = NULL;
2608         info->auxd_enabled = 0;
2609         INIT_LIST_HEAD(&info->auxiliary_domains);
2610
2611         if (dev && dev_is_pci(dev)) {
2612                 struct pci_dev *pdev = to_pci_dev(info->dev);
2613
2614                 if (!pdev->untrusted &&
2615                     !pci_ats_disabled() &&
2616                     ecap_dev_iotlb_support(iommu->ecap) &&
2617                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2618                     dmar_find_matched_atsr_unit(pdev))
2619                         info->ats_supported = 1;
2620
2621                 if (sm_supported(iommu)) {
2622                         if (pasid_supported(iommu)) {
2623                                 int features = pci_pasid_features(pdev);
2624                                 if (features >= 0)
2625                                         info->pasid_supported = features | 1;
2626                         }
2627
2628                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2629                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2630                                 info->pri_supported = 1;
2631                 }
2632         }
2633
2634         spin_lock_irqsave(&device_domain_lock, flags);
2635         if (dev)
2636                 found = find_domain(dev);
2637
2638         if (!found) {
2639                 struct device_domain_info *info2;
2640                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2641                 if (info2) {
2642                         found      = info2->domain;
2643                         info2->dev = dev;
2644                 }
2645         }
2646
2647         if (found) {
2648                 spin_unlock_irqrestore(&device_domain_lock, flags);
2649                 free_devinfo_mem(info);
2650                 /* Caller must free the original domain */
2651                 return found;
2652         }
2653
2654         spin_lock(&iommu->lock);
2655         ret = domain_attach_iommu(domain, iommu);
2656         spin_unlock(&iommu->lock);
2657
2658         if (ret) {
2659                 spin_unlock_irqrestore(&device_domain_lock, flags);
2660                 free_devinfo_mem(info);
2661                 return NULL;
2662         }
2663
2664         list_add(&info->link, &domain->devices);
2665         list_add(&info->global, &device_domain_list);
2666         if (dev)
2667                 dev->archdata.iommu = info;
2668         spin_unlock_irqrestore(&device_domain_lock, flags);
2669
2670         /* PASID table is mandatory for a PCI device in scalable mode. */
2671         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2672                 ret = intel_pasid_alloc_table(dev);
2673                 if (ret) {
2674                         dev_err(dev, "PASID table allocation failed\n");
2675                         dmar_remove_one_dev_info(dev);
2676                         return NULL;
2677                 }
2678
2679                 /* Setup the PASID entry for requests without PASID: */
2680                 spin_lock(&iommu->lock);
2681                 if (hw_pass_through && domain_type_is_si(domain))
2682                         ret = intel_pasid_setup_pass_through(iommu, domain,
2683                                         dev, PASID_RID2PASID);
2684                 else if (domain_use_first_level(domain))
2685                         ret = domain_setup_first_level(iommu, domain, dev,
2686                                         PASID_RID2PASID);
2687                 else
2688                         ret = intel_pasid_setup_second_level(iommu, domain,
2689                                         dev, PASID_RID2PASID);
2690                 spin_unlock(&iommu->lock);
2691                 if (ret) {
2692                         dev_err(dev, "Setup RID2PASID failed\n");
2693                         dmar_remove_one_dev_info(dev);
2694                         return NULL;
2695                 }
2696         }
2697
2698         if (dev && domain_context_mapping(domain, dev)) {
2699                 dev_err(dev, "Domain context map failed\n");
2700                 dmar_remove_one_dev_info(dev);
2701                 return NULL;
2702         }
2703
2704         return domain;
2705 }
2706
2707 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2708 {
2709         *(u16 *)opaque = alias;
2710         return 0;
2711 }
2712
2713 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2714 {
2715         struct device_domain_info *info;
2716         struct dmar_domain *domain = NULL;
2717         struct intel_iommu *iommu;
2718         u16 dma_alias;
2719         unsigned long flags;
2720         u8 bus, devfn;
2721
2722         iommu = device_to_iommu(dev, &bus, &devfn);
2723         if (!iommu)
2724                 return NULL;
2725
2726         if (dev_is_pci(dev)) {
2727                 struct pci_dev *pdev = to_pci_dev(dev);
2728
2729                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2730
2731                 spin_lock_irqsave(&device_domain_lock, flags);
2732                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2733                                                       PCI_BUS_NUM(dma_alias),
2734                                                       dma_alias & 0xff);
2735                 if (info) {
2736                         iommu = info->iommu;
2737                         domain = info->domain;
2738                 }
2739                 spin_unlock_irqrestore(&device_domain_lock, flags);
2740
2741                 /* DMA alias already has a domain, use it */
2742                 if (info)
2743                         goto out;
2744         }
2745
2746         /* Allocate and initialize new domain for the device */
2747         domain = alloc_domain(0);
2748         if (!domain)
2749                 return NULL;
2750         if (domain_init(domain, iommu, gaw)) {
2751                 domain_exit(domain);
2752                 return NULL;
2753         }
2754
2755 out:
2756         return domain;
2757 }
2758
2759 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2760                                               struct dmar_domain *domain)
2761 {
2762         struct intel_iommu *iommu;
2763         struct dmar_domain *tmp;
2764         u16 req_id, dma_alias;
2765         u8 bus, devfn;
2766
2767         iommu = device_to_iommu(dev, &bus, &devfn);
2768         if (!iommu)
2769                 return NULL;
2770
2771         req_id = ((u16)bus << 8) | devfn;
2772
2773         if (dev_is_pci(dev)) {
2774                 struct pci_dev *pdev = to_pci_dev(dev);
2775
2776                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2777
2778                 /* register PCI DMA alias device */
2779                 if (req_id != dma_alias) {
2780                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2781                                         dma_alias & 0xff, NULL, domain);
2782
2783                         if (!tmp || tmp != domain)
2784                                 return tmp;
2785                 }
2786         }
2787
2788         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2789         if (!tmp || tmp != domain)
2790                 return tmp;
2791
2792         return domain;
2793 }
2794
2795 static int iommu_domain_identity_map(struct dmar_domain *domain,
2796                                      unsigned long long start,
2797                                      unsigned long long end)
2798 {
2799         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2800         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2801
2802         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2803                           dma_to_mm_pfn(last_vpfn))) {
2804                 pr_err("Reserving iova failed\n");
2805                 return -ENOMEM;
2806         }
2807
2808         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2809         /*
2810          * RMRR range might have overlap with physical memory range,
2811          * clear it first
2812          */
2813         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2814
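        /* Map the range 1:1 - the IOVA pfn equals the physical pfn. */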
2815         return __domain_mapping(domain, first_vpfn, NULL,
2816                                 first_vpfn, last_vpfn - first_vpfn + 1,
2817                                 DMA_PTE_READ|DMA_PTE_WRITE);
2818 }
2819
2820 static int domain_prepare_identity_map(struct device *dev,
2821                                        struct dmar_domain *domain,
2822                                        unsigned long long start,
2823                                        unsigned long long end)
2824 {
2825         /* For _hardware_ passthrough, don't bother. But for software
2826            passthrough, we do it anyway -- it may indicate a memory
2827            range which is reserved in E820 and so didn't get set
2828            up in si_domain to start with */
2829         if (domain == si_domain && hw_pass_through) {
2830                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2831                          start, end);
2832                 return 0;
2833         }
2834
2835         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2836
2837         if (end < start) {
2838                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2839                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2840                      dmi_get_system_info(DMI_BIOS_VENDOR),
2841                      dmi_get_system_info(DMI_BIOS_VERSION),
2842                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2843                 return -EIO;
2844         }
2845
2846         if (end >> agaw_to_width(domain->agaw)) {
2847                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2848                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2849                      agaw_to_width(domain->agaw),
2850                      dmi_get_system_info(DMI_BIOS_VENDOR),
2851                      dmi_get_system_info(DMI_BIOS_VERSION),
2852                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2853                 return -EIO;
2854         }
2855
2856         return iommu_domain_identity_map(domain, start, end);
2857 }
2858
2859 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2860
2861 static int __init si_domain_init(int hw)
2862 {
2863         struct dmar_rmrr_unit *rmrr;
2864         struct device *dev;
2865         int i, nid, ret;
2866
2867         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2868         if (!si_domain)
2869                 return -EFAULT;
2870
2871         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2872                 domain_exit(si_domain);
2873                 return -EFAULT;
2874         }
2875
2876         if (hw)
2877                 return 0;
2878
2879         for_each_online_node(nid) {
2880                 unsigned long start_pfn, end_pfn;
2881                 int i;
2882
2883                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2884                         ret = iommu_domain_identity_map(si_domain,
2885                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2886                         if (ret)
2887                                 return ret;
2888                 }
2889         }
2890
2891         /*
2892          * Identity map the RMRRs so that devices with RMRRs can also use
2893          * the si_domain.
2894          */
2895         for_each_rmrr_units(rmrr) {
2896                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2897                                           i, dev) {
2898                         unsigned long long start = rmrr->base_address;
2899                         unsigned long long end = rmrr->end_address;
2900
2901                         if (WARN_ON(end < start ||
2902                                     end >> agaw_to_width(si_domain->agaw)))
2903                                 continue;
2904
2905                         ret = iommu_domain_identity_map(si_domain, start, end);
2906                         if (ret)
2907                                 return ret;
2908                 }
2909         }
2910
2911         return 0;
2912 }
2913
2914 static int identity_mapping(struct device *dev)
2915 {
2916         struct device_domain_info *info;
2917
2918         info = dev->archdata.iommu;
2919         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2920                 return (info->domain == si_domain);
2921
2922         return 0;
2923 }
2924
2925 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2926 {
2927         struct dmar_domain *ndomain;
2928         struct intel_iommu *iommu;
2929         u8 bus, devfn;
2930
2931         iommu = device_to_iommu(dev, &bus, &devfn);
2932         if (!iommu)
2933                 return -ENODEV;
2934
2935         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2936         if (ndomain != domain)
2937                 return -EBUSY;
2938
2939         return 0;
2940 }
2941
2942 static bool device_has_rmrr(struct device *dev)
2943 {
2944         struct dmar_rmrr_unit *rmrr;
2945         struct device *tmp;
2946         int i;
2947
2948         rcu_read_lock();
2949         for_each_rmrr_units(rmrr) {
2950                 /*
2951                  * Return TRUE if this RMRR contains the device that
2952                  * is passed in.
2953                  */
2954                 for_each_active_dev_scope(rmrr->devices,
2955                                           rmrr->devices_cnt, i, tmp)
2956                         if (tmp == dev ||
2957                             is_downstream_to_pci_bridge(dev, tmp)) {
2958                                 rcu_read_unlock();
2959                                 return true;
2960                         }
2961         }
2962         rcu_read_unlock();
2963         return false;
2964 }
2965
2966 /**
2967  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2968  * is relaxable (i.e. is allowed to not be enforced under some conditions)
2969  * @dev: device handle
2970  *
2971  * We assume that PCI USB devices with RMRRs have them largely
2972  * for historical reasons and that the RMRR space is not actively used post
2973  * boot.  This exclusion may change if vendors begin to abuse it.
2974  *
2975  * The same exception is made for graphics devices, with the requirement that
2976  * any use of the RMRR regions will be torn down before assigning the device
2977  * to a guest.
2978  *
2979  * Return: true if the RMRR is relaxable, false otherwise
2980  */
2981 static bool device_rmrr_is_relaxable(struct device *dev)
2982 {
2983         struct pci_dev *pdev;
2984
2985         if (!dev_is_pci(dev))
2986                 return false;
2987
2988         pdev = to_pci_dev(dev);
2989         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2990                 return true;
2991         else
2992                 return false;
2993 }
2994
2995 /*
2996  * There are a couple cases where we need to restrict the functionality of
2997  * devices associated with RMRRs.  The first is when evaluating a device for
2998  * identity mapping because problems exist when devices are moved in and out
2999  * of domains and their respective RMRR information is lost.  This means that
3000  * a device with associated RMRRs will never be in a "passthrough" domain.
3001  * The second is use of the device through the IOMMU API.  This interface
3002  * expects to have full control of the IOVA space for the device.  We cannot
3003  * satisfy both the requirement that RMRR access is maintained and have an
3004  * unencumbered IOVA space.  We also have no ability to quiesce the device's
3005  * use of the RMRR space or even inform the IOMMU API user of the restriction.
3006  * We therefore prevent devices associated with an RMRR from participating in
3007  * the IOMMU API, which eliminates them from device assignment.
3008  *
3009  * In both cases, devices which have relaxable RMRRs are not subject to this
3010  * restriction. See device_rmrr_is_relaxable comment.
3011  */
3012 static bool device_is_rmrr_locked(struct device *dev)
3013 {
3014         if (!device_has_rmrr(dev))
3015                 return false;
3016
3017         if (device_rmrr_is_relaxable(dev))
3018                 return false;
3019
3020         return true;
3021 }
3022
3023 /*
3024  * Return the required default domain type for a specific device.
3025  *
3026  * @dev: the device to query
3028  *
3029  * Returns:
3030  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
3031  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
3032  *  - 0: both identity and dynamic domains work for this device
3033  */
3034 static int device_def_domain_type(struct device *dev)
3035 {
3036         if (dev_is_pci(dev)) {
3037                 struct pci_dev *pdev = to_pci_dev(dev);
3038
3039                 /*
3040                  * Prevent any device marked as untrusted from getting
3041                  * placed into the static identity mapping domain.
3042                  */
3043                 if (pdev->untrusted)
3044                         return IOMMU_DOMAIN_DMA;
3045
3046                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
3047                         return IOMMU_DOMAIN_IDENTITY;
3048
3049                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
3050                         return IOMMU_DOMAIN_IDENTITY;
3051
3052                 /*
3053                  * We want to start off with all devices in the 1:1 domain, and
3054                  * take them out later if we find they can't access all of memory.
3055                  *
3056                  * However, we can't do this for PCI devices behind bridges,
3057                  * because all PCI devices behind the same bridge will end up
3058                  * with the same source-id on their transactions.
3059                  *
3060                  * Practically speaking, we can't change things around for these
3061                  * devices at run-time, because we can't be sure there'll be no
3062                  * DMA transactions in flight for any of their siblings.
3063                  *
3064                  * So PCI devices (unless they're on the root bus) as well as
3065                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
3066                  * the 1:1 domain, just in _case_ one of their siblings turns out
3067                  * not to be able to map all of memory.
3068                  */
3069                 if (!pci_is_pcie(pdev)) {
3070                         if (!pci_is_root_bus(pdev->bus))
3071                                 return IOMMU_DOMAIN_DMA;
3072                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
3073                                 return IOMMU_DOMAIN_DMA;
3074                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
3075                         return IOMMU_DOMAIN_DMA;
3076         }
3077
3078         return 0;
3079 }
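
/*
 * A minimal sketch (hypothetical helper, not used by this driver) of how the
 * tristate above could be consumed when choosing a default domain: a zero
 * return means "no constraint", so the caller falls back to whatever global
 * default policy is in effect.
 */
static inline int example_pick_default_domain(struct device *dev,
					      int global_default)
{
	int type = device_def_domain_type(dev);

	return type ? type : global_default;
}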
3080
3081 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3082 {
3083         /*
3084          * Start from a sane IOMMU hardware state.
3085          * If queued invalidation was already initialized by us
3086          * (for example, while enabling interrupt remapping), then
3087          * things are already rolling from a sane state.
3088          */
3089         if (!iommu->qi) {
3090                 /*
3091                  * Clear any previous faults.
3092                  */
3093                 dmar_fault(-1, iommu);
3094                 /*
3095                  * Disable queued invalidation if supported and already enabled
3096                  * before OS handover.
3097                  */
3098                 dmar_disable_qi(iommu);
3099         }
3100
3101         if (dmar_enable_qi(iommu)) {
3102                 /*
3103                  * Queued invalidation is not enabled; use register-based invalidation.
3104                  */
3105                 iommu->flush.flush_context = __iommu_flush_context;
3106                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3107                 pr_info("%s: Using Register based invalidation\n",
3108                         iommu->name);
3109         } else {
3110                 iommu->flush.flush_context = qi_flush_context;
3111                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3112                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3113         }
3114 }
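
/*
 * A minimal sketch (illustrative only) of how the flush hooks installed above
 * are used: callers issue global invalidations through the function pointers
 * without caring whether queued or register-based invalidation was selected,
 * exactly as init_dmars() does further below.
 */
static inline void example_global_invalidate(struct intel_iommu *iommu)
{
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}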
3115
3116 static int copy_context_table(struct intel_iommu *iommu,
3117                               struct root_entry *old_re,
3118                               struct context_entry **tbl,
3119                               int bus, bool ext)
3120 {
3121         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3122         struct context_entry *new_ce = NULL, ce;
3123         struct context_entry *old_ce = NULL;
3124         struct root_entry re;
3125         phys_addr_t old_ce_phys;
3126
3127         tbl_idx = ext ? bus * 2 : bus;
3128         memcpy(&re, old_re, sizeof(re));
3129
3130         for (devfn = 0; devfn < 256; devfn++) {
3131                 /* First calculate the correct index */
3132                 idx = (ext ? devfn * 2 : devfn) % 256;
3133
3134                 if (idx == 0) {
3135                         /* First save what we may have and clean up */
3136                         if (new_ce) {
3137                                 tbl[tbl_idx] = new_ce;
3138                                 __iommu_flush_cache(iommu, new_ce,
3139                                                     VTD_PAGE_SIZE);
3140                                 pos = 1;
3141                         }
3142
3143                         if (old_ce)
3144                                 memunmap(old_ce);
3145
3146                         ret = 0;
3147                         if (devfn < 0x80)
3148                                 old_ce_phys = root_entry_lctp(&re);
3149                         else
3150                                 old_ce_phys = root_entry_uctp(&re);
3151
3152                         if (!old_ce_phys) {
3153                                 if (ext && devfn == 0) {
3154                                         /* No LCTP, try UCTP */
3155                                         devfn = 0x7f;
3156                                         continue;
3157                                 } else {
3158                                         goto out;
3159                                 }
3160                         }
3161
3162                         ret = -ENOMEM;
3163                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3164                                         MEMREMAP_WB);
3165                         if (!old_ce)
3166                                 goto out;
3167
3168                         new_ce = alloc_pgtable_page(iommu->node);
3169                         if (!new_ce)
3170                                 goto out_unmap;
3171
3172                         ret = 0;
3173                 }
3174
3175                 /* Now copy the context entry */
3176                 memcpy(&ce, old_ce + idx, sizeof(ce));
3177
3178                 if (!__context_present(&ce))
3179                         continue;
3180
3181                 did = context_domain_id(&ce);
3182                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3183                         set_bit(did, iommu->domain_ids);
3184
3185                 /*
3186                  * We need a marker for copied context entries. This
3187                  * marker needs to work for the old format as well as
3188                  * for extended context entries.
3189                  *
3190                  * Bit 67 of the context entry is used. In the old
3191                  * format this bit is available to software, in the
3192                  * extended format it is the PGE bit, but PGE is ignored
3193                  * by HW if PASIDs are disabled (and thus still
3194                  * available).
3195                  *
3196                  * So disable PASIDs first and then mark the entry
3197                  * copied. This means that we don't copy PASID
3198                  * translations from the old kernel, but this is fine as
3199                  * faults there are not fatal.
3200                  */
3201                 context_clear_pasid_enable(&ce);
3202                 context_set_copied(&ce);
3203
3204                 new_ce[idx] = ce;
3205         }
3206
3207         tbl[tbl_idx + pos] = new_ce;
3208
3209         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3210
3211 out_unmap:
3212         memunmap(old_ce);
3213
3214 out:
3215         return ret;
3216 }
3217
3218 static int copy_translation_tables(struct intel_iommu *iommu)
3219 {
3220         struct context_entry **ctxt_tbls;
3221         struct root_entry *old_rt;
3222         phys_addr_t old_rt_phys;
3223         int ctxt_table_entries;
3224         unsigned long flags;
3225         u64 rtaddr_reg;
3226         int bus, ret;
3227         bool new_ext, ext;
3228
3229         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3230         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3231         new_ext    = !!ecap_ecs(iommu->ecap);
3232
3233         /*
3234          * The RTT bit can only be changed when translation is disabled,
3235          * but disabling translation would open a window for data
3236          * corruption. So bail out and don't copy anything if we would
3237          * have to change the bit.
3238          */
3239         if (new_ext != ext)
3240                 return -EINVAL;
3241
3242         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3243         if (!old_rt_phys)
3244                 return -EINVAL;
3245
3246         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3247         if (!old_rt)
3248                 return -ENOMEM;
3249
3250         /* This is too big for the stack - allocate it from slab */
3251         ctxt_table_entries = ext ? 512 : 256;
3252         ret = -ENOMEM;
3253         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3254         if (!ctxt_tbls)
3255                 goto out_unmap;
3256
3257         for (bus = 0; bus < 256; bus++) {
3258                 ret = copy_context_table(iommu, &old_rt[bus],
3259                                          ctxt_tbls, bus, ext);
3260                 if (ret) {
3261                         pr_err("%s: Failed to copy context table for bus %d\n",
3262                                 iommu->name, bus);
3263                         continue;
3264                 }
3265         }
3266
3267         spin_lock_irqsave(&iommu->lock, flags);
3268
3269         /* Context tables are copied, now write them to the root_entry table */
3270         for (bus = 0; bus < 256; bus++) {
3271                 int idx = ext ? bus * 2 : bus;
3272                 u64 val;
3273
3274                 if (ctxt_tbls[idx]) {
3275                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3276                         iommu->root_entry[bus].lo = val;
3277                 }
3278
3279                 if (!ext || !ctxt_tbls[idx + 1])
3280                         continue;
3281
3282                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3283                 iommu->root_entry[bus].hi = val;
3284         }
3285
3286         spin_unlock_irqrestore(&iommu->lock, flags);
3287
3288         kfree(ctxt_tbls);
3289
3290         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3291
3292         ret = 0;
3293
3294 out_unmap:
3295         memunmap(old_rt);
3296
3297         return ret;
3298 }
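
/*
 * A minimal sketch (illustrative only) of the root-entry encoding written
 * above: bit 0 is the present bit and the remaining bits hold the physical
 * address of the (page-aligned) context table.
 */
static inline u64 example_root_entry_value(struct context_entry *ctxt_tbl)
{
	return virt_to_phys(ctxt_tbl) | 1;
}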
3299
3300 static int __init init_dmars(void)
3301 {
3302         struct dmar_drhd_unit *drhd;
3303         struct intel_iommu *iommu;
3304         int ret;
3305
3306         /*
3307          * for each drhd
3308          *    allocate root
3309          *    initialize and program root entry to not present
3310          * endfor
3311          */
3312         for_each_drhd_unit(drhd) {
3313                 /*
3314                  * No lock is needed: this counter is only incremented in
3315                  * the single-threaded kernel __init code path; all other
3316                  * accesses are read-only.
3317                  */
3318                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3319                         g_num_of_iommus++;
3320                         continue;
3321                 }
3322                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3323         }
3324
3325         /* Preallocate enough resources for IOMMU hot-addition */
3326         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3327                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3328
3329         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3330                         GFP_KERNEL);
3331         if (!g_iommus) {
3332                 pr_err("Allocating global iommu array failed\n");
3333                 ret = -ENOMEM;
3334                 goto error;
3335         }
3336
3337         for_each_iommu(iommu, drhd) {
3338                 if (drhd->ignored) {
3339                         iommu_disable_translation(iommu);
3340                         continue;
3341                 }
3342
3343                 /*
3344                  * Find the max PASID size of all IOMMUs in the system.
3345                  * We need to ensure the system PASID table is no bigger
3346                  * than the smallest supported.
3347                  */
3348                 if (pasid_supported(iommu)) {
3349                         u32 temp = 2 << ecap_pss(iommu->ecap);
3350
3351                         intel_pasid_max_id = min_t(u32, temp,
3352                                                    intel_pasid_max_id);
3353                 }
3354
3355                 g_iommus[iommu->seq_id] = iommu;
3356
3357                 intel_iommu_init_qi(iommu);
3358
3359                 ret = iommu_init_domains(iommu);
3360                 if (ret)
3361                         goto free_iommu;
3362
3363                 init_translation_status(iommu);
3364
3365                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3366                         iommu_disable_translation(iommu);
3367                         clear_translation_pre_enabled(iommu);
3368                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3369                                 iommu->name);
3370                 }
3371
3372                 /*
3373                  * TBD:
3374                  * we could share the same root & context tables
3375                  * among all IOMMUs. Needs to be split out later.
3376                  */
3377                 ret = iommu_alloc_root_entry(iommu);
3378                 if (ret)
3379                         goto free_iommu;
3380
3381                 if (translation_pre_enabled(iommu)) {
3382                         pr_info("Translation already enabled - trying to copy translation structures\n");
3383
3384                         ret = copy_translation_tables(iommu);
3385                         if (ret) {
3386                                 /*
3387                                  * We found the IOMMU with translation
3388                                  * enabled - but failed to copy over the
3389                                  * old root-entry table. Try to proceed
3390                                  * by disabling translation now and
3391                                  * allocating a clean root-entry table.
3392                                  * This might cause DMAR faults, but
3393                                  * probably the dump will still succeed.
3394                                  */
3395                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3396                                        iommu->name);
3397                                 iommu_disable_translation(iommu);
3398                                 clear_translation_pre_enabled(iommu);
3399                         } else {
3400                                 pr_info("Copied translation tables from previous kernel for %s\n",
3401                                         iommu->name);
3402                         }
3403                 }
3404
3405                 if (!ecap_pass_through(iommu->ecap))
3406                         hw_pass_through = 0;
3407                 intel_svm_check(iommu);
3408         }
3409
3410         /*
3411          * Now that qi is enabled on all iommus, set the root entry and flush
3412          * caches. This is required on some Intel X58 chipsets, otherwise the
3413          * flush_context function will loop forever and the boot hangs.
3414          */
3415         for_each_active_iommu(iommu, drhd) {
3416                 iommu_flush_write_buffer(iommu);
3417                 iommu_set_root_entry(iommu);
3418                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3419                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3420         }
3421
3422 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3423         dmar_map_gfx = 0;
3424 #endif
3425
3426         if (!dmar_map_gfx)
3427                 iommu_identity_mapping |= IDENTMAP_GFX;
3428
3429         check_tylersburg_isoch();
3430
3431         ret = si_domain_init(hw_pass_through);
3432         if (ret)
3433                 goto free_iommu;
3434
3435         /*
3436          * for each drhd
3437          *   enable fault log
3438          *   global invalidate context cache
3439          *   global invalidate iotlb
3440          *   enable translation
3441          */
3442         for_each_iommu(iommu, drhd) {
3443                 if (drhd->ignored) {
3444                         /*
3445                          * we always have to disable PMRs or DMA may fail on
3446                          * this device
3447                          */
3448                         if (force_on)
3449                                 iommu_disable_protect_mem_regions(iommu);
3450                         continue;
3451                 }
3452
3453                 iommu_flush_write_buffer(iommu);
3454
3455 #ifdef CONFIG_INTEL_IOMMU_SVM
3456                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3457                         /*
3458                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3459                          * could cause a lock race condition, so drop the lock.
3460                          */
3461                         up_write(&dmar_global_lock);
3462                         ret = intel_svm_enable_prq(iommu);
3463                         down_write(&dmar_global_lock);
3464                         if (ret)
3465                                 goto free_iommu;
3466                 }
3467 #endif
3468                 ret = dmar_set_interrupt(iommu);
3469                 if (ret)
3470                         goto free_iommu;
3471         }
3472
3473         return 0;
3474
3475 free_iommu:
3476         for_each_active_iommu(iommu, drhd) {
3477                 disable_dmar_iommu(iommu);
3478                 free_dmar_iommu(iommu);
3479         }
3480
3481         kfree(g_iommus);
3482
3483 error:
3484         return ret;
3485 }
3486
3487 /* This takes a number of _MM_ pages, not VTD pages */
3488 static unsigned long intel_alloc_iova(struct device *dev,
3489                                      struct dmar_domain *domain,
3490                                      unsigned long nrpages, uint64_t dma_mask)
3491 {
3492         unsigned long iova_pfn;
3493
3494         /*
3495          * Restrict dma_mask to the width that the iommu can handle.
3496          * First-level translation restricts the input-address to a
3497          * canonical address (i.e., address bits 63:N have the same
3498          * value as address bit [N-1], where N is 48-bits with 4-level
3499          * paging and 57-bits with 5-level paging). Hence, skip bit
3500          * [N-1].
3501          */
3502         if (domain_use_first_level(domain))
3503                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw - 1),
3504                                  dma_mask);
3505         else
3506                 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw),
3507                                  dma_mask);
3508
3509         /* Ensure we reserve the whole size-aligned region */
3510         nrpages = __roundup_pow_of_two(nrpages);
3511
3512         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3513                 /*
3514                  * First try to allocate an io virtual address in
3515                  * DMA_BIT_MASK(32) and if that fails then try allocating
3516                  * from higher range
3517                  */
3518                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3519                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3520                 if (iova_pfn)
3521                         return iova_pfn;
3522         }
3523         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3524                                    IOVA_PFN(dma_mask), true);
3525         if (unlikely(!iova_pfn)) {
3526                 dev_err_once(dev, "Allocating %ld-page iova failed\n",
3527                              nrpages);
3528                 return 0;
3529         }
3530
3531         return iova_pfn;
3532 }
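
/*
 * A worked sketch (hypothetical helper) of the mask restriction above,
 * assuming first-level translation: with 4-level paging domain->gaw is 48,
 * so the usable range is capped at DOMAIN_MAX_ADDR(47), just below the
 * canonical hole that starts at bit 47.
 */
static inline u64 example_first_level_dma_mask(int gaw, u64 dev_dma_mask)
{
	return min_t(u64, DOMAIN_MAX_ADDR(gaw - 1), dev_dma_mask);
}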
3533
3534 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3535 {
3536         struct dmar_domain *domain, *tmp;
3537         struct dmar_rmrr_unit *rmrr;
3538         struct device *i_dev;
3539         int i, ret;
3540
3541         /* The device should not already be attached to any domain. */
3542         domain = find_domain(dev);
3543         if (domain)
3544                 return NULL;
3545
3546         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3547         if (!domain)
3548                 goto out;
3549
3550         /* We have a new domain - set up possible RMRRs for the device */
3551         rcu_read_lock();
3552         for_each_rmrr_units(rmrr) {
3553                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3554                                           i, i_dev) {
3555                         if (i_dev != dev)
3556                                 continue;
3557
3558                         ret = domain_prepare_identity_map(dev, domain,
3559                                                           rmrr->base_address,
3560                                                           rmrr->end_address);
3561                         if (ret)
3562                                 dev_err(dev, "Mapping reserved region failed\n");
3563                 }
3564         }
3565         rcu_read_unlock();
3566
3567         tmp = set_domain_for_dev(dev, domain);
3568         if (!tmp || domain != tmp) {
3569                 domain_exit(domain);
3570                 domain = tmp;
3571         }
3572
3573 out:
3574         if (!domain)
3575                 dev_err(dev, "Allocating domain failed\n");
3576         else
3577                 domain->domain.type = IOMMU_DOMAIN_DMA;
3578
3579         return domain;
3580 }
3581
3582 /* Check if the dev needs to go through the non-identity map and unmap process. */
3583 static bool iommu_need_mapping(struct device *dev)
3584 {
3585         int ret;
3586
3587         if (iommu_dummy(dev))
3588                 return false;
3589
3590         ret = identity_mapping(dev);
3591         if (ret) {
3592                 u64 dma_mask = *dev->dma_mask;
3593
3594                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3595                         dma_mask = dev->coherent_dma_mask;
3596
3597                 if (dma_mask >= dma_direct_get_required_mask(dev))
3598                         return false;
3599
3600                 /*
3601                  * The 32-bit DMA device is removed from the si_domain and
3602                  * falls back to a non-identity mapping.
3603                  */
3604                 dmar_remove_one_dev_info(dev);
3605                 ret = iommu_request_dma_domain_for_dev(dev);
3606                 if (ret) {
3607                         struct iommu_domain *domain;
3608                         struct dmar_domain *dmar_domain;
3609
3610                         domain = iommu_get_domain_for_dev(dev);
3611                         if (domain) {
3612                                 dmar_domain = to_dmar_domain(domain);
3613                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3614                         }
3615                         dmar_remove_one_dev_info(dev);
3616                         get_private_domain_for_dev(dev);
3617                 }
3618
3619                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3620         }
3621
3622         return true;
3623 }
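
/*
 * A minimal sketch (hypothetical helper) of the mask comparison above: a
 * device whose DMA mask covers all of memory can stay in the identity
 * domain and use direct DMA, while a 32-bit-only device cannot and is moved
 * to a dynamically mapped domain.
 */
static inline bool example_can_stay_identity(struct device *dev)
{
	u64 dma_mask = *dev->dma_mask;

	if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
		dma_mask = dev->coherent_dma_mask;

	return dma_mask >= dma_direct_get_required_mask(dev);
}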
3624
3625 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3626                                      size_t size, int dir, u64 dma_mask)
3627 {
3628         struct dmar_domain *domain;
3629         phys_addr_t start_paddr;
3630         unsigned long iova_pfn;
3631         int prot = 0;
3632         int ret;
3633         struct intel_iommu *iommu;
3634         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3635
3636         BUG_ON(dir == DMA_NONE);
3637
3638         domain = deferred_attach_domain(dev);
3639         if (!domain)
3640                 return DMA_MAPPING_ERROR;
3641
3642         iommu = domain_get_iommu(domain);
3643         size = aligned_nrpages(paddr, size);
3644
3645         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3646         if (!iova_pfn)
3647                 goto error;
3648
3649         /*
3650          * Check if DMAR supports zero-length reads on write-only
3651          * mappings.
3652          */
3653         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3654                         !cap_zlr(iommu->cap))
3655                 prot |= DMA_PTE_READ;
3656         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3657                 prot |= DMA_PTE_WRITE;
3658         /*
3659          * [paddr, paddr + size) might cover only part of a page, but we
3660          * should map the whole page.  Note: if two parts of one page are
3661          * mapped separately, we might end up with two IOVAs mapping to the
3662          * same host paddr, but this is not a big problem.
3663          */
3664         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3665                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3666         if (ret)
3667                 goto error;
3668
3669         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3670         start_paddr += paddr & ~PAGE_MASK;
3671
3672         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3673
3674         return start_paddr;
3675
3676 error:
3677         if (iova_pfn)
3678                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3679         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3680                 size, (unsigned long long)paddr, dir);
3681         return DMA_MAPPING_ERROR;
3682 }
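
/*
 * A worked sketch (illustrative only) of the offset handling above: whole
 * pages are mapped, and the returned bus address keeps the intra-page offset
 * of the original buffer, e.g. paddr 0x12345678 yields an IOVA ending in
 * 0x678.
 */
static inline dma_addr_t example_iova_with_offset(unsigned long iova_pfn,
						  phys_addr_t paddr)
{
	return ((dma_addr_t)iova_pfn << PAGE_SHIFT) + (paddr & ~PAGE_MASK);
}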
3683
3684 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3685                                  unsigned long offset, size_t size,
3686                                  enum dma_data_direction dir,
3687                                  unsigned long attrs)
3688 {
3689         if (iommu_need_mapping(dev))
3690                 return __intel_map_single(dev, page_to_phys(page) + offset,
3691                                 size, dir, *dev->dma_mask);
3692         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3693 }
3694
3695 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3696                                      size_t size, enum dma_data_direction dir,
3697                                      unsigned long attrs)
3698 {
3699         if (iommu_need_mapping(dev))
3700                 return __intel_map_single(dev, phys_addr, size, dir,
3701                                 *dev->dma_mask);
3702         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3703 }
3704
3705 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3706 {
3707         struct dmar_domain *domain;
3708         unsigned long start_pfn, last_pfn;
3709         unsigned long nrpages;
3710         unsigned long iova_pfn;
3711         struct intel_iommu *iommu;
3712         struct page *freelist;
3713         struct pci_dev *pdev = NULL;
3714
3715         domain = find_domain(dev);
3716         BUG_ON(!domain);
3717
3718         iommu = domain_get_iommu(domain);
3719
3720         iova_pfn = IOVA_PFN(dev_addr);
3721
3722         nrpages = aligned_nrpages(dev_addr, size);
3723         start_pfn = mm_to_dma_pfn(iova_pfn);
3724         last_pfn = start_pfn + nrpages - 1;
3725
3726         if (dev_is_pci(dev))
3727                 pdev = to_pci_dev(dev);
3728
3729         freelist = domain_unmap(domain, start_pfn, last_pfn);
3730         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3731                         !has_iova_flush_queue(&domain->iovad)) {
3732                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3733                                       nrpages, !freelist, 0);
3734                 /* free iova */
3735                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3736                 dma_free_pagelist(freelist);
3737         } else {
3738                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3739                            (unsigned long)freelist);
3740                 /*
3741                  * Queue up the release of the unmap to save roughly 1/6th of
3742                  * the CPU time otherwise spent on the IOTLB flush operation.
3743                  */
3744         }
3745
3746         trace_unmap_single(dev, dev_addr, size);
3747 }
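
/*
 * A minimal sketch (hypothetical helper) of the flush policy chosen above:
 * strict mode, untrusted devices and domains without a flush queue always
 * pay for a synchronous IOTLB flush; everything else batches the
 * invalidation via the IOVA flush queue.
 */
static inline bool example_needs_sync_flush(bool strict, bool untrusted,
					    bool have_flush_queue)
{
	return strict || untrusted || !have_flush_queue;
}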
3748
3749 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3750                              size_t size, enum dma_data_direction dir,
3751                              unsigned long attrs)
3752 {
3753         if (iommu_need_mapping(dev))
3754                 intel_unmap(dev, dev_addr, size);
3755         else
3756                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3757 }
3758
3759 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3760                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3761 {
3762         if (iommu_need_mapping(dev))
3763                 intel_unmap(dev, dev_addr, size);
3764 }
3765
3766 static void *intel_alloc_coherent(struct device *dev, size_t size,
3767                                   dma_addr_t *dma_handle, gfp_t flags,
3768                                   unsigned long attrs)
3769 {
3770         struct page *page = NULL;
3771         int order;
3772
3773         if (!iommu_need_mapping(dev))
3774                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3775
3776         size = PAGE_ALIGN(size);
3777         order = get_order(size);
3778
3779         if (gfpflags_allow_blocking(flags)) {
3780                 unsigned int count = size >> PAGE_SHIFT;
3781
3782                 page = dma_alloc_from_contiguous(dev, count, order,
3783                                                  flags & __GFP_NOWARN);
3784         }
3785
3786         if (!page)
3787                 page = alloc_pages(flags, order);
3788         if (!page)
3789                 return NULL;
3790         memset(page_address(page), 0, size);
3791
3792         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3793                                          DMA_BIDIRECTIONAL,
3794                                          dev->coherent_dma_mask);
3795         if (*dma_handle != DMA_MAPPING_ERROR)
3796                 return page_address(page);
3797         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3798                 __free_pages(page, order);
3799
3800         return NULL;
3801 }
3802
3803 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3804                                 dma_addr_t dma_handle, unsigned long attrs)
3805 {
3806         int order;
3807         struct page *page = virt_to_page(vaddr);
3808
3809         if (!iommu_need_mapping(dev))
3810                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3811
3812         size = PAGE_ALIGN(size);
3813         order = get_order(size);
3814
3815         intel_unmap(dev, dma_handle, size);
3816         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3817                 __free_pages(page, order);
3818 }
3819
3820 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3821                            int nelems, enum dma_data_direction dir,
3822                            unsigned long attrs)
3823 {
3824         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3825         unsigned long nrpages = 0;
3826         struct scatterlist *sg;
3827         int i;
3828
3829         if (!iommu_need_mapping(dev))
3830                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3831
3832         for_each_sg(sglist, sg, nelems, i) {
3833                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3834         }
3835
3836         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3837
3838         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3839 }
3840
3841 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3842                         enum dma_data_direction dir, unsigned long attrs)
3843 {
3844         int i;
3845         struct dmar_domain *domain;
3846         size_t size = 0;
3847         int prot = 0;
3848         unsigned long iova_pfn;
3849         int ret;
3850         struct scatterlist *sg;
3851         unsigned long start_vpfn;
3852         struct intel_iommu *iommu;
3853
3854         BUG_ON(dir == DMA_NONE);
3855         if (!iommu_need_mapping(dev))
3856                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3857
3858         domain = deferred_attach_domain(dev);
3859         if (!domain)
3860                 return 0;
3861
3862         iommu = domain_get_iommu(domain);
3863
3864         for_each_sg(sglist, sg, nelems, i)
3865                 size += aligned_nrpages(sg->offset, sg->length);
3866
3867         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3868                                 *dev->dma_mask);
3869         if (!iova_pfn) {
3870                 sglist->dma_length = 0;
3871                 return 0;
3872         }
3873
3874         /*
3875          * Check if DMAR supports zero-length reads on write-only
3876          * mappings.
3877          */
3878         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3879                         !cap_zlr(iommu->cap))
3880                 prot |= DMA_PTE_READ;
3881         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3882                 prot |= DMA_PTE_WRITE;
3883
3884         start_vpfn = mm_to_dma_pfn(iova_pfn);
3885
3886         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3887         if (unlikely(ret)) {
3888                 dma_pte_free_pagetable(domain, start_vpfn,
3889                                        start_vpfn + size - 1,
3890                                        agaw_to_level(domain->agaw) + 1);
3891                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3892                 return 0;
3893         }
3894
3895         for_each_sg(sglist, sg, nelems, i)
3896                 trace_map_sg(dev, i + 1, nelems, sg);
3897
3898         return nelems;
3899 }
3900
3901 static u64 intel_get_required_mask(struct device *dev)
3902 {
3903         if (!iommu_need_mapping(dev))
3904                 return dma_direct_get_required_mask(dev);
3905         return DMA_BIT_MASK(32);
3906 }
3907
3908 static const struct dma_map_ops intel_dma_ops = {
3909         .alloc = intel_alloc_coherent,
3910         .free = intel_free_coherent,
3911         .map_sg = intel_map_sg,
3912         .unmap_sg = intel_unmap_sg,
3913         .map_page = intel_map_page,
3914         .unmap_page = intel_unmap_page,
3915         .map_resource = intel_map_resource,
3916         .unmap_resource = intel_unmap_resource,
3917         .dma_supported = dma_direct_supported,
3918         .mmap = dma_common_mmap,
3919         .get_sgtable = dma_common_get_sgtable,
3920         .get_required_mask = intel_get_required_mask,
3921 };
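
/*
 * A minimal sketch (hypothetical buffer and length) of how a driver reaches
 * the ops above through the generic DMA API: the DMA core routes
 * dma_map_single() to .map_page and dma_unmap_single() to .unmap_page.
 */
static inline int example_dma_api_usage(struct device *dev, void *buf,
					size_t len)
{
	dma_addr_t handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... the device would DMA from 'handle' here ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}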
3922
3923 static void
3924 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3925                    enum dma_data_direction dir, enum dma_sync_target target)
3926 {
3927         struct dmar_domain *domain;
3928         phys_addr_t tlb_addr;
3929
3930         domain = find_domain(dev);
3931         if (WARN_ON(!domain))
3932                 return;
3933
3934         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3935         if (is_swiotlb_buffer(tlb_addr))
3936                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3937 }
3938
3939 static dma_addr_t
3940 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3941                   enum dma_data_direction dir, unsigned long attrs,
3942                   u64 dma_mask)
3943 {
3944         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3945         struct dmar_domain *domain;
3946         struct intel_iommu *iommu;
3947         unsigned long iova_pfn;
3948         unsigned long nrpages;
3949         phys_addr_t tlb_addr;
3950         int prot = 0;
3951         int ret;
3952
3953         domain = deferred_attach_domain(dev);
3954         if (WARN_ON(dir == DMA_NONE || !domain))
3955                 return DMA_MAPPING_ERROR;
3956
3957         iommu = domain_get_iommu(domain);
3958         if (WARN_ON(!iommu))
3959                 return DMA_MAPPING_ERROR;
3960
3961         nrpages = aligned_nrpages(0, size);
3962         iova_pfn = intel_alloc_iova(dev, domain,
3963                                     dma_to_mm_pfn(nrpages), dma_mask);
3964         if (!iova_pfn)
3965                 return DMA_MAPPING_ERROR;
3966
3967         /*
3968          * Check if DMAR supports zero-length reads on write-only
3969          * mappings.
3970          */
3971         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3972                         !cap_zlr(iommu->cap))
3973                 prot |= DMA_PTE_READ;
3974         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3975                 prot |= DMA_PTE_WRITE;
3976
3977         /*
3978          * If both the physical buffer start address and size are
3979          * page aligned, we don't need to use a bounce page.
3980          */
3981         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3982                 tlb_addr = swiotlb_tbl_map_single(dev,
3983                                 __phys_to_dma(dev, io_tlb_start),
3984                                 paddr, size, aligned_size, dir, attrs);
3985                 if (tlb_addr == DMA_MAPPING_ERROR) {
3986                         goto swiotlb_error;
3987                 } else {
3988                         /* Clean up the padding area. */
3989                         void *padding_start = phys_to_virt(tlb_addr);
3990                         size_t padding_size = aligned_size;
3991
3992                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3993                             (dir == DMA_TO_DEVICE ||
3994                              dir == DMA_BIDIRECTIONAL)) {
3995                                 padding_start += size;
3996                                 padding_size -= size;
3997                         }
3998
3999                         memset(padding_start, 0, padding_size);
4000                 }
4001         } else {
4002                 tlb_addr = paddr;
4003         }
4004
4005         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
4006                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
4007         if (ret)
4008                 goto mapping_error;
4009
4010         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
4011
4012         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
4013
4014 mapping_error:
4015         if (is_swiotlb_buffer(tlb_addr))
4016                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4017                                          aligned_size, dir, attrs);
4018 swiotlb_error:
4019         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
4020         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
4021                 size, (unsigned long long)paddr, dir);
4022
4023         return DMA_MAPPING_ERROR;
4024 }
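
/*
 * A worked sketch (hypothetical buffers) of the bounce decision above: only
 * buffers that do not both start and end on VT-d page boundaries are
 * redirected through the swiotlb bounce buffer, e.g. paddr 0x1000 with size
 * 0x1000 maps directly, while size 0x80 bounces.
 */
static inline bool example_needs_bounce(phys_addr_t paddr, size_t size)
{
	return !IS_ALIGNED(paddr | size, VTD_PAGE_SIZE);
}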
4025
4026 static void
4027 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
4028                     enum dma_data_direction dir, unsigned long attrs)
4029 {
4030         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
4031         struct dmar_domain *domain;
4032         phys_addr_t tlb_addr;
4033
4034         domain = find_domain(dev);
4035         if (WARN_ON(!domain))
4036                 return;
4037
4038         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
4039         if (WARN_ON(!tlb_addr))
4040                 return;
4041
4042         intel_unmap(dev, dev_addr, size);
4043         if (is_swiotlb_buffer(tlb_addr))
4044                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
4045                                          aligned_size, dir, attrs);
4046
4047         trace_bounce_unmap_single(dev, dev_addr, size);
4048 }
4049
4050 static dma_addr_t
4051 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
4052                 size_t size, enum dma_data_direction dir, unsigned long attrs)
4053 {
4054         return bounce_map_single(dev, page_to_phys(page) + offset,
4055                                  size, dir, attrs, *dev->dma_mask);
4056 }
4057
4058 static dma_addr_t
4059 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
4060                     enum dma_data_direction dir, unsigned long attrs)
4061 {
4062         return bounce_map_single(dev, phys_addr, size,
4063                                  dir, attrs, *dev->dma_mask);
4064 }
4065
4066 static void
4067 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
4068                   enum dma_data_direction dir, unsigned long attrs)
4069 {
4070         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4071 }
4072
4073 static void
4074 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
4075                       enum dma_data_direction dir, unsigned long attrs)
4076 {
4077         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
4078 }
4079
4080 static void
4081 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4082                 enum dma_data_direction dir, unsigned long attrs)
4083 {
4084         struct scatterlist *sg;
4085         int i;
4086
4087         for_each_sg(sglist, sg, nelems, i)
4088                 bounce_unmap_page(dev, sg->dma_address,
4089                                   sg_dma_len(sg), dir, attrs);
4090 }
4091
4092 static int
4093 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
4094               enum dma_data_direction dir, unsigned long attrs)
4095 {
4096         int i;
4097         struct scatterlist *sg;
4098
4099         for_each_sg(sglist, sg, nelems, i) {
4100                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
4101                                                   sg->offset, sg->length,
4102                                                   dir, attrs);
4103                 if (sg->dma_address == DMA_MAPPING_ERROR)
4104                         goto out_unmap;
4105                 sg_dma_len(sg) = sg->length;
4106         }
4107
4108         for_each_sg(sglist, sg, nelems, i)
4109                 trace_bounce_map_sg(dev, i + 1, nelems, sg);
4110
4111         return nelems;
4112
4113 out_unmap:
4114         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
4115         return 0;
4116 }
4117
4118 static void
4119 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
4120                            size_t size, enum dma_data_direction dir)
4121 {
4122         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
4123 }
4124
4125 static void
4126 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4127                               size_t size, enum dma_data_direction dir)
4128 {
4129         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4130 }
4131
4132 static void
4133 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4134                        int nelems, enum dma_data_direction dir)
4135 {
4136         struct scatterlist *sg;
4137         int i;
4138
4139         for_each_sg(sglist, sg, nelems, i)
4140                 bounce_sync_single(dev, sg_dma_address(sg),
4141                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4142 }
4143
4144 static void
4145 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4146                           int nelems, enum dma_data_direction dir)
4147 {
4148         struct scatterlist *sg;
4149         int i;
4150
4151         for_each_sg(sglist, sg, nelems, i)
4152                 bounce_sync_single(dev, sg_dma_address(sg),
4153                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4154 }
4155
4156 static const struct dma_map_ops bounce_dma_ops = {
4157         .alloc                  = intel_alloc_coherent,
4158         .free                   = intel_free_coherent,
4159         .map_sg                 = bounce_map_sg,
4160         .unmap_sg               = bounce_unmap_sg,
4161         .map_page               = bounce_map_page,
4162         .unmap_page             = bounce_unmap_page,
4163         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4164         .sync_single_for_device = bounce_sync_single_for_device,
4165         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4166         .sync_sg_for_device     = bounce_sync_sg_for_device,
4167         .map_resource           = bounce_map_resource,
4168         .unmap_resource         = bounce_unmap_resource,
4169         .dma_supported          = dma_direct_supported,
4170 };
4171
4172 static inline int iommu_domain_cache_init(void)
4173 {
4174         int ret = 0;
4175
4176         iommu_domain_cache = kmem_cache_create("iommu_domain",
4177                                          sizeof(struct dmar_domain),
4178                                          0,
4179                                          SLAB_HWCACHE_ALIGN,
4180                                          NULL);
4182         if (!iommu_domain_cache) {
4183                 pr_err("Couldn't create iommu_domain cache\n");
4184                 ret = -ENOMEM;
4185         }
4186
4187         return ret;
4188 }
4189
4190 static inline int iommu_devinfo_cache_init(void)
4191 {
4192         int ret = 0;
4193
4194         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4195                                          sizeof(struct device_domain_info),
4196                                          0,
4197                                          SLAB_HWCACHE_ALIGN,
4198                                          NULL);
4199         if (!iommu_devinfo_cache) {
4200                 pr_err("Couldn't create devinfo cache\n");
4201                 ret = -ENOMEM;
4202         }
4203
4204         return ret;
4205 }
4206
4207 static int __init iommu_init_mempool(void)
4208 {
4209         int ret;
4210         ret = iova_cache_get();
4211         if (ret)
4212                 return ret;
4213
4214         ret = iommu_domain_cache_init();
4215         if (ret)
4216                 goto domain_error;
4217
4218         ret = iommu_devinfo_cache_init();
4219         if (!ret)
4220                 return ret;
4221
4222         kmem_cache_destroy(iommu_domain_cache);
4223 domain_error:
4224         iova_cache_put();
4225
4226         return -ENOMEM;
4227 }
4228
4229 static void __init iommu_exit_mempool(void)
4230 {
4231         kmem_cache_destroy(iommu_devinfo_cache);
4232         kmem_cache_destroy(iommu_domain_cache);
4233         iova_cache_put();
4234 }
4235
4236 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4237 {
4238         struct dmar_drhd_unit *drhd;
4239         u32 vtbar;
4240         int rc;
4241
4242         /* We know that this device on this chipset has its own IOMMU.
4243          * If we find it under a different IOMMU, then the BIOS is lying
4244          * to us. Hope that the IOMMU for this device is actually
4245          * disabled, and it needs no translation...
4246          */
4247         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4248         if (rc) {
4249                 /* "can't" happen */
4250                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4251                 return;
4252         }
4253         vtbar &= 0xffff0000;
4254
4255         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
4256         drhd = dmar_find_matched_drhd_unit(pdev);
4257         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4258                             TAINT_FIRMWARE_WORKAROUND,
4259                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4260                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4261 }
4262 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4263
4264 static void __init init_no_remapping_devices(void)
4265 {
4266         struct dmar_drhd_unit *drhd;
4267         struct device *dev;
4268         int i;
4269
4270         for_each_drhd_unit(drhd) {
4271                 if (!drhd->include_all) {
4272                         for_each_active_dev_scope(drhd->devices,
4273                                                   drhd->devices_cnt, i, dev)
4274                                 break;
4275                         /* ignore DMAR unit if no devices exist */
4276                         if (i == drhd->devices_cnt)
4277                                 drhd->ignored = 1;
4278                 }
4279         }
4280
4281         for_each_active_drhd_unit(drhd) {
4282                 if (drhd->include_all)
4283                         continue;
4284
4285                 for_each_active_dev_scope(drhd->devices,
4286                                           drhd->devices_cnt, i, dev)
4287                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4288                                 break;
4289                 if (i < drhd->devices_cnt)
4290                         continue;
4291
4292                 /* This IOMMU has *only* gfx devices. If gfx mapping is
4293                    disabled, bypass the IOMMU and its devices entirely. */
4294                 if (!dmar_map_gfx) {
4295                         drhd->ignored = 1;
4296                         for_each_active_dev_scope(drhd->devices,
4297                                                   drhd->devices_cnt, i, dev)
4298                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4299                 }
4300         }
4301 }
4302
4303 #ifdef CONFIG_SUSPEND
4304 static int init_iommu_hw(void)
4305 {
4306         struct dmar_drhd_unit *drhd;
4307         struct intel_iommu *iommu = NULL;
4308
4309         for_each_active_iommu(iommu, drhd)
4310                 if (iommu->qi)
4311                         dmar_reenable_qi(iommu);
4312
4313         for_each_iommu(iommu, drhd) {
4314                 if (drhd->ignored) {
4315                         /*
4316                          * we always have to disable PMRs or DMA may fail on
4317                          * this device
4318                          */
4319                         if (force_on)
4320                                 iommu_disable_protect_mem_regions(iommu);
4321                         continue;
4322                 }
4323
4324                 iommu_flush_write_buffer(iommu);
4325
4326                 iommu_set_root_entry(iommu);
4327
4328                 iommu->flush.flush_context(iommu, 0, 0, 0,
4329                                            DMA_CCMD_GLOBAL_INVL);
4330                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4331                 iommu_enable_translation(iommu);
4332                 iommu_disable_protect_mem_regions(iommu);
4333         }
4334
4335         return 0;
4336 }
4337
4338 static void iommu_flush_all(void)
4339 {
4340         struct dmar_drhd_unit *drhd;
4341         struct intel_iommu *iommu;
4342
4343         for_each_active_iommu(iommu, drhd) {
4344                 iommu->flush.flush_context(iommu, 0, 0, 0,
4345                                            DMA_CCMD_GLOBAL_INVL);
4346                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4347                                          DMA_TLB_GLOBAL_FLUSH);
4348         }
4349 }
4350
4351 static int iommu_suspend(void)
4352 {
4353         struct dmar_drhd_unit *drhd;
4354         struct intel_iommu *iommu = NULL;
4355         unsigned long flag;
4356
4357         for_each_active_iommu(iommu, drhd) {
4358                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4359                                                  GFP_ATOMIC);
4360                 if (!iommu->iommu_state)
4361                         goto nomem;
4362         }
4363
4364         iommu_flush_all();
4365
4366         for_each_active_iommu(iommu, drhd) {
4367                 iommu_disable_translation(iommu);
4368
4369                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4370
4371                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4372                         readl(iommu->reg + DMAR_FECTL_REG);
4373                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4374                         readl(iommu->reg + DMAR_FEDATA_REG);
4375                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4376                         readl(iommu->reg + DMAR_FEADDR_REG);
4377                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4378                         readl(iommu->reg + DMAR_FEUADDR_REG);
4379
4380                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4381         }
4382         return 0;
4383
4384 nomem:
4385         for_each_active_iommu(iommu, drhd)
4386                 kfree(iommu->iommu_state);
4387
4388         return -ENOMEM;
4389 }
4390
4391 static void iommu_resume(void)
4392 {
4393         struct dmar_drhd_unit *drhd;
4394         struct intel_iommu *iommu = NULL;
4395         unsigned long flag;
4396
4397         if (init_iommu_hw()) {
4398                 if (force_on)
4399                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4400                 else
4401                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4402                 return;
4403         }
4404
4405         for_each_active_iommu(iommu, drhd) {
4406
4407                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4408
4409                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4410                         iommu->reg + DMAR_FECTL_REG);
4411                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4412                         iommu->reg + DMAR_FEDATA_REG);
4413                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4414                         iommu->reg + DMAR_FEADDR_REG);
4415                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4416                         iommu->reg + DMAR_FEUADDR_REG);
4417
4418                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4419         }
4420
4421         for_each_active_iommu(iommu, drhd)
4422                 kfree(iommu->iommu_state);
4423 }
4424
4425 static struct syscore_ops iommu_syscore_ops = {
4426         .resume         = iommu_resume,
4427         .suspend        = iommu_suspend,
4428 };
4429
4430 static void __init init_iommu_pm_ops(void)
4431 {
4432         register_syscore_ops(&iommu_syscore_ops);
4433 }
4434
4435 #else
4436 static inline void init_iommu_pm_ops(void) {}
4437 #endif  /* CONFIG_SUSPEND */
4438
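/*
 * An RMRR must describe a non-empty, page-aligned memory region and pass
 * the architecture-specific check; anything else is flagged as a firmware
 * bug by the caller.
 */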
4439 static int rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
4440 {
4441         if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
4442             !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
4443             rmrr->end_address <= rmrr->base_address ||
4444             arch_rmrr_sanity_check(rmrr))
4445                 return -EINVAL;
4446
4447         return 0;
4448 }
4449
4450 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4451 {
4452         struct acpi_dmar_reserved_memory *rmrr;
4453         struct dmar_rmrr_unit *rmrru;
4454
4455         rmrr = (struct acpi_dmar_reserved_memory *)header;
4456         if (rmrr_sanity_check(rmrr))
4457                 WARN_TAINT(1, TAINT_FIRMWARE_WORKAROUND,
4458                            "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
4459                            "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4460                            rmrr->base_address, rmrr->end_address,
4461                            dmi_get_system_info(DMI_BIOS_VENDOR),
4462                            dmi_get_system_info(DMI_BIOS_VERSION),
4463                            dmi_get_system_info(DMI_PRODUCT_VERSION));
4464
4465         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4466         if (!rmrru)
4467                 goto out;
4468
4469         rmrru->hdr = header;
4470
4471         rmrru->base_address = rmrr->base_address;
4472         rmrru->end_address = rmrr->end_address;
4473
4474         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4475                                 ((void *)rmrr) + rmrr->header.length,
4476                                 &rmrru->devices_cnt);
4477         if (rmrru->devices_cnt && rmrru->devices == NULL)
4478                 goto free_rmrru;
4479
4480         list_add(&rmrru->list, &dmar_rmrr_units);
4481
4482         return 0;
4483 free_rmrru:
4484         kfree(rmrru);
4485 out:
4486         return -ENOMEM;
4487 }
4488
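/*
 * Find an already-parsed ATSR unit that matches @atsr: same PCI segment,
 * same length and identical contents.
 */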
4489 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4490 {
4491         struct dmar_atsr_unit *atsru;
4492         struct acpi_dmar_atsr *tmp;
4493
4494         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4495                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4496                 if (atsr->segment != tmp->segment)
4497                         continue;
4498                 if (atsr->header.length != tmp->header.length)
4499                         continue;
4500                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4501                         return atsru;
4502         }
4503
4504         return NULL;
4505 }
4506
4507 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4508 {
4509         struct acpi_dmar_atsr *atsr;
4510         struct dmar_atsr_unit *atsru;
4511
4512         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4513                 return 0;
4514
4515         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4516         atsru = dmar_find_atsr(atsr);
4517         if (atsru)
4518                 return 0;
4519
4520         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4521         if (!atsru)
4522                 return -ENOMEM;
4523
4524         /*
4525          * If memory is allocated from slab by the ACPI _DSM method, we need to
4526          * copy the memory content because the memory buffer will be freed
4527          * on return.
4528          */
4529         atsru->hdr = (void *)(atsru + 1);
4530         memcpy(atsru->hdr, hdr, hdr->length);
4531         atsru->include_all = atsr->flags & 0x1;
4532         if (!atsru->include_all) {
4533                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4534                                 (void *)atsr + atsr->header.length,
4535                                 &atsru->devices_cnt);
4536                 if (atsru->devices_cnt && atsru->devices == NULL) {
4537                         kfree(atsru);
4538                         return -ENOMEM;
4539                 }
4540         }
4541
4542         list_add_rcu(&atsru->list, &dmar_atsr_units);
4543
4544         return 0;
4545 }
4546
4547 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4548 {
4549         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4550         kfree(atsru);
4551 }
4552
4553 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4554 {
4555         struct acpi_dmar_atsr *atsr;
4556         struct dmar_atsr_unit *atsru;
4557
4558         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4559         atsru = dmar_find_atsr(atsr);
4560         if (atsru) {
4561                 list_del_rcu(&atsru->list);
4562                 synchronize_rcu();
4563                 intel_iommu_free_atsr(atsru);
4564         }
4565
4566         return 0;
4567 }
4568
4569 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4570 {
4571         int i;
4572         struct device *dev;
4573         struct acpi_dmar_atsr *atsr;
4574         struct dmar_atsr_unit *atsru;
4575
4576         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4577         atsru = dmar_find_atsr(atsr);
4578         if (!atsru)
4579                 return 0;
4580
4581         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4582                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4583                                           i, dev)
4584                         return -EBUSY;
4585         }
4586
4587         return 0;
4588 }
4589
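/*
 * Bring a hot-added DMAR unit online: check that it supports the features
 * the running configuration relies on (pass-through, snooping, super
 * pages), allocate domain IDs and a root entry, then set up queued
 * invalidation and interrupts before enabling translation.
 */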
4590 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4591 {
4592         int sp, ret;
4593         struct intel_iommu *iommu = dmaru->iommu;
4594
4595         if (g_iommus[iommu->seq_id])
4596                 return 0;
4597
4598         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4599                 pr_warn("%s: Doesn't support hardware pass through.\n",
4600                         iommu->name);
4601                 return -ENXIO;
4602         }
4603         if (!ecap_sc_support(iommu->ecap) &&
4604             domain_update_iommu_snooping(iommu)) {
4605                 pr_warn("%s: Doesn't support snooping.\n",
4606                         iommu->name);
4607                 return -ENXIO;
4608         }
4609         sp = domain_update_iommu_superpage(NULL, iommu) - 1;
4610         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4611                 pr_warn("%s: Doesn't support large page.\n",
4612                         iommu->name);
4613                 return -ENXIO;
4614         }
4615
4616         /*
4617          * Disable translation if already enabled prior to OS handover.
4618          */
4619         if (iommu->gcmd & DMA_GCMD_TE)
4620                 iommu_disable_translation(iommu);
4621
4622         g_iommus[iommu->seq_id] = iommu;
4623         ret = iommu_init_domains(iommu);
4624         if (ret == 0)
4625                 ret = iommu_alloc_root_entry(iommu);
4626         if (ret)
4627                 goto out;
4628
4629         intel_svm_check(iommu);
4630
4631         if (dmaru->ignored) {
4632                 /*
4633                  * we always have to disable PMRs or DMA may fail on this device
4634                  */
4635                 if (force_on)
4636                         iommu_disable_protect_mem_regions(iommu);
4637                 return 0;
4638         }
4639
4640         intel_iommu_init_qi(iommu);
4641         iommu_flush_write_buffer(iommu);
4642
4643 #ifdef CONFIG_INTEL_IOMMU_SVM
4644         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4645                 ret = intel_svm_enable_prq(iommu);
4646                 if (ret)
4647                         goto disable_iommu;
4648         }
4649 #endif
4650         ret = dmar_set_interrupt(iommu);
4651         if (ret)
4652                 goto disable_iommu;
4653
4654         iommu_set_root_entry(iommu);
4655         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4656         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4657         iommu_enable_translation(iommu);
4658
4659         iommu_disable_protect_mem_regions(iommu);
4660         return 0;
4661
4662 disable_iommu:
4663         disable_dmar_iommu(iommu);
4664 out:
4665         free_dmar_iommu(iommu);
4666         return ret;
4667 }
4668
4669 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4670 {
4671         int ret = 0;
4672         struct intel_iommu *iommu = dmaru->iommu;
4673
4674         if (!intel_iommu_enabled)
4675                 return 0;
4676         if (iommu == NULL)
4677                 return -EINVAL;
4678
4679         if (insert) {
4680                 ret = intel_iommu_add(dmaru);
4681         } else {
4682                 disable_dmar_iommu(iommu);
4683                 free_dmar_iommu(iommu);
4684         }
4685
4686         return ret;
4687 }
4688
4689 static void intel_iommu_free_dmars(void)
4690 {
4691         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4692         struct dmar_atsr_unit *atsru, *atsr_n;
4693
4694         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4695                 list_del(&rmrru->list);
4696                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4697                 kfree(rmrru);
4698         }
4699
4700         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4701                 list_del(&atsru->list);
4702                 intel_iommu_free_atsr(atsru);
4703         }
4704 }
4705
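/*
 * Decide whether ATS is allowed for @dev: integrated devices always are,
 * devices behind a conventional PCI bridge never are, and for anything
 * else the root port is looked up in the ATSR units of the device's
 * segment.
 */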
4706 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4707 {
4708         int i, ret = 1;
4709         struct pci_bus *bus;
4710         struct pci_dev *bridge = NULL;
4711         struct device *tmp;
4712         struct acpi_dmar_atsr *atsr;
4713         struct dmar_atsr_unit *atsru;
4714
4715         dev = pci_physfn(dev);
4716         for (bus = dev->bus; bus; bus = bus->parent) {
4717                 bridge = bus->self;
4718                 /* If it's an integrated device, allow ATS */
4719                 if (!bridge)
4720                         return 1;
4721                 /* Connected via non-PCIe: no ATS */
4722                 if (!pci_is_pcie(bridge) ||
4723                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4724                         return 0;
4725                 /* If we found the root port, look it up in the ATSR */
4726                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4727                         break;
4728         }
4729
4730         rcu_read_lock();
4731         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4732                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4733                 if (atsr->segment != pci_domain_nr(dev->bus))
4734                         continue;
4735
4736                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4737                         if (tmp == &bridge->dev)
4738                                 goto out;
4739
4740                 if (atsru->include_all)
4741                         goto out;
4742         }
4743         ret = 0;
4744 out:
4745         rcu_read_unlock();
4746
4747         return ret;
4748 }
4749
4750 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4751 {
4752         int ret;
4753         struct dmar_rmrr_unit *rmrru;
4754         struct dmar_atsr_unit *atsru;
4755         struct acpi_dmar_atsr *atsr;
4756         struct acpi_dmar_reserved_memory *rmrr;
4757
4758         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4759                 return 0;
4760
4761         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4762                 rmrr = container_of(rmrru->hdr,
4763                                     struct acpi_dmar_reserved_memory, header);
4764                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4765                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4766                                 ((void *)rmrr) + rmrr->header.length,
4767                                 rmrr->segment, rmrru->devices,
4768                                 rmrru->devices_cnt);
4769                         if (ret < 0)
4770                                 return ret;
4771                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4772                         dmar_remove_dev_scope(info, rmrr->segment,
4773                                 rmrru->devices, rmrru->devices_cnt);
4774                 }
4775         }
4776
4777         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4778                 if (atsru->include_all)
4779                         continue;
4780
4781                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4782                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4783                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4784                                         (void *)atsr + atsr->header.length,
4785                                         atsr->segment, atsru->devices,
4786                                         atsru->devices_cnt);
4787                         if (ret > 0)
4788                                 break;
4789                         else if (ret < 0)
4790                                 return ret;
4791                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4792                         if (dmar_remove_dev_scope(info, atsr->segment,
4793                                         atsru->devices, atsru->devices_cnt))
4794                                 break;
4795                 }
4796         }
4797
4798         return 0;
4799 }
4800
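/*
 * Keep the static identity mapping (si_domain) in sync with memory hotplug:
 * newly onlined ranges are added to the identity map, offlined ranges have
 * their IOVAs, page tables and IOTLB entries torn down.
 */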
4801 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4802                                        unsigned long val, void *v)
4803 {
4804         struct memory_notify *mhp = v;
4805         unsigned long long start, end;
4806         unsigned long start_vpfn, last_vpfn;
4807
4808         switch (val) {
4809         case MEM_GOING_ONLINE:
4810                 start = mhp->start_pfn << PAGE_SHIFT;
4811                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4812                 if (iommu_domain_identity_map(si_domain, start, end)) {
4813                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4814                                 start, end);
4815                         return NOTIFY_BAD;
4816                 }
4817                 break;
4818
4819         case MEM_OFFLINE:
4820         case MEM_CANCEL_ONLINE:
4821                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4822                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4823                 while (start_vpfn <= last_vpfn) {
4824                         struct iova *iova;
4825                         struct dmar_drhd_unit *drhd;
4826                         struct intel_iommu *iommu;
4827                         struct page *freelist;
4828
4829                         iova = find_iova(&si_domain->iovad, start_vpfn);
4830                         if (iova == NULL) {
4831                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4832                                          start_vpfn);
4833                                 break;
4834                         }
4835
4836                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4837                                                      start_vpfn, last_vpfn);
4838                         if (iova == NULL) {
4839                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4840                                         start_vpfn, last_vpfn);
4841                                 return NOTIFY_BAD;
4842                         }
4843
4844                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4845                                                iova->pfn_hi);
4846
4847                         rcu_read_lock();
4848                         for_each_active_iommu(iommu, drhd)
4849                                 iommu_flush_iotlb_psi(iommu, si_domain,
4850                                         iova->pfn_lo, iova_size(iova),
4851                                         !freelist, 0);
4852                         rcu_read_unlock();
4853                         dma_free_pagelist(freelist);
4854
4855                         start_vpfn = iova->pfn_hi + 1;
4856                         free_iova_mem(iova);
4857                 }
4858                 break;
4859         }
4860
4861         return NOTIFY_OK;
4862 }
4863
4864 static struct notifier_block intel_iommu_memory_nb = {
4865         .notifier_call = intel_iommu_memory_notifier,
4866         .priority = 0
4867 };
4868
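/* Free the IOVAs cached on @cpu for every domain of every IOMMU. */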
4869 static void free_all_cpu_cached_iovas(unsigned int cpu)
4870 {
4871         int i;
4872
4873         for (i = 0; i < g_num_of_iommus; i++) {
4874                 struct intel_iommu *iommu = g_iommus[i];
4875                 struct dmar_domain *domain;
4876                 int did;
4877
4878                 if (!iommu)
4879                         continue;
4880
4881                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4882                         domain = get_iommu_domain(iommu, (u16)did);
4883
4884                         if (!domain)
4885                                 continue;
4886                         free_cpu_cached_iovas(cpu, &domain->iovad);
4887                 }
4888         }
4889 }
4890
4891 static int intel_iommu_cpu_dead(unsigned int cpu)
4892 {
4893         free_all_cpu_cached_iovas(cpu);
4894         return 0;
4895 }
4896
4897 static void intel_disable_iommus(void)
4898 {
4899         struct intel_iommu *iommu = NULL;
4900         struct dmar_drhd_unit *drhd;
4901
4902         for_each_iommu(iommu, drhd)
4903                 iommu_disable_translation(iommu);
4904 }
4905
4906 void intel_iommu_shutdown(void)
4907 {
4908         struct dmar_drhd_unit *drhd;
4909         struct intel_iommu *iommu = NULL;
4910
4911         if (no_iommu || dmar_disabled)
4912                 return;
4913
4914         down_write(&dmar_global_lock);
4915
4916         /* Disable PMRs explicitly here. */
4917         for_each_iommu(iommu, drhd)
4918                 iommu_disable_protect_mem_regions(iommu);
4919
4920         /* Make sure the IOMMUs are switched off */
4921         intel_disable_iommus();
4922
4923         up_write(&dmar_global_lock);
4924 }
4925
4926 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4927 {
4928         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4929
4930         return container_of(iommu_dev, struct intel_iommu, iommu);
4931 }
4932
4933 static ssize_t intel_iommu_show_version(struct device *dev,
4934                                         struct device_attribute *attr,
4935                                         char *buf)
4936 {
4937         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4938         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4939         return sprintf(buf, "%d:%d\n",
4940                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4941 }
4942 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4943
4944 static ssize_t intel_iommu_show_address(struct device *dev,
4945                                         struct device_attribute *attr,
4946                                         char *buf)
4947 {
4948         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4949         return sprintf(buf, "%llx\n", iommu->reg_phys);
4950 }
4951 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4952
4953 static ssize_t intel_iommu_show_cap(struct device *dev,
4954                                     struct device_attribute *attr,
4955                                     char *buf)
4956 {
4957         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4958         return sprintf(buf, "%llx\n", iommu->cap);
4959 }
4960 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4961
4962 static ssize_t intel_iommu_show_ecap(struct device *dev,
4963                                     struct device_attribute *attr,
4964                                     char *buf)
4965 {
4966         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4967         return sprintf(buf, "%llx\n", iommu->ecap);
4968 }
4969 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4970
4971 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4972                                       struct device_attribute *attr,
4973                                       char *buf)
4974 {
4975         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4976         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4977 }
4978 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4979
4980 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4981                                            struct device_attribute *attr,
4982                                            char *buf)
4983 {
4984         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4985         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4986                                                   cap_ndoms(iommu->cap)));
4987 }
4988 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4989
4990 static struct attribute *intel_iommu_attrs[] = {
4991         &dev_attr_version.attr,
4992         &dev_attr_address.attr,
4993         &dev_attr_cap.attr,
4994         &dev_attr_ecap.attr,
4995         &dev_attr_domains_supported.attr,
4996         &dev_attr_domains_used.attr,
4997         NULL,
4998 };
4999
5000 static struct attribute_group intel_iommu_group = {
5001         .name = "intel-iommu",
5002         .attrs = intel_iommu_attrs,
5003 };
5004
5005 const struct attribute_group *intel_iommu_groups[] = {
5006         &intel_iommu_group,
5007         NULL,
5008 };
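/*
 * With the group above, each remapping unit registered in intel_iommu_init()
 * exposes its attributes in sysfs, typically as
 * /sys/class/iommu/<name>/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}, where <name> is e.g. "dmar0" (path shown
 * for illustration).
 */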
5009
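/* Return true if any PCI device in the system is marked untrusted. */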
5010 static inline bool has_untrusted_dev(void)
5011 {
5012         struct pci_dev *pdev = NULL;
5013
5014         for_each_pci_dev(pdev)
5015                 if (pdev->untrusted)
5016                         return true;
5017
5018         return false;
5019 }
5020
5021 static int __init platform_optin_force_iommu(void)
5022 {
5023         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
5024                 return 0;
5025
5026         if (no_iommu || dmar_disabled)
5027                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
5028
5029         /*
5030          * If Intel-IOMMU is disabled by default, we will apply identity
5031          * map for all devices except those marked as being untrusted.
5032          */
5033         if (dmar_disabled)
5034                 iommu_set_default_passthrough(false);
5035
5036         dmar_disabled = 0;
5037         no_iommu = 0;
5038
5039         return 1;
5040 }
5041
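/*
 * Walk the ACPI namespace devices listed in the DRHD device scopes and
 * probe their physical companion devices that are not yet members of an
 * IOMMU group.
 */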
5042 static int __init probe_acpi_namespace_devices(void)
5043 {
5044         struct dmar_drhd_unit *drhd;
5045         /* To avoid a -Wunused-but-set-variable warning. */
5046         struct intel_iommu *iommu __maybe_unused;
5047         struct device *dev;
5048         int i, ret = 0;
5049
5050         for_each_active_iommu(iommu, drhd) {
5051                 for_each_active_dev_scope(drhd->devices,
5052                                           drhd->devices_cnt, i, dev) {
5053                         struct acpi_device_physical_node *pn;
5054                         struct iommu_group *group;
5055                         struct acpi_device *adev;
5056
5057                         if (dev->bus != &acpi_bus_type)
5058                                 continue;
5059
5060                         adev = to_acpi_device(dev);
5061                         mutex_lock(&adev->physical_node_lock);
5062                         list_for_each_entry(pn,
5063                                             &adev->physical_node_list, node) {
5064                                 group = iommu_group_get(pn->dev);
5065                                 if (group) {
5066                                         iommu_group_put(group);
5067                                         continue;
5068                                 }
5069
5070                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
5071                                 ret = iommu_probe_device(pn->dev);
5072                                 if (ret)
5073                                         break;
5074                         }
5075                         mutex_unlock(&adev->physical_node_lock);
5076
5077                         if (ret)
5078                                 return ret;
5079                 }
5080         }
5081
5082         return 0;
5083 }
5084
5085 int __init intel_iommu_init(void)
5086 {
5087         int ret = -ENODEV;
5088         struct dmar_drhd_unit *drhd;
5089         struct intel_iommu *iommu;
5090
5091         /*
5092          * Intel IOMMU is required for a TXT/tboot launch or platform
5093          * opt in, so enforce that.
5094          */
5095         force_on = tboot_force_iommu() || platform_optin_force_iommu();
5096
5097         if (iommu_init_mempool()) {
5098                 if (force_on)
5099                         panic("tboot: Failed to initialize iommu memory\n");
5100                 return -ENOMEM;
5101         }
5102
5103         down_write(&dmar_global_lock);
5104         if (dmar_table_init()) {
5105                 if (force_on)
5106                         panic("tboot: Failed to initialize DMAR table\n");
5107                 goto out_free_dmar;
5108         }
5109
5110         if (dmar_dev_scope_init() < 0) {
5111                 if (force_on)
5112                         panic("tboot: Failed to initialize DMAR device scope\n");
5113                 goto out_free_dmar;
5114         }
5115
5116         up_write(&dmar_global_lock);
5117
5118         /*
5119          * The bus notifier takes the dmar_global_lock, so lockdep will
5120          * complain later when we register it under the lock.
5121          */
5122         dmar_register_bus_notifier();
5123
5124         down_write(&dmar_global_lock);
5125
5126         if (no_iommu || dmar_disabled) {
5127                 /*
5128                  * We exit the function here to ensure the IOMMU's remapping and
5129                  * mempool aren't set up, which means that the IOMMU's PMRs
5130                  * won't be disabled via the call to init_dmars(). So disable
5131                  * them explicitly here. The PMRs were set up by tboot prior to
5132                  * calling SENTER, but the kernel is expected to reset/tear
5133                  * down the PMRs.
5134                  */
5135                 if (intel_iommu_tboot_noforce) {
5136                         for_each_iommu(iommu, drhd)
5137                                 iommu_disable_protect_mem_regions(iommu);
5138                 }
5139
5140                 /*
5141                  * Make sure the IOMMUs are switched off, even when we
5142                  * boot into a kexec kernel and the previous kernel left
5143                  * them enabled
5144                  */
5145                 intel_disable_iommus();
5146                 goto out_free_dmar;
5147         }
5148
5149         if (list_empty(&dmar_rmrr_units))
5150                 pr_info("No RMRR found\n");
5151
5152         if (list_empty(&dmar_atsr_units))
5153                 pr_info("No ATSR found\n");
5154
5155         if (dmar_init_reserved_ranges()) {
5156                 if (force_on)
5157                         panic("tboot: Failed to reserve iommu ranges\n");
5158                 goto out_free_reserved_range;
5159         }
5160
5161         if (dmar_map_gfx)
5162                 intel_iommu_gfx_mapped = 1;
5163
5164         init_no_remapping_devices();
5165
5166         ret = init_dmars();
5167         if (ret) {
5168                 if (force_on)
5169                         panic("tboot: Failed to initialize DMARs\n");
5170                 pr_err("Initialization failed\n");
5171                 goto out_free_reserved_range;
5172         }
5173         up_write(&dmar_global_lock);
5174
5175 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5176         /*
5177          * If the system has no untrusted device or the user has decided
5178          * to disable the bounce page mechanism, we don't need swiotlb.
5179          * Mark this so that the pre-allocated bounce pages will be
5180          * released later.
5181          */
5182         if (!has_untrusted_dev() || intel_no_bounce)
5183                 swiotlb = 0;
5184 #endif
5185         dma_ops = &intel_dma_ops;
5186
5187         init_iommu_pm_ops();
5188
5189         for_each_active_iommu(iommu, drhd) {
5190                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5191                                        intel_iommu_groups,
5192                                        "%s", iommu->name);
5193                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5194                 iommu_device_register(&iommu->iommu);
5195         }
5196
5197         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5198         if (si_domain && !hw_pass_through)
5199                 register_memory_notifier(&intel_iommu_memory_nb);
5200         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5201                           intel_iommu_cpu_dead);
5202
5203         down_read(&dmar_global_lock);
5204         if (probe_acpi_namespace_devices())
5205                 pr_warn("ACPI namespace devices didn't probe correctly\n");
5206         up_read(&dmar_global_lock);
5207
5208         /* Finally, we enable the DMA remapping hardware. */
5209         for_each_iommu(iommu, drhd) {
5210                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5211                         iommu_enable_translation(iommu);
5212
5213                 iommu_disable_protect_mem_regions(iommu);
5214         }
5215         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5216
5217         intel_iommu_enabled = 1;
5218         intel_iommu_debugfs_init();
5219
5220         return 0;
5221
5222 out_free_reserved_range:
5223         put_iova_domain(&reserved_iova_list);
5224 out_free_dmar:
5225         intel_iommu_free_dmars();
5226         up_write(&dmar_global_lock);
5227         iommu_exit_mempool();
5228         return ret;
5229 }
5230
5231 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5232 {
5233         struct intel_iommu *iommu = opaque;
5234
5235         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5236         return 0;
5237 }
5238
5239 /*
5240  * NB - intel-iommu lacks any sort of reference counting for the users of
5241  * dependent devices.  If multiple endpoints have intersecting dependent
5242  * devices, unbinding the driver from any one of them will possibly leave
5243  * the others unable to operate.
5244  */
5245 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5246 {
5247         if (!iommu || !dev || !dev_is_pci(dev))
5248                 return;
5249
5250         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5251 }
5252
5253 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5254 {
5255         struct dmar_domain *domain;
5256         struct intel_iommu *iommu;
5257         unsigned long flags;
5258
5259         assert_spin_locked(&device_domain_lock);
5260
5261         if (WARN_ON(!info))
5262                 return;
5263
5264         iommu = info->iommu;
5265         domain = info->domain;
5266
5267         if (info->dev) {
5268                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5269                         intel_pasid_tear_down_entry(iommu, info->dev,
5270                                         PASID_RID2PASID);
5271
5272                 iommu_disable_dev_iotlb(info);
5273                 domain_context_clear(iommu, info->dev);
5274                 intel_pasid_free_table(info->dev);
5275         }
5276
5277         unlink_domain_info(info);
5278
5279         spin_lock_irqsave(&iommu->lock, flags);
5280         domain_detach_iommu(domain, iommu);
5281         spin_unlock_irqrestore(&iommu->lock, flags);
5282
5283         /* free the private domain */
5284         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5285             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5286             list_empty(&domain->devices))
5287                 domain_exit(info->domain);
5288
5289         free_devinfo_mem(info);
5290 }
5291
5292 static void dmar_remove_one_dev_info(struct device *dev)
5293 {
5294         struct device_domain_info *info;
5295         unsigned long flags;
5296
5297         spin_lock_irqsave(&device_domain_lock, flags);
5298         info = dev->archdata.iommu;
5299         if (info && info != DEFER_DEVICE_DOMAIN_INFO
5300             && info != DUMMY_DEVICE_DOMAIN_INFO)
5301                 __dmar_remove_one_dev_info(info);
5302         spin_unlock_irqrestore(&device_domain_lock, flags);
5303 }
5304
5305 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5306 {
5307         int adjust_width;
5308
5309         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5310         domain_reserve_special_ranges(domain);
5311
5312         /* calculate AGAW */
5313         domain->gaw = guest_width;
5314         adjust_width = guestwidth_to_adjustwidth(guest_width);
5315         domain->agaw = width_to_agaw(adjust_width);
5316
5317         domain->iommu_coherency = 0;
5318         domain->iommu_snooping = 0;
5319         domain->iommu_superpage = 0;
5320         domain->max_addr = 0;
5321
5322         /* always allocate the top pgd */
5323         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5324         if (!domain->pgd)
5325                 return -ENOMEM;
5326         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5327         return 0;
5328 }
5329
5330 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5331 {
5332         struct dmar_domain *dmar_domain;
5333         struct iommu_domain *domain;
5334         int ret;
5335
5336         switch (type) {
5337         case IOMMU_DOMAIN_DMA:
5338         /* fallthrough */
5339         case IOMMU_DOMAIN_UNMANAGED:
5340                 dmar_domain = alloc_domain(0);
5341                 if (!dmar_domain) {
5342                         pr_err("Can't allocate dmar_domain\n");
5343                         return NULL;
5344                 }
5345                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5346                         pr_err("Domain initialization failed\n");
5347                         domain_exit(dmar_domain);
5348                         return NULL;
5349                 }
5350
5351                 if (!intel_iommu_strict && type == IOMMU_DOMAIN_DMA) {
5352                         ret = init_iova_flush_queue(&dmar_domain->iovad,
5353                                                     iommu_flush_iova,
5354                                                     iova_entry_free);
5355                         if (ret)
5356                                 pr_info("iova flush queue initialization failed\n");
5357                 }
5358
5359                 domain_update_iommu_cap(dmar_domain);
5360
5361                 domain = &dmar_domain->domain;
5362                 domain->geometry.aperture_start = 0;
5363                 domain->geometry.aperture_end   =
5364                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5365                 domain->geometry.force_aperture = true;
5366
5367                 return domain;
5368         case IOMMU_DOMAIN_IDENTITY:
5369                 return &si_domain->domain;
5370         default:
5371                 return NULL;
5372         }
5373
5374         return NULL;
5375 }
5376
5377 static void intel_iommu_domain_free(struct iommu_domain *domain)
5378 {
5379         if (domain != &si_domain->domain)
5380                 domain_exit(to_dmar_domain(domain));
5381 }
5382
5383 /*
5384  * Check whether a @domain could be attached to the @dev through the
5385  * aux-domain attach/detach APIs.
5386  */
5387 static inline bool
5388 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5389 {
5390         struct device_domain_info *info = dev->archdata.iommu;
5391
5392         return info && info->auxd_enabled &&
5393                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5394 }
5395
5396 static void auxiliary_link_device(struct dmar_domain *domain,
5397                                   struct device *dev)
5398 {
5399         struct device_domain_info *info = dev->archdata.iommu;
5400
5401         assert_spin_locked(&device_domain_lock);
5402         if (WARN_ON(!info))
5403                 return;
5404
5405         domain->auxd_refcnt++;
5406         list_add(&domain->auxd, &info->auxiliary_domains);
5407 }
5408
5409 static void auxiliary_unlink_device(struct dmar_domain *domain,
5410                                     struct device *dev)
5411 {
5412         struct device_domain_info *info = dev->archdata.iommu;
5413
5414         assert_spin_locked(&device_domain_lock);
5415         if (WARN_ON(!info))
5416                 return;
5417
5418         list_del(&domain->auxd);
5419         domain->auxd_refcnt--;
5420
5421         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5422                 ioasid_free(domain->default_pasid);
5423 }
5424
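/*
 * Attach @dev to @domain as an auxiliary domain: allocate the domain's
 * default PASID if it does not have one yet and program a PASID-table
 * entry for the device.
 */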
5425 static int aux_domain_add_dev(struct dmar_domain *domain,
5426                               struct device *dev)
5427 {
5428         int ret;
5429         u8 bus, devfn;
5430         unsigned long flags;
5431         struct intel_iommu *iommu;
5432
5433         iommu = device_to_iommu(dev, &bus, &devfn);
5434         if (!iommu)
5435                 return -ENODEV;
5436
5437         if (domain->default_pasid <= 0) {
5438                 int pasid;
5439
5440                 /* No private data needed for the default pasid */
5441                 pasid = ioasid_alloc(NULL, PASID_MIN,
5442                                      pci_max_pasids(to_pci_dev(dev)) - 1,
5443                                      NULL);
5444                 if (pasid == INVALID_IOASID) {
5445                         pr_err("Can't allocate default pasid\n");
5446                         return -ENODEV;
5447                 }
5448                 domain->default_pasid = pasid;
5449         }
5450
5451         spin_lock_irqsave(&device_domain_lock, flags);
5452         /*
5453          * iommu->lock must be held to attach domain to iommu and setup the
5454          * pasid entry for second level translation.
5455          */
5456         spin_lock(&iommu->lock);
5457         ret = domain_attach_iommu(domain, iommu);
5458         if (ret)
5459                 goto attach_failed;
5460
5461         /* Set up the PASID entry for mediated devices: */
5462         if (domain_use_first_level(domain))
5463                 ret = domain_setup_first_level(iommu, domain, dev,
5464                                                domain->default_pasid);
5465         else
5466                 ret = intel_pasid_setup_second_level(iommu, domain, dev,
5467                                                      domain->default_pasid);
5468         if (ret)
5469                 goto table_failed;
5470         spin_unlock(&iommu->lock);
5471
5472         auxiliary_link_device(domain, dev);
5473
5474         spin_unlock_irqrestore(&device_domain_lock, flags);
5475
5476         return 0;
5477
5478 table_failed:
5479         domain_detach_iommu(domain, iommu);
5480 attach_failed:
5481         spin_unlock(&iommu->lock);
5482         spin_unlock_irqrestore(&device_domain_lock, flags);
5483         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5484                 ioasid_free(domain->default_pasid);
5485
5486         return ret;
5487 }
5488
5489 static void aux_domain_remove_dev(struct dmar_domain *domain,
5490                                   struct device *dev)
5491 {
5492         struct device_domain_info *info;
5493         struct intel_iommu *iommu;
5494         unsigned long flags;
5495
5496         if (!is_aux_domain(dev, &domain->domain))
5497                 return;
5498
5499         spin_lock_irqsave(&device_domain_lock, flags);
5500         info = dev->archdata.iommu;
5501         iommu = info->iommu;
5502
5503         auxiliary_unlink_device(domain, dev);
5504
5505         spin_lock(&iommu->lock);
5506         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5507         domain_detach_iommu(domain, iommu);
5508         spin_unlock(&iommu->lock);
5509
5510         spin_unlock_irqrestore(&device_domain_lock, flags);
5511 }
5512
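/*
 * Clamp the domain's address width to what the device's IOMMU supports and
 * drop page-table levels the IOMMU cannot walk.
 */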
5513 static int prepare_domain_attach_device(struct iommu_domain *domain,
5514                                         struct device *dev)
5515 {
5516         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5517         struct intel_iommu *iommu;
5518         int addr_width;
5519         u8 bus, devfn;
5520
5521         iommu = device_to_iommu(dev, &bus, &devfn);
5522         if (!iommu)
5523                 return -ENODEV;
5524
5525         /* check if this iommu agaw is sufficient for max mapped address */
5526         addr_width = agaw_to_width(iommu->agaw);
5527         if (addr_width > cap_mgaw(iommu->cap))
5528                 addr_width = cap_mgaw(iommu->cap);
5529
5530         if (dmar_domain->max_addr > (1LL << addr_width)) {
5531                 dev_err(dev,
5532                         "%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5533                         __func__, addr_width, dmar_domain->max_addr);
5534                 return -EFAULT;
5535         }
5536         dmar_domain->gaw = addr_width;
5537
5538         /*
5539          * Knock out extra levels of page tables if necessary
5540          */
5541         while (iommu->agaw < dmar_domain->agaw) {
5542                 struct dma_pte *pte;
5543
5544                 pte = dmar_domain->pgd;
5545                 if (dma_pte_present(pte)) {
5546                         dmar_domain->pgd = (struct dma_pte *)
5547                                 phys_to_virt(dma_pte_addr(pte));
5548                         free_pgtable_page(pte);
5549                 }
5550                 dmar_domain->agaw--;
5551         }
5552
5553         return 0;
5554 }
5555
5556 static int intel_iommu_attach_device(struct iommu_domain *domain,
5557                                      struct device *dev)
5558 {
5559         int ret;
5560
5561         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5562             device_is_rmrr_locked(dev)) {
5563                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5564                 return -EPERM;
5565         }
5566
5567         if (is_aux_domain(dev, domain))
5568                 return -EPERM;
5569
5570         /* normally dev is not mapped */
5571         if (unlikely(domain_context_mapped(dev))) {
5572                 struct dmar_domain *old_domain;
5573
5574                 old_domain = find_domain(dev);
5575                 if (old_domain)
5576                         dmar_remove_one_dev_info(dev);
5577         }
5578
5579         ret = prepare_domain_attach_device(domain, dev);
5580         if (ret)
5581                 return ret;
5582
5583         return domain_add_dev_info(to_dmar_domain(domain), dev);
5584 }
5585
5586 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5587                                          struct device *dev)
5588 {
5589         int ret;
5590
5591         if (!is_aux_domain(dev, domain))
5592                 return -EPERM;
5593
5594         ret = prepare_domain_attach_device(domain, dev);
5595         if (ret)
5596                 return ret;
5597
5598         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5599 }
5600
5601 static void intel_iommu_detach_device(struct iommu_domain *domain,
5602                                       struct device *dev)
5603 {
5604         dmar_remove_one_dev_info(dev);
5605 }
5606
5607 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5608                                           struct device *dev)
5609 {
5610         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5611 }
5612
5613 static int intel_iommu_map(struct iommu_domain *domain,
5614                            unsigned long iova, phys_addr_t hpa,
5615                            size_t size, int iommu_prot, gfp_t gfp)
5616 {
5617         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5618         u64 max_addr;
5619         int prot = 0;
5620         int ret;
5621
5622         if (iommu_prot & IOMMU_READ)
5623                 prot |= DMA_PTE_READ;
5624         if (iommu_prot & IOMMU_WRITE)
5625                 prot |= DMA_PTE_WRITE;
5626         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5627                 prot |= DMA_PTE_SNP;
5628
5629         max_addr = iova + size;
5630         if (dmar_domain->max_addr < max_addr) {
5631                 u64 end;
5632
5633                 /* check if minimum agaw is sufficient for mapped address */
5634                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5635                 if (end < max_addr) {
5636                         pr_err("%s: iommu width (%d) is not sufficient for the mapped address (%llx)\n",
5637                                __func__, dmar_domain->gaw,
5638                                max_addr);
5639                         return -EFAULT;
5640                 }
5641                 dmar_domain->max_addr = max_addr;
5642         }
5643         /* Round up size to next multiple of PAGE_SIZE, if it and
5644            the low bits of hpa would take us onto the next page */
5645         size = aligned_nrpages(hpa, size);
5646         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5647                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5648         return ret;
5649 }
5650
5651 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5652                                 unsigned long iova, size_t size,
5653                                 struct iommu_iotlb_gather *gather)
5654 {
5655         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5656         struct page *freelist = NULL;
5657         unsigned long start_pfn, last_pfn;
5658         unsigned int npages;
5659         int iommu_id, level = 0;
5660
5661         /* Cope with horrid API which requires us to unmap more than the
5662            size argument if it happens to be a large-page mapping. */
5663         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5664
5665         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5666                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5667
5668         start_pfn = iova >> VTD_PAGE_SHIFT;
5669         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5670
5671         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5672
5673         npages = last_pfn - start_pfn + 1;
5674
5675         for_each_domain_iommu(iommu_id, dmar_domain)
5676                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5677                                       start_pfn, npages, !freelist, 0);
5678
5679         dma_free_pagelist(freelist);
5680
5681         if (dmar_domain->max_addr == iova + size)
5682                 dmar_domain->max_addr = iova;
5683
5684         return size;
5685 }
5686
5687 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5688                                             dma_addr_t iova)
5689 {
5690         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5691         struct dma_pte *pte;
5692         int level = 0;
5693         u64 phys = 0;
5694
5695         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5696         if (pte)
5697                 phys = dma_pte_addr(pte);
5698
5699         return phys;
5700 }
5701
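/* True only if every active IOMMU advertises scalable mode. */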
5702 static inline bool scalable_mode_support(void)
5703 {
5704         struct dmar_drhd_unit *drhd;
5705         struct intel_iommu *iommu;
5706         bool ret = true;
5707
5708         rcu_read_lock();
5709         for_each_active_iommu(iommu, drhd) {
5710                 if (!sm_supported(iommu)) {
5711                         ret = false;
5712                         break;
5713                 }
5714         }
5715         rcu_read_unlock();
5716
5717         return ret;
5718 }
5719
5720 static inline bool iommu_pasid_support(void)
5721 {
5722         struct dmar_drhd_unit *drhd;
5723         struct intel_iommu *iommu;
5724         bool ret = true;
5725
5726         rcu_read_lock();
5727         for_each_active_iommu(iommu, drhd) {
5728                 if (!pasid_supported(iommu)) {
5729                         ret = false;
5730                         break;
5731                 }
5732         }
5733         rcu_read_unlock();
5734
5735         return ret;
5736 }
5737
5738 static inline bool nested_mode_support(void)
5739 {
5740         struct dmar_drhd_unit *drhd;
5741         struct intel_iommu *iommu;
5742         bool ret = true;
5743
5744         rcu_read_lock();
5745         for_each_active_iommu(iommu, drhd) {
5746                 if (!sm_supported(iommu) || !ecap_nest(iommu->ecap)) {
5747                         ret = false;
5748                         break;
5749                 }
5750         }
5751         rcu_read_unlock();
5752
5753         return ret;
5754 }
5755
5756 static bool intel_iommu_capable(enum iommu_cap cap)
5757 {
5758         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5759                 return domain_update_iommu_snooping(NULL) == 1;
5760         if (cap == IOMMU_CAP_INTR_REMAP)
5761                 return irq_remapping_enabled == 1;
5762
5763         return false;
5764 }
5765
5766 static int intel_iommu_add_device(struct device *dev)
5767 {
5768         struct dmar_domain *dmar_domain;
5769         struct iommu_domain *domain;
5770         struct intel_iommu *iommu;
5771         struct iommu_group *group;
5772         u8 bus, devfn;
5773         int ret;
5774
5775         iommu = device_to_iommu(dev, &bus, &devfn);
5776         if (!iommu)
5777                 return -ENODEV;
5778
5779         iommu_device_link(&iommu->iommu, dev);
5780
5781         if (translation_pre_enabled(iommu))
5782                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5783
5784         group = iommu_group_get_for_dev(dev);
5785
5786         if (IS_ERR(group)) {
5787                 ret = PTR_ERR(group);
5788                 goto unlink;
5789         }
5790
5791         iommu_group_put(group);
5792
5793         domain = iommu_get_domain_for_dev(dev);
5794         dmar_domain = to_dmar_domain(domain);
5795         if (domain->type == IOMMU_DOMAIN_DMA) {
5796                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5797                         ret = iommu_request_dm_for_dev(dev);
5798                         if (ret) {
5799                                 dmar_remove_one_dev_info(dev);
5800                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5801                                 domain_add_dev_info(si_domain, dev);
5802                                 dev_info(dev,
5803                                          "Device uses a private identity domain.\n");
5804                         }
5805                 }
5806         } else {
5807                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5808                         ret = iommu_request_dma_domain_for_dev(dev);
5809                         if (ret) {
5810                                 dmar_remove_one_dev_info(dev);
5811                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5812                                 if (!get_private_domain_for_dev(dev)) {
5813                                         dev_warn(dev,
5814                                                  "Failed to get a private domain.\n");
5815                                         ret = -ENOMEM;
5816                                         goto unlink;
5817                                 }
5818
5819                                 dev_info(dev,
5820                                          "Device uses a private dma domain.\n");
5821                         }
5822                 }
5823         }
5824
5825         if (device_needs_bounce(dev)) {
5826                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5827                 set_dma_ops(dev, &bounce_dma_ops);
5828         }
5829
5830         return 0;
5831
5832 unlink:
5833         iommu_device_unlink(&iommu->iommu, dev);
5834         return ret;
5835 }
5836
5837 static void intel_iommu_remove_device(struct device *dev)
5838 {
5839         struct intel_iommu *iommu;
5840         u8 bus, devfn;
5841
5842         iommu = device_to_iommu(dev, &bus, &devfn);
5843         if (!iommu)
5844                 return;
5845
5846         dmar_remove_one_dev_info(dev);
5847
5848         iommu_group_remove_device(dev);
5849
5850         iommu_device_unlink(&iommu->iommu, dev);
5851
5852         if (device_needs_bounce(dev))
5853                 set_dma_ops(dev, NULL);
5854 }
5855
5856 static void intel_iommu_get_resv_regions(struct device *device,
5857                                          struct list_head *head)
5858 {
5859         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5860         struct iommu_resv_region *reg;
5861         struct dmar_rmrr_unit *rmrr;
5862         struct device *i_dev;
5863         int i;
5864
5865         down_read(&dmar_global_lock);
5866         for_each_rmrr_units(rmrr) {
5867                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5868                                           i, i_dev) {
5869                         struct iommu_resv_region *resv;
5870                         enum iommu_resv_type type;
5871                         size_t length;
5872
5873                         if (i_dev != device &&
5874                             !is_downstream_to_pci_bridge(device, i_dev))
5875                                 continue;
5876
5877                         length = rmrr->end_address - rmrr->base_address + 1;
5878
5879                         type = device_rmrr_is_relaxable(device) ?
5880                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5881
5882                         resv = iommu_alloc_resv_region(rmrr->base_address,
5883                                                        length, prot, type);
5884                         if (!resv)
5885                                 break;
5886
5887                         list_add_tail(&resv->list, head);
5888                 }
5889         }
5890         up_read(&dmar_global_lock);
5891
5892 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5893         if (dev_is_pci(device)) {
5894                 struct pci_dev *pdev = to_pci_dev(device);
5895
5896                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5897                         reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
5898                                                    IOMMU_RESV_DIRECT_RELAXABLE);
5899                         if (reg)
5900                                 list_add_tail(&reg->list, head);
5901                 }
5902         }
5903 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5904
5905         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5906                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5907                                       0, IOMMU_RESV_MSI);
5908         if (!reg)
5909                 return;
5910         list_add_tail(&reg->list, head);
5911 }
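
/*
 * Illustrative sketch (not part of the original driver): the regions built
 * by intel_iommu_get_resv_regions() above are normally consumed through the
 * generic IOMMU helpers.  The function below is hypothetical and only shows
 * the expected calling pattern, assuming the iommu_get_resv_regions()/
 * iommu_put_resv_regions() API of this kernel generation.
 */
static void __maybe_unused example_dump_resv_regions(struct device *dev)
{
        struct iommu_resv_region *region;
        LIST_HEAD(resv_regions);

        /* Collects the RMRR, ISA-bridge and IOAPIC/MSI regions via the callback above */
        iommu_get_resv_regions(dev, &resv_regions);

        list_for_each_entry(region, &resv_regions, list)
                dev_info(dev, "reserved region [%pa + %zx], type %d\n",
                         &region->start, region->length, region->type);

        iommu_put_resv_regions(dev, &resv_regions);
}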
5912
5913 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5914 {
5915         struct device_domain_info *info;
5916         struct context_entry *context;
5917         struct dmar_domain *domain;
5918         unsigned long flags;
5919         u64 ctx_lo;
5920         int ret;
5921
5922         domain = find_domain(dev);
5923         if (!domain)
5924                 return -EINVAL;
5925
5926         spin_lock_irqsave(&device_domain_lock, flags);
5927         spin_lock(&iommu->lock);
5928
5929         ret = -EINVAL;
5930         info = dev->archdata.iommu;
5931         if (!info || !info->pasid_supported)
5932                 goto out;
5933
5934         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5935         if (WARN_ON(!context))
5936                 goto out;
5937
5938         ctx_lo = context[0].lo;
5939
5940         if (!(ctx_lo & CONTEXT_PASIDE)) {
5941                 ctx_lo |= CONTEXT_PASIDE;
5942                 context[0].lo = ctx_lo;
5943                 wmb();
5944                 iommu->flush.flush_context(iommu,
5945                                            domain->iommu_did[iommu->seq_id],
5946                                            PCI_DEVID(info->bus, info->devfn),
5947                                            DMA_CCMD_MASK_NOBIT,
5948                                            DMA_CCMD_DEVICE_INVL);
5949         }
5950
5951         /* Enable PASID support in the device, if it wasn't already */
5952         if (!info->pasid_enabled)
5953                 iommu_enable_dev_iotlb(info);
5954
5955         ret = 0;
5956
5957  out:
5958         spin_unlock(&iommu->lock);
5959         spin_unlock_irqrestore(&device_domain_lock, flags);
5960
5961         return ret;
5962 }
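
/*
 * Illustrative sketch (not part of the original driver): a hypothetical
 * wrapper showing how intel_iommu_enable_pasid() is meant to be called.
 * It mirrors the lookup-then-enable pattern used by
 * intel_iommu_enable_auxd() further down in this file.
 */
static int __maybe_unused example_enable_pasid(struct device *dev)
{
        struct intel_iommu *iommu;
        u8 bus, devfn;

        iommu = device_to_iommu(dev, &bus, &devfn);
        if (!iommu)
                return -ENODEV;

        /* Sets CONTEXT_PASIDE in the context entry and flushes it */
        return intel_iommu_enable_pasid(iommu, dev);
}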
5963
5964 static void intel_iommu_apply_resv_region(struct device *dev,
5965                                           struct iommu_domain *domain,
5966                                           struct iommu_resv_region *region)
5967 {
5968         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5969         unsigned long start, end;
5970
5971         start = IOVA_PFN(region->start);
5972         end   = IOVA_PFN(region->start + region->length - 1);
5973
5974         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5975 }
5976
5977 static struct iommu_group *intel_iommu_device_group(struct device *dev)
5978 {
5979         if (dev_is_pci(dev))
5980                 return pci_device_group(dev);
5981         return generic_device_group(dev);
5982 }
5983
5984 #ifdef CONFIG_INTEL_IOMMU_SVM
5985 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5986 {
5987         struct intel_iommu *iommu;
5988         u8 bus, devfn;
5989
5990         if (iommu_dummy(dev)) {
5991                 dev_warn(dev,
5992                          "No IOMMU translation for device; cannot enable SVM\n");
5993                 return NULL;
5994         }
5995
5996         iommu = device_to_iommu(dev, &bus, &devfn);
5997         if (!iommu) {
5998                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5999                 return NULL;
6000         }
6001
6002         return iommu;
6003 }
6004 #endif /* CONFIG_INTEL_IOMMU_SVM */
6005
6006 static int intel_iommu_enable_auxd(struct device *dev)
6007 {
6008         struct device_domain_info *info;
6009         struct intel_iommu *iommu;
6010         unsigned long flags;
6011         u8 bus, devfn;
6012         int ret;
6013
6014         iommu = device_to_iommu(dev, &bus, &devfn);
6015         if (!iommu || dmar_disabled)
6016                 return -EINVAL;
6017
6018         if (!sm_supported(iommu) || !pasid_supported(iommu))
6019                 return -EINVAL;
6020
6021         ret = intel_iommu_enable_pasid(iommu, dev);
6022         if (ret)
6023                 return -ENODEV;
6024
6025         spin_lock_irqsave(&device_domain_lock, flags);
6026         info = dev->archdata.iommu;
6027         info->auxd_enabled = 1;
6028         spin_unlock_irqrestore(&device_domain_lock, flags);
6029
6030         return 0;
6031 }
6032
6033 static int intel_iommu_disable_auxd(struct device *dev)
6034 {
6035         struct device_domain_info *info;
6036         unsigned long flags;
6037
6038         spin_lock_irqsave(&device_domain_lock, flags);
6039         info = dev->archdata.iommu;
6040         if (!WARN_ON(!info))
6041                 info->auxd_enabled = 0;
6042         spin_unlock_irqrestore(&device_domain_lock, flags);
6043
6044         return 0;
6045 }
6046
6047 /*
6048  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
6049  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
6050  * specification so that system software and tools can detect endpoint
6051  * devices supporting Intel Scalable I/O Virtualization without any host
6052  * driver dependency.
6053  *
6054  * Returns the config space offset of the matching extended capability
6055  * structure, or 0 if the device does not support it.
6056  */
6057 static int siov_find_pci_dvsec(struct pci_dev *pdev)
6058 {
6059         int pos;
6060         u16 vendor, id;
6061
6062         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
6063         while (pos) {
6064                 pci_read_config_word(pdev, pos + 4, &vendor);
6065                 pci_read_config_word(pdev, pos + 8, &id);
6066                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
6067                         return pos;
6068
6069                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
6070         }
6071
6072         return 0;
6073 }
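
/*
 * Illustrative note (not part of the original driver): per the PCIe DVSEC
 * layout, the loop above reads DVSEC Header 1 at pos + 4 (vendor ID in the
 * low 16 bits) and DVSEC Header 2 at pos + 8 (DVSEC ID in the low 16 bits);
 * vendor PCI_VENDOR_ID_INTEL with DVSEC ID 5 identifies the Scalable IOV
 * capability described in the spec referenced above.  The hypothetical
 * helper below just shows how the function is used as a predicate.
 */
static bool __maybe_unused example_dev_supports_siov(struct pci_dev *pdev)
{
        /* A non-zero offset means the Scalable IOV DVSEC is present */
        return siov_find_pci_dvsec(pdev) != 0;
}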
6074
6075 static bool
6076 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
6077 {
6078         if (feat == IOMMU_DEV_FEAT_AUX) {
6079                 int ret;
6080
6081                 if (!dev_is_pci(dev) || dmar_disabled ||
6082                     !scalable_mode_support() || !iommu_pasid_support())
6083                         return false;
6084
6085                 ret = pci_pasid_features(to_pci_dev(dev));
6086                 if (ret < 0)
6087                         return false;
6088
6089                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
6090         }
6091
6092         return false;
6093 }
6094
6095 static int
6096 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
6097 {
6098         if (feat == IOMMU_DEV_FEAT_AUX)
6099                 return intel_iommu_enable_auxd(dev);
6100
6101         return -ENODEV;
6102 }
6103
6104 static int
6105 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
6106 {
6107         if (feat == IOMMU_DEV_FEAT_AUX)
6108                 return intel_iommu_disable_auxd(dev);
6109
6110         return -ENODEV;
6111 }
6112
6113 static bool
6114 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
6115 {
6116         struct device_domain_info *info = dev->archdata.iommu;
6117
6118         if (feat == IOMMU_DEV_FEAT_AUX)
6119                 return scalable_mode_support() && info && info->auxd_enabled;
6120
6121         return false;
6122 }
6123
6124 static int
6125 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
6126 {
6127         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6128
6129         return dmar_domain->default_pasid > 0 ?
6130                         dmar_domain->default_pasid : -EINVAL;
6131 }
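
/*
 * Illustrative sketch (not part of the original driver): how an endpoint
 * driver would reach the aux-domain callbacks implemented above through the
 * generic IOMMU API of this kernel generation (iommu_dev_has_feature(),
 * iommu_dev_enable_feature(), iommu_aux_attach_device(),
 * iommu_aux_get_pasid()).  The function name and error-unwind flow are
 * hypothetical; on success it returns the PASID the device should use for
 * DMA tagged to this auxiliary domain.
 */
static int __maybe_unused example_setup_aux_domain(struct device *dev,
                                                   struct iommu_domain **out_domain)
{
        struct iommu_domain *domain;
        int pasid, ret;

        if (!iommu_dev_has_feature(dev, IOMMU_DEV_FEAT_AUX))
                return -ENODEV;

        ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_AUX);
        if (ret)
                return ret;

        domain = iommu_domain_alloc(dev->bus);
        if (!domain) {
                ret = -ENOMEM;
                goto out_disable;
        }

        ret = iommu_aux_attach_device(domain, dev);
        if (ret)
                goto out_free;

        /* PASID to program into the device's work descriptors */
        pasid = iommu_aux_get_pasid(domain, dev);
        if (pasid < 0) {
                ret = pasid;
                goto out_detach;
        }

        *out_domain = domain;
        return pasid;

out_detach:
        iommu_aux_detach_device(domain, dev);
out_free:
        iommu_domain_free(domain);
out_disable:
        iommu_dev_disable_feature(dev, IOMMU_DEV_FEAT_AUX);
        return ret;
}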
6132
6133 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
6134                                            struct device *dev)
6135 {
6136         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
6137 }
6138
6139 static int
6140 intel_iommu_domain_set_attr(struct iommu_domain *domain,
6141                             enum iommu_attr attr, void *data)
6142 {
6143         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
6144         unsigned long flags;
6145         int ret = 0;
6146
6147         if (domain->type != IOMMU_DOMAIN_UNMANAGED)
6148                 return -EINVAL;
6149
6150         switch (attr) {
6151         case DOMAIN_ATTR_NESTING:
6152                 spin_lock_irqsave(&device_domain_lock, flags);
6153                 if (nested_mode_support() &&
6154                     list_empty(&dmar_domain->devices)) {
6155                         dmar_domain->flags |= DOMAIN_FLAG_NESTING_MODE;
6156                         dmar_domain->flags &= ~DOMAIN_FLAG_USE_FIRST_LEVEL;
6157                 } else {
6158                         ret = -ENODEV;
6159                 }
6160                 spin_unlock_irqrestore(&device_domain_lock, flags);
6161                 break;
6162         default:
6163                 ret = -EINVAL;
6164                 break;
6165         }
6166
6167         return ret;
6168 }
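
/*
 * Illustrative sketch (not part of the original driver): how a caller such
 * as a VFIO-style container would request nested translation through the
 * generic attribute API, which lands in intel_iommu_domain_set_attr()
 * above.  Per the checks above, this must be done on an unmanaged domain
 * before any device is attached.
 */
static int __maybe_unused example_enable_nesting(struct iommu_domain *domain)
{
        int enable = 1;

        /* This driver ignores the data argument and just flips the domain flag */
        return iommu_domain_set_attr(domain, DOMAIN_ATTR_NESTING, &enable);
}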
6169
6170 const struct iommu_ops intel_iommu_ops = {
6171         .capable                = intel_iommu_capable,
6172         .domain_alloc           = intel_iommu_domain_alloc,
6173         .domain_free            = intel_iommu_domain_free,
6174         .domain_set_attr        = intel_iommu_domain_set_attr,
6175         .attach_dev             = intel_iommu_attach_device,
6176         .detach_dev             = intel_iommu_detach_device,
6177         .aux_attach_dev         = intel_iommu_aux_attach_device,
6178         .aux_detach_dev         = intel_iommu_aux_detach_device,
6179         .aux_get_pasid          = intel_iommu_aux_get_pasid,
6180         .map                    = intel_iommu_map,
6181         .unmap                  = intel_iommu_unmap,
6182         .iova_to_phys           = intel_iommu_iova_to_phys,
6183         .add_device             = intel_iommu_add_device,
6184         .remove_device          = intel_iommu_remove_device,
6185         .get_resv_regions       = intel_iommu_get_resv_regions,
6186         .put_resv_regions       = generic_iommu_put_resv_regions,
6187         .apply_resv_region      = intel_iommu_apply_resv_region,
6188         .device_group           = intel_iommu_device_group,
6189         .dev_has_feat           = intel_iommu_dev_has_feat,
6190         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
6191         .dev_enable_feat        = intel_iommu_dev_enable_feat,
6192         .dev_disable_feat       = intel_iommu_dev_disable_feat,
6193         .is_attach_deferred     = intel_iommu_is_attach_deferred,
6194         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
6195 };
6196
6197 static void quirk_iommu_igfx(struct pci_dev *dev)
6198 {
6199         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
6200         dmar_map_gfx = 0;
6201 }
6202
6203 /* G4x/GM45 integrated gfx dmar support is totally busted. */
6204 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
6205 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
6206 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
6207 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
6208 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
6209 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
6210 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
6211
6212 /* Broadwell igfx malfunctions with dmar */
6213 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
6214 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
6215 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
6216 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
6217 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
6218 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6219 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6220 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6221 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6222 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6223 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6224 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6225 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6226 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6227 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6228 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6229 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6230 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6231 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6232 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6233 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6234 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6235 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6236 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6237
6238 static void quirk_iommu_rwbf(struct pci_dev *dev)
6239 {
6240         /*
6241          * Mobile 4 Series Chipset neglects to set RWBF capability,
6242          * but needs it. Same seems to hold for the desktop versions.
6243          */
6244         pci_info(dev, "Forcing write-buffer flush capability\n");
6245         rwbf_quirk = 1;
6246 }
6247
6248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6249 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6250 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6251 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6252 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6253 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6254 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6255
6256 #define GGC 0x52
6257 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6258 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6259 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6260 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6261 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6262 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6263 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6264 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6265
6266 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6267 {
6268         unsigned short ggc;
6269
6270         if (pci_read_config_word(dev, GGC, &ggc))
6271                 return;
6272
6273         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6274                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6275                 dmar_map_gfx = 0;
6276         } else if (dmar_map_gfx) {
6277                 /* we have to ensure the gfx device is idle before we flush */
6278                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6279                 intel_iommu_strict = 1;
6280         }
6281 }
6282 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6283 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6284 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6285 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6286
6287 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6288    ISOCH DMAR unit for the Azalia sound device, but not give it any
6289    TLB entries, which causes it to deadlock. Check for that.  We do
6290    this in a function called from init_dmars(), instead of in a PCI
6291    quirk, because we don't want to print the obnoxious "BIOS broken"
6292    message if VT-d is actually disabled.
6293 */
6294 static void __init check_tylersburg_isoch(void)
6295 {
6296         struct pci_dev *pdev;
6297         uint32_t vtisochctrl;
6298
6299         /* If there's no Azalia in the system anyway, forget it. */
6300         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6301         if (!pdev)
6302                 return;
6303         pci_dev_put(pdev);
6304
6305         /* System Management Registers. Might be hidden, in which case
6306            we can't do the sanity check. But that's OK, because the
6307            known-broken BIOSes _don't_ actually hide it, so far. */
6308         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6309         if (!pdev)
6310                 return;
6311
6312         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6313                 pci_dev_put(pdev);
6314                 return;
6315         }
6316
6317         pci_dev_put(pdev);
6318
6319         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6320         if (vtisochctrl & 1)
6321                 return;
6322
6323         /* Drop all bits other than the number of TLB entries */
6324         vtisochctrl &= 0x1c;
6325
6326         /* If we have the recommended number of TLB entries (16), fine. */
6327         if (vtisochctrl == 0x10)
6328                 return;
6329
6330         /* Zero TLB entries? Then the unit will simply deadlock. */
6331         if (!vtisochctrl) {
6332                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6333                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6334                      dmi_get_system_info(DMI_BIOS_VENDOR),
6335                      dmi_get_system_info(DMI_BIOS_VERSION),
6336                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6337                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6338                 return;
6339         }
6340
6341         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
6342                 vtisochctrl);
6343 }