drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>,
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <linux/swiotlb.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48 #include <trace/events/intel_iommu.h>
49
50 #include "irq_remapping.h"
51 #include "intel-pasid.h"
52
53 #define ROOT_SIZE               VTD_PAGE_SIZE
54 #define CONTEXT_SIZE            VTD_PAGE_SIZE
55
56 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
57 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
58 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
59 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
60
61 #define IOAPIC_RANGE_START      (0xfee00000)
62 #define IOAPIC_RANGE_END        (0xfeefffff)
63 #define IOVA_START_ADDR         (0x1000)
64
65 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
66
67 #define MAX_AGAW_WIDTH 64
68 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
69
70 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
71 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
72
73 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
74    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
75 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
76                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
77 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
78
79 /* IO virtual address start page frame number */
80 #define IOVA_START_PFN          (1)
81
82 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
83
84 /* page table handling */
85 #define LEVEL_STRIDE            (9)
86 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
87
88 /*
89  * This bitmap is used to advertise the page sizes our hardware supports
90  * to the IOMMU core, which will then use this information to split
91  * physically contiguous memory regions it is mapping into page sizes
92  * that we support.
93  *
94  * Traditionally the IOMMU core just handed us the mappings directly,
95  * after making sure the size was a power-of-two multiple of a 4KiB page
96  * and that the mapping had natural alignment.
97  *
98  * To retain this behavior, we currently advertise that we support
99  * all page sizes that are a power-of-two multiple of 4KiB.
100  *
101  * If at some point we'd like to utilize the IOMMU core's new behavior,
102  * we could change this to advertise the real page sizes we support.
103  */
104 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
105
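/*
 * Helpers for converting between the adjusted guest address width (AGAW)
 * encoding, the number of page-table levels and the address width in
 * bits. Each level adds LEVEL_STRIDE (9) bits on top of a 30-bit,
 * two-level base: for example agaw 2 gives 2 + 2 = 4 levels and
 * 30 + 2 * 9 = 48 bits of address.
 */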
106 static inline int agaw_to_level(int agaw)
107 {
108         return agaw + 2;
109 }
110
111 static inline int agaw_to_width(int agaw)
112 {
113         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
114 }
115
116 static inline int width_to_agaw(int width)
117 {
118         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
119 }
120
121 static inline unsigned int level_to_offset_bits(int level)
122 {
123         return (level - 1) * LEVEL_STRIDE;
124 }
125
126 static inline int pfn_level_offset(unsigned long pfn, int level)
127 {
128         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
129 }
130
131 static inline unsigned long level_mask(int level)
132 {
133         return -1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long level_size(int level)
137 {
138         return 1UL << level_to_offset_bits(level);
139 }
140
141 static inline unsigned long align_to_level(unsigned long pfn, int level)
142 {
143         return (pfn + level_size(level) - 1) & level_mask(level);
144 }
145
146 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
147 {
148         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
149 }
150
151 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
152    are never going to work. */
153 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
154 {
155         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157
158 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
159 {
160         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
161 }
162 static inline unsigned long page_to_dma_pfn(struct page *pg)
163 {
164         return mm_to_dma_pfn(page_to_pfn(pg));
165 }
166 static inline unsigned long virt_to_dma_pfn(void *p)
167 {
168         return page_to_dma_pfn(virt_to_page(p));
169 }
170
171 /* global iommu list, set NULL for ignored DMAR units */
172 static struct intel_iommu **g_iommus;
173
174 static void __init check_tylersburg_isoch(void);
175 static int rwbf_quirk;
176
177 /*
178  * Set to 1 to panic the kernel if VT-d cannot be enabled successfully
179  * (used when the kernel is launched with TXT).
180  */
181 static int force_on = 0;
182 int intel_iommu_tboot_noforce;
183 static int no_platform_optin;
184
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186
187 /*
188  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
189  * if marked present.
190  */
191 static phys_addr_t root_entry_lctp(struct root_entry *re)
192 {
193         if (!(re->lo & 1))
194                 return 0;
195
196         return re->lo & VTD_PAGE_MASK;
197 }
198
199 /*
200  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
201  * if marked present.
202  */
203 static phys_addr_t root_entry_uctp(struct root_entry *re)
204 {
205         if (!(re->hi & 1))
206                 return 0;
207
208         return re->hi & VTD_PAGE_MASK;
209 }
210
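/*
 * Context-entry field helpers. As used by the helpers below, the low
 * 64 bits hold the present bit (bit 0), the fault-processing-disable
 * bit cleared by context_set_fault_enable() (bit 1), the translation
 * type (bits 2:3), a PASID-enable flag (bit 11) and the page-table root
 * address (bits 12:63); the high 64 bits hold the address width
 * (bits 0:2) and the domain id (bits 8:23), with bit 3 used by this
 * driver to mark entries copied from a previous kernel.
 */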
211 static inline void context_clear_pasid_enable(struct context_entry *context)
212 {
213         context->lo &= ~(1ULL << 11);
214 }
215
216 static inline bool context_pasid_enabled(struct context_entry *context)
217 {
218         return !!(context->lo & (1ULL << 11));
219 }
220
221 static inline void context_set_copied(struct context_entry *context)
222 {
223         context->hi |= (1ull << 3);
224 }
225
226 static inline bool context_copied(struct context_entry *context)
227 {
228         return !!(context->hi & (1ULL << 3));
229 }
230
231 static inline bool __context_present(struct context_entry *context)
232 {
233         return (context->lo & 1);
234 }
235
236 bool context_present(struct context_entry *context)
237 {
238         return context_pasid_enabled(context) ?
239              __context_present(context) :
240              __context_present(context) && !context_copied(context);
241 }
242
243 static inline void context_set_present(struct context_entry *context)
244 {
245         context->lo |= 1;
246 }
247
248 static inline void context_set_fault_enable(struct context_entry *context)
249 {
250         context->lo &= (((u64)-1) << 2) | 1;
251 }
252
253 static inline void context_set_translation_type(struct context_entry *context,
254                                                 unsigned long value)
255 {
256         context->lo &= (((u64)-1) << 4) | 3;
257         context->lo |= (value & 3) << 2;
258 }
259
260 static inline void context_set_address_root(struct context_entry *context,
261                                             unsigned long value)
262 {
263         context->lo &= ~VTD_PAGE_MASK;
264         context->lo |= value & VTD_PAGE_MASK;
265 }
266
267 static inline void context_set_address_width(struct context_entry *context,
268                                              unsigned long value)
269 {
270         context->hi |= value & 7;
271 }
272
273 static inline void context_set_domain_id(struct context_entry *context,
274                                          unsigned long value)
275 {
276         context->hi |= (value & ((1 << 16) - 1)) << 8;
277 }
278
279 static inline int context_domain_id(struct context_entry *c)
280 {
281         return((c->hi >> 8) & 0xffff);
282 }
283
284 static inline void context_clear_entry(struct context_entry *context)
285 {
286         context->lo = 0;
287         context->hi = 0;
288 }
289
290 /*
291  * This domain is a statically allocated identity-mapping domain.
292  *      1. This domain creates a static 1:1 mapping to all usable memory.
293  *      2. It maps to each iommu if successful.
294  *      3. Each iommu maps to this domain if successful.
295  */
296 static struct dmar_domain *si_domain;
297 static int hw_pass_through = 1;
298
299 /* si_domain contains multiple devices */
300 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
301
302 /*
303  * This is a DMA domain allocated through the iommu domain allocation
304  * interface. But one or more devices belonging to this domain have
305  * been chosen to use a private domain. We should avoid using the
306  * map/unmap/iova_to_phys APIs on it.
307  */
308 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
309
310 #define for_each_domain_iommu(idx, domain)                      \
311         for (idx = 0; idx < g_num_of_iommus; idx++)             \
312                 if (domain->iommu_refcnt[idx])
313
314 struct dmar_rmrr_unit {
315         struct list_head list;          /* list of rmrr units   */
316         struct acpi_dmar_header *hdr;   /* ACPI header          */
317         u64     base_address;           /* reserved base address*/
318         u64     end_address;            /* reserved end address */
319         struct dmar_dev_scope *devices; /* target devices */
320         int     devices_cnt;            /* target device count */
321 };
322
323 struct dmar_atsr_unit {
324         struct list_head list;          /* list of ATSR units */
325         struct acpi_dmar_header *hdr;   /* ACPI header */
326         struct dmar_dev_scope *devices; /* target devices */
327         int devices_cnt;                /* target device count */
328         u8 include_all:1;               /* include all ports */
329 };
330
331 static LIST_HEAD(dmar_atsr_units);
332 static LIST_HEAD(dmar_rmrr_units);
333
334 #define for_each_rmrr_units(rmrr) \
335         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
336
337 /* number of IOMMUs, used for sizing and indexing g_iommus */
338 static int g_num_of_iommus;
339
340 static void domain_exit(struct dmar_domain *domain);
341 static void domain_remove_dev_info(struct dmar_domain *domain);
342 static void dmar_remove_one_dev_info(struct device *dev);
343 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
344 static void domain_context_clear(struct intel_iommu *iommu,
345                                  struct device *dev);
346 static int domain_detach_iommu(struct dmar_domain *domain,
347                                struct intel_iommu *iommu);
348 static bool device_is_rmrr_locked(struct device *dev);
349 static int intel_iommu_attach_device(struct iommu_domain *domain,
350                                      struct device *dev);
351 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
352                                             dma_addr_t iova);
353
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
356 #else
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
359
360 int intel_iommu_sm;
361 int intel_iommu_enabled = 0;
362 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363
364 static int dmar_map_gfx = 1;
365 static int dmar_forcedac;
366 static int intel_iommu_strict;
367 static int intel_iommu_superpage = 1;
368 static int iommu_identity_mapping;
369 static int intel_no_bounce;
370
371 #define IDENTMAP_ALL            1
372 #define IDENTMAP_GFX            2
373 #define IDENTMAP_AZALIA         4
374
375 int intel_iommu_gfx_mapped;
376 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
377
378 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
379 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
382
383 #define device_needs_bounce(d) (!intel_no_bounce && dev_is_pci(d) &&    \
384                                 to_pci_dev(d)->untrusted)
385
386 /*
387  * Iterate over elements in device_domain_list and call the specified
388  * callback @fn against each element.
389  */
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391                                      void *data), void *data)
392 {
393         int ret = 0;
394         unsigned long flags;
395         struct device_domain_info *info;
396
397         spin_lock_irqsave(&device_domain_lock, flags);
398         list_for_each_entry(info, &device_domain_list, global) {
399                 ret = fn(info, data);
400                 if (ret) {
401                         spin_unlock_irqrestore(&device_domain_lock, flags);
402                         return ret;
403                 }
404         }
405         spin_unlock_irqrestore(&device_domain_lock, flags);
406
407         return 0;
408 }
409
410 const struct iommu_ops intel_iommu_ops;
411
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 {
414         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 }
416
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 }
421
422 static void init_translation_status(struct intel_iommu *iommu)
423 {
424         u32 gsts;
425
426         gsts = readl(iommu->reg + DMAR_GSTS_REG);
427         if (gsts & DMA_GSTS_TES)
428                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 }
430
431 /* Convert a generic struct iommu_domain to the private struct dmar_domain */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 {
434         return container_of(dom, struct dmar_domain, domain);
435 }
436
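/*
 * Parser for the "intel_iommu=" boot parameter. The comma-separated
 * options recognized below are: on, off, igfx_off, forcedac, strict,
 * sp_off, sm_on, tboot_noforce and nobounce, e.g. "intel_iommu=on,sm_on".
 */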
437 static int __init intel_iommu_setup(char *str)
438 {
439         if (!str)
440                 return -EINVAL;
441         while (*str) {
442                 if (!strncmp(str, "on", 2)) {
443                         dmar_disabled = 0;
444                         pr_info("IOMMU enabled\n");
445                 } else if (!strncmp(str, "off", 3)) {
446                         dmar_disabled = 1;
447                         no_platform_optin = 1;
448                         pr_info("IOMMU disabled\n");
449                 } else if (!strncmp(str, "igfx_off", 8)) {
450                         dmar_map_gfx = 0;
451                         pr_info("Disable GFX device mapping\n");
452                 } else if (!strncmp(str, "forcedac", 8)) {
453                         pr_info("Forcing DAC for PCI devices\n");
454                         dmar_forcedac = 1;
455                 } else if (!strncmp(str, "strict", 6)) {
456                         pr_info("Disable batched IOTLB flush\n");
457                         intel_iommu_strict = 1;
458                 } else if (!strncmp(str, "sp_off", 6)) {
459                         pr_info("Disable supported super page\n");
460                         intel_iommu_superpage = 0;
461                 } else if (!strncmp(str, "sm_on", 5)) {
462                         pr_info("Intel-IOMMU: scalable mode supported\n");
463                         intel_iommu_sm = 1;
464                 } else if (!strncmp(str, "tboot_noforce", 13)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467                         intel_iommu_tboot_noforce = 1;
468                 } else if (!strncmp(str, "nobounce", 8)) {
469                         pr_info("Intel-IOMMU: No bounce buffer. This could expose security risks of DMA attacks\n");
470                         intel_no_bounce = 1;
471                 }
472
473                 str += strcspn(str, ",");
474                 while (*str == ',')
475                         str++;
476         }
477         return 0;
478 }
479 __setup("intel_iommu=", intel_iommu_setup);
480
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
483
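/*
 * Domain-id to dmar_domain lookup uses a two-level table: the upper
 * eight bits of the domain id select a page of 256 dmar_domain pointers
 * in iommu->domains, and the low eight bits index into that page.
 * Second-level pages are allocated lazily in set_iommu_domain().
 */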
484 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
485 {
486         struct dmar_domain **domains;
487         int idx = did >> 8;
488
489         domains = iommu->domains[idx];
490         if (!domains)
491                 return NULL;
492
493         return domains[did & 0xff];
494 }
495
496 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
497                              struct dmar_domain *domain)
498 {
499         struct dmar_domain **domains;
500         int idx = did >> 8;
501
502         if (!iommu->domains[idx]) {
503                 size_t size = 256 * sizeof(struct dmar_domain *);
504                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
505         }
506
507         domains = iommu->domains[idx];
508         if (WARN_ON(!domains))
509                 return;
510         else
511                 domains[did & 0xff] = domain;
512 }
513
514 void *alloc_pgtable_page(int node)
515 {
516         struct page *page;
517         void *vaddr = NULL;
518
519         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
520         if (page)
521                 vaddr = page_address(page);
522         return vaddr;
523 }
524
525 void free_pgtable_page(void *vaddr)
526 {
527         free_page((unsigned long)vaddr);
528 }
529
530 static inline void *alloc_domain_mem(void)
531 {
532         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
533 }
534
535 static void free_domain_mem(void *vaddr)
536 {
537         kmem_cache_free(iommu_domain_cache, vaddr);
538 }
539
540 static inline void * alloc_devinfo_mem(void)
541 {
542         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
543 }
544
545 static inline void free_devinfo_mem(void *vaddr)
546 {
547         kmem_cache_free(iommu_devinfo_cache, vaddr);
548 }
549
550 static inline int domain_type_is_si(struct dmar_domain *domain)
551 {
552         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 }
554
555 static inline int domain_pfn_supported(struct dmar_domain *domain,
556                                        unsigned long pfn)
557 {
558         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
559
560         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
561 }
562
563 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
564 {
565         unsigned long sagaw;
566         int agaw = -1;
567
568         sagaw = cap_sagaw(iommu->cap);
569         for (agaw = width_to_agaw(max_gaw);
570              agaw >= 0; agaw--) {
571                 if (test_bit(agaw, &sagaw))
572                         break;
573         }
574
575         return agaw;
576 }
577
578 /*
579  * Calculate max SAGAW for each iommu.
580  */
581 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
582 {
583         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
584 }
585
586 /*
587  * Calculate the agaw for each iommu.
588  * "SAGAW" may be different across iommus; use a default agaw and fall
589  * back to a smaller supported agaw for iommus that don't support the default.
590  */
591 int iommu_calculate_agaw(struct intel_iommu *iommu)
592 {
593         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
594 }
595
596 /* This function only returns a single iommu in a domain */
597 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
598 {
599         int iommu_id;
600
601         /* si_domain and vm domain should not get here. */
602         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
603                 return NULL;
604
605         for_each_domain_iommu(iommu_id, domain)
606                 break;
607
608         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
609                 return NULL;
610
611         return g_iommus[iommu_id];
612 }
613
614 static void domain_update_iommu_coherency(struct dmar_domain *domain)
615 {
616         struct dmar_drhd_unit *drhd;
617         struct intel_iommu *iommu;
618         bool found = false;
619         int i;
620
621         domain->iommu_coherency = 1;
622
623         for_each_domain_iommu(i, domain) {
624                 found = true;
625                 if (!ecap_coherent(g_iommus[i]->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         if (found)
631                 return;
632
633         /* No hardware attached; use lowest common denominator */
634         rcu_read_lock();
635         for_each_active_iommu(iommu, drhd) {
636                 if (!ecap_coherent(iommu->ecap)) {
637                         domain->iommu_coherency = 0;
638                         break;
639                 }
640         }
641         rcu_read_unlock();
642 }
643
644 static int domain_update_iommu_snooping(struct intel_iommu *skip)
645 {
646         struct dmar_drhd_unit *drhd;
647         struct intel_iommu *iommu;
648         int ret = 1;
649
650         rcu_read_lock();
651         for_each_active_iommu(iommu, drhd) {
652                 if (iommu != skip) {
653                         if (!ecap_sc_support(iommu->ecap)) {
654                                 ret = 0;
655                                 break;
656                         }
657                 }
658         }
659         rcu_read_unlock();
660
661         return ret;
662 }
663
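/*
 * Work out the largest superpage level usable by every IOMMU (optionally
 * skipping @skip) by ANDing the per-unit capability masks. The fls() of
 * the result is the number of supported superpage levels: 0 means 4KiB
 * pages only, 1 means 2MiB superpages and 2 adds 1GiB, following the
 * VT-d SPS capability encoding.
 */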
664 static int domain_update_iommu_superpage(struct intel_iommu *skip)
665 {
666         struct dmar_drhd_unit *drhd;
667         struct intel_iommu *iommu;
668         int mask = 0xf;
669
670         if (!intel_iommu_superpage) {
671                 return 0;
672         }
673
674         /* set iommu_superpage to the smallest common denominator */
675         rcu_read_lock();
676         for_each_active_iommu(iommu, drhd) {
677                 if (iommu != skip) {
678                         mask &= cap_super_page_val(iommu->cap);
679                         if (!mask)
680                                 break;
681                 }
682         }
683         rcu_read_unlock();
684
685         return fls(mask);
686 }
687
688 /* Some capabilities may be different across iommus */
689 static void domain_update_iommu_cap(struct dmar_domain *domain)
690 {
691         domain_update_iommu_coherency(domain);
692         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
693         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
694 }
695
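/*
 * Return the context entry for @bus/@devfn, allocating the context
 * table page on demand when @alloc is set. In scalable mode each root
 * entry covers the bus twice: the low half maps devfn 0x00-0x7f and
 * the high half maps devfn 0x80-0xff, with two context-entry slots per
 * device (hence the devfn *= 2 below).
 */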
696 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
697                                          u8 devfn, int alloc)
698 {
699         struct root_entry *root = &iommu->root_entry[bus];
700         struct context_entry *context;
701         u64 *entry;
702
703         entry = &root->lo;
704         if (sm_supported(iommu)) {
705                 if (devfn >= 0x80) {
706                         devfn -= 0x80;
707                         entry = &root->hi;
708                 }
709                 devfn *= 2;
710         }
711         if (*entry & 1)
712                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
713         else {
714                 unsigned long phy_addr;
715                 if (!alloc)
716                         return NULL;
717
718                 context = alloc_pgtable_page(iommu->node);
719                 if (!context)
720                         return NULL;
721
722                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
723                 phy_addr = virt_to_phys((void *)context);
724                 *entry = phy_addr | 1;
725                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
726         }
727         return &context[devfn];
728 }
729
730 static int iommu_dummy(struct device *dev)
731 {
732         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
733 }
734
735 /**
736  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
737  *                               sub-hierarchy of a candidate PCI-PCI bridge
738  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
739  * @bridge: the candidate PCI-PCI bridge
740  *
741  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
742  */
743 static bool
744 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
745 {
746         struct pci_dev *pdev, *pbridge;
747
748         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
749                 return false;
750
751         pdev = to_pci_dev(dev);
752         pbridge = to_pci_dev(bridge);
753
754         if (pbridge->subordinate &&
755             pbridge->subordinate->number <= pdev->bus->number &&
756             pbridge->subordinate->busn_res.end >= pdev->bus->number)
757                 return true;
758
759         return false;
760 }
761
762 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
763 {
764         struct dmar_drhd_unit *drhd = NULL;
765         struct intel_iommu *iommu;
766         struct device *tmp;
767         struct pci_dev *pdev = NULL;
768         u16 segment = 0;
769         int i;
770
771         if (iommu_dummy(dev))
772                 return NULL;
773
774         if (dev_is_pci(dev)) {
775                 struct pci_dev *pf_pdev;
776
777                 pdev = to_pci_dev(dev);
778
779 #ifdef CONFIG_X86
780                 /* VMD child devices currently cannot be handled individually */
781                 if (is_vmd(pdev->bus))
782                         return NULL;
783 #endif
784
785                 /* VFs aren't listed in scope tables; we need to look up
786                  * the PF instead to find the IOMMU. */
787                 pf_pdev = pci_physfn(pdev);
788                 dev = &pf_pdev->dev;
789                 segment = pci_domain_nr(pdev->bus);
790         } else if (has_acpi_companion(dev))
791                 dev = &ACPI_COMPANION(dev)->dev;
792
793         rcu_read_lock();
794         for_each_active_iommu(iommu, drhd) {
795                 if (pdev && segment != drhd->segment)
796                         continue;
797
798                 for_each_active_dev_scope(drhd->devices,
799                                           drhd->devices_cnt, i, tmp) {
800                         if (tmp == dev) {
801                                 /* For a VF use its original BDF# not that of the PF
802                                  * which we used for the IOMMU lookup. Strictly speaking
803                                  * we could do this for all PCI devices; we only need to
804                                  * get the BDF# from the scope table for ACPI matches. */
805                                 if (pdev && pdev->is_virtfn)
806                                         goto got_pdev;
807
808                                 *bus = drhd->devices[i].bus;
809                                 *devfn = drhd->devices[i].devfn;
810                                 goto out;
811                         }
812
813                         if (is_downstream_to_pci_bridge(dev, tmp))
814                                 goto got_pdev;
815                 }
816
817                 if (pdev && drhd->include_all) {
818                 got_pdev:
819                         *bus = pdev->bus->number;
820                         *devfn = pdev->devfn;
821                         goto out;
822                 }
823         }
824         iommu = NULL;
825  out:
826         rcu_read_unlock();
827
828         return iommu;
829 }
830
831 static void domain_flush_cache(struct dmar_domain *domain,
832                                void *addr, int size)
833 {
834         if (!domain->iommu_coherency)
835                 clflush_cache_range(addr, size);
836 }
837
838 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
839 {
840         struct context_entry *context;
841         int ret = 0;
842         unsigned long flags;
843
844         spin_lock_irqsave(&iommu->lock, flags);
845         context = iommu_context_addr(iommu, bus, devfn, 0);
846         if (context)
847                 ret = context_present(context);
848         spin_unlock_irqrestore(&iommu->lock, flags);
849         return ret;
850 }
851
852 static void free_context_table(struct intel_iommu *iommu)
853 {
854         int i;
855         unsigned long flags;
856         struct context_entry *context;
857
858         spin_lock_irqsave(&iommu->lock, flags);
859         if (!iommu->root_entry) {
860                 goto out;
861         }
862         for (i = 0; i < ROOT_ENTRY_NR; i++) {
863                 context = iommu_context_addr(iommu, i, 0, 0);
864                 if (context)
865                         free_pgtable_page(context);
866
867                 if (!sm_supported(iommu))
868                         continue;
869
870                 context = iommu_context_addr(iommu, i, 0x80, 0);
871                 if (context)
872                         free_pgtable_page(context);
873
874         }
875         free_pgtable_page(iommu->root_entry);
876         iommu->root_entry = NULL;
877 out:
878         spin_unlock_irqrestore(&iommu->lock, flags);
879 }
880
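/*
 * Walk (and, when needed, build) the page table down to @pfn. With
 * *target_level == 0 the walk stops at the leaf or at the first
 * non-present/superpage entry, and *target_level is set to the level
 * reached; a non-zero *target_level stops at exactly that level,
 * allocating any missing intermediate page-table pages on the way down.
 */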
881 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
882                                       unsigned long pfn, int *target_level)
883 {
884         struct dma_pte *parent, *pte;
885         int level = agaw_to_level(domain->agaw);
886         int offset;
887
888         BUG_ON(!domain->pgd);
889
890         if (!domain_pfn_supported(domain, pfn))
891                 /* Address beyond IOMMU's addressing capabilities. */
892                 return NULL;
893
894         parent = domain->pgd;
895
896         while (1) {
897                 void *tmp_page;
898
899                 offset = pfn_level_offset(pfn, level);
900                 pte = &parent[offset];
901                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
902                         break;
903                 if (level == *target_level)
904                         break;
905
906                 if (!dma_pte_present(pte)) {
907                         uint64_t pteval;
908
909                         tmp_page = alloc_pgtable_page(domain->nid);
910
911                         if (!tmp_page)
912                                 return NULL;
913
914                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
915                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
916                         if (cmpxchg64(&pte->val, 0ULL, pteval))
917                                 /* Someone else set it while we were thinking; use theirs. */
918                                 free_pgtable_page(tmp_page);
919                         else
920                                 domain_flush_cache(domain, pte, sizeof(*pte));
921                 }
922                 if (level == 1)
923                         break;
924
925                 parent = phys_to_virt(dma_pte_addr(pte));
926                 level--;
927         }
928
929         if (!*target_level)
930                 *target_level = level;
931
932         return pte;
933 }
934
935 /* return address's pte at specific level */
936 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
937                                          unsigned long pfn,
938                                          int level, int *large_page)
939 {
940         struct dma_pte *parent, *pte;
941         int total = agaw_to_level(domain->agaw);
942         int offset;
943
944         parent = domain->pgd;
945         while (level <= total) {
946                 offset = pfn_level_offset(pfn, total);
947                 pte = &parent[offset];
948                 if (level == total)
949                         return pte;
950
951                 if (!dma_pte_present(pte)) {
952                         *large_page = total;
953                         break;
954                 }
955
956                 if (dma_pte_superpage(pte)) {
957                         *large_page = total;
958                         return pte;
959                 }
960
961                 parent = phys_to_virt(dma_pte_addr(pte));
962                 total--;
963         }
964         return NULL;
965 }
966
967 /* Clear last-level ptes; a tlb flush should follow */
968 static void dma_pte_clear_range(struct dmar_domain *domain,
969                                 unsigned long start_pfn,
970                                 unsigned long last_pfn)
971 {
972         unsigned int large_page;
973         struct dma_pte *first_pte, *pte;
974
975         BUG_ON(!domain_pfn_supported(domain, start_pfn));
976         BUG_ON(!domain_pfn_supported(domain, last_pfn));
977         BUG_ON(start_pfn > last_pfn);
978
979         /* we don't need lock here; nobody else touches the iova range */
980         do {
981                 large_page = 1;
982                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
983                 if (!pte) {
984                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
985                         continue;
986                 }
987                 do {
988                         dma_clear_pte(pte);
989                         start_pfn += lvl_to_nr_pages(large_page);
990                         pte++;
991                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
992
993                 domain_flush_cache(domain, first_pte,
994                                    (void *)pte - (void *)first_pte);
995
996         } while (start_pfn && start_pfn <= last_pfn);
997 }
998
999 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1000                                int retain_level, struct dma_pte *pte,
1001                                unsigned long pfn, unsigned long start_pfn,
1002                                unsigned long last_pfn)
1003 {
1004         pfn = max(start_pfn, pfn);
1005         pte = &pte[pfn_level_offset(pfn, level)];
1006
1007         do {
1008                 unsigned long level_pfn;
1009                 struct dma_pte *level_pte;
1010
1011                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1012                         goto next;
1013
1014                 level_pfn = pfn & level_mask(level);
1015                 level_pte = phys_to_virt(dma_pte_addr(pte));
1016
1017                 if (level > 2) {
1018                         dma_pte_free_level(domain, level - 1, retain_level,
1019                                            level_pte, level_pfn, start_pfn,
1020                                            last_pfn);
1021                 }
1022
1023                 /*
1024                  * Free the page table if we're below the level we want to
1025                  * retain and the range covers the entire table.
1026                  */
1027                 if (level < retain_level && !(start_pfn > level_pfn ||
1028                       last_pfn < level_pfn + level_size(level) - 1)) {
1029                         dma_clear_pte(pte);
1030                         domain_flush_cache(domain, pte, sizeof(*pte));
1031                         free_pgtable_page(level_pte);
1032                 }
1033 next:
1034                 pfn += level_size(level);
1035         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1036 }
1037
1038 /*
1039  * clear last level (leaf) ptes and free page table pages below the
1040  * level we wish to keep intact.
1041  */
1042 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1043                                    unsigned long start_pfn,
1044                                    unsigned long last_pfn,
1045                                    int retain_level)
1046 {
1047         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1048         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1049         BUG_ON(start_pfn > last_pfn);
1050
1051         dma_pte_clear_range(domain, start_pfn, last_pfn);
1052
1053         /* We don't need lock here; nobody else touches the iova range */
1054         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1055                            domain->pgd, 0, start_pfn, last_pfn);
1056
1057         /* free pgd */
1058         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1059                 free_pgtable_page(domain->pgd);
1060                 domain->pgd = NULL;
1061         }
1062 }
1063
1064 /* When a page at a given level is being unlinked from its parent, we don't
1065    need to *modify* it at all. All we need to do is make a list of all the
1066    pages which can be freed just as soon as we've flushed the IOTLB and we
1067    know the hardware page-walk will no longer touch them.
1068    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1069    be freed. */
1070 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1071                                             int level, struct dma_pte *pte,
1072                                             struct page *freelist)
1073 {
1074         struct page *pg;
1075
1076         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1077         pg->freelist = freelist;
1078         freelist = pg;
1079
1080         if (level == 1)
1081                 return freelist;
1082
1083         pte = page_address(pg);
1084         do {
1085                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1086                         freelist = dma_pte_list_pagetables(domain, level - 1,
1087                                                            pte, freelist);
1088                 pte++;
1089         } while (!first_pte_in_page(pte));
1090
1091         return freelist;
1092 }
1093
1094 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1095                                         struct dma_pte *pte, unsigned long pfn,
1096                                         unsigned long start_pfn,
1097                                         unsigned long last_pfn,
1098                                         struct page *freelist)
1099 {
1100         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1101
1102         pfn = max(start_pfn, pfn);
1103         pte = &pte[pfn_level_offset(pfn, level)];
1104
1105         do {
1106                 unsigned long level_pfn;
1107
1108                 if (!dma_pte_present(pte))
1109                         goto next;
1110
1111                 level_pfn = pfn & level_mask(level);
1112
1113                 /* If range covers entire pagetable, free it */
1114                 if (start_pfn <= level_pfn &&
1115                     last_pfn >= level_pfn + level_size(level) - 1) {
1116                         /* These subordinate page tables are going away entirely. Don't
1117                            bother to clear them; we're just going to *free* them. */
1118                         if (level > 1 && !dma_pte_superpage(pte))
1119                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1120
1121                         dma_clear_pte(pte);
1122                         if (!first_pte)
1123                                 first_pte = pte;
1124                         last_pte = pte;
1125                 } else if (level > 1) {
1126                         /* Recurse down into a level that isn't *entirely* obsolete */
1127                         freelist = dma_pte_clear_level(domain, level - 1,
1128                                                        phys_to_virt(dma_pte_addr(pte)),
1129                                                        level_pfn, start_pfn, last_pfn,
1130                                                        freelist);
1131                 }
1132 next:
1133                 pfn += level_size(level);
1134         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1135
1136         if (first_pte)
1137                 domain_flush_cache(domain, first_pte,
1138                                    (void *)++last_pte - (void *)first_pte);
1139
1140         return freelist;
1141 }
1142
1143 /* We can't just free the pages because the IOMMU may still be walking
1144    the page tables, and may have cached the intermediate levels. The
1145    pages can only be freed after the IOTLB flush has been done. */
1146 static struct page *domain_unmap(struct dmar_domain *domain,
1147                                  unsigned long start_pfn,
1148                                  unsigned long last_pfn)
1149 {
1150         struct page *freelist;
1151
1152         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1153         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1154         BUG_ON(start_pfn > last_pfn);
1155
1156         /* we don't need lock here; nobody else touches the iova range */
1157         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1158                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1159
1160         /* free pgd */
1161         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1162                 struct page *pgd_page = virt_to_page(domain->pgd);
1163                 pgd_page->freelist = freelist;
1164                 freelist = pgd_page;
1165
1166                 domain->pgd = NULL;
1167         }
1168
1169         return freelist;
1170 }
1171
1172 static void dma_free_pagelist(struct page *freelist)
1173 {
1174         struct page *pg;
1175
1176         while ((pg = freelist)) {
1177                 freelist = pg->freelist;
1178                 free_pgtable_page(page_address(pg));
1179         }
1180 }
1181
1182 static void iova_entry_free(unsigned long data)
1183 {
1184         struct page *freelist = (struct page *)data;
1185
1186         dma_free_pagelist(freelist);
1187 }
1188
1189 /* iommu handling */
1190 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1191 {
1192         struct root_entry *root;
1193         unsigned long flags;
1194
1195         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1196         if (!root) {
1197                 pr_err("Allocating root entry for %s failed\n",
1198                         iommu->name);
1199                 return -ENOMEM;
1200         }
1201
1202         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1203
1204         spin_lock_irqsave(&iommu->lock, flags);
1205         iommu->root_entry = root;
1206         spin_unlock_irqrestore(&iommu->lock, flags);
1207
1208         return 0;
1209 }
1210
1211 static void iommu_set_root_entry(struct intel_iommu *iommu)
1212 {
1213         u64 addr;
1214         u32 sts;
1215         unsigned long flag;
1216
1217         addr = virt_to_phys(iommu->root_entry);
1218         if (sm_supported(iommu))
1219                 addr |= DMA_RTADDR_SMT;
1220
1221         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1223
1224         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1225
1226         /* Make sure hardware completes it */
1227         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1228                       readl, (sts & DMA_GSTS_RTPS), sts);
1229
1230         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231 }
1232
1233 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1234 {
1235         u32 val;
1236         unsigned long flag;
1237
1238         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1239                 return;
1240
1241         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1242         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1243
1244         /* Make sure hardware completes it */
1245         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1246                       readl, (!(val & DMA_GSTS_WBFS)), val);
1247
1248         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1249 }
1250
1251 /* Invalidate context-cache entries with the requested granularity */
1252 static void __iommu_flush_context(struct intel_iommu *iommu,
1253                                   u16 did, u16 source_id, u8 function_mask,
1254                                   u64 type)
1255 {
1256         u64 val = 0;
1257         unsigned long flag;
1258
1259         switch (type) {
1260         case DMA_CCMD_GLOBAL_INVL:
1261                 val = DMA_CCMD_GLOBAL_INVL;
1262                 break;
1263         case DMA_CCMD_DOMAIN_INVL:
1264                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1265                 break;
1266         case DMA_CCMD_DEVICE_INVL:
1267                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1268                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1269                 break;
1270         default:
1271                 BUG();
1272         }
1273         val |= DMA_CCMD_ICC;
1274
1275         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1276         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1277
1278         /* Make sure hardware completes it */
1279         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1280                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1281
1282         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1283 }
1284
1285 /* Invalidate IOTLB entries with the requested granularity */
1286 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1287                                 u64 addr, unsigned int size_order, u64 type)
1288 {
1289         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1290         u64 val = 0, val_iva = 0;
1291         unsigned long flag;
1292
1293         switch (type) {
1294         case DMA_TLB_GLOBAL_FLUSH:
1295                 /* global flush doesn't need to set IVA_REG */
1296                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1297                 break;
1298         case DMA_TLB_DSI_FLUSH:
1299                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300                 break;
1301         case DMA_TLB_PSI_FLUSH:
1302                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1303                 /* IH bit is passed in as part of address */
1304                 val_iva = size_order | addr;
1305                 break;
1306         default:
1307                 BUG();
1308         }
1309         /* Note: set drain read/write */
1310 #if 0
1311         /*
1312          * This is probably only needed to be extra safe. It looks like
1313          * we can ignore it without any impact.
1314          */
1315         if (cap_read_drain(iommu->cap))
1316                 val |= DMA_TLB_READ_DRAIN;
1317 #endif
1318         if (cap_write_drain(iommu->cap))
1319                 val |= DMA_TLB_WRITE_DRAIN;
1320
1321         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1322         /* Note: Only uses first TLB reg currently */
1323         if (val_iva)
1324                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1325         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1326
1327         /* Make sure hardware completes it */
1328         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1329                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1330
1331         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1332
1333         /* check IOTLB invalidation granularity */
1334         if (DMA_TLB_IAIG(val) == 0)
1335                 pr_err("Flush IOTLB failed\n");
1336         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1337                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1338                         (unsigned long long)DMA_TLB_IIRG(type),
1339                         (unsigned long long)DMA_TLB_IAIG(val));
1340 }
1341
1342 static struct device_domain_info *
1343 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1344                          u8 bus, u8 devfn)
1345 {
1346         struct device_domain_info *info;
1347
1348         assert_spin_locked(&device_domain_lock);
1349
1350         if (!iommu->qi)
1351                 return NULL;
1352
1353         list_for_each_entry(info, &domain->devices, link)
1354                 if (info->iommu == iommu && info->bus == bus &&
1355                     info->devfn == devfn) {
1356                         if (info->ats_supported && info->dev)
1357                                 return info;
1358                         break;
1359                 }
1360
1361         return NULL;
1362 }
1363
1364 static void domain_update_iotlb(struct dmar_domain *domain)
1365 {
1366         struct device_domain_info *info;
1367         bool has_iotlb_device = false;
1368
1369         assert_spin_locked(&device_domain_lock);
1370
1371         list_for_each_entry(info, &domain->devices, link) {
1372                 struct pci_dev *pdev;
1373
1374                 if (!info->dev || !dev_is_pci(info->dev))
1375                         continue;
1376
1377                 pdev = to_pci_dev(info->dev);
1378                 if (pdev->ats_enabled) {
1379                         has_iotlb_device = true;
1380                         break;
1381                 }
1382         }
1383
1384         domain->has_iotlb_device = has_iotlb_device;
1385 }
1386
1387 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1388 {
1389         struct pci_dev *pdev;
1390
1391         assert_spin_locked(&device_domain_lock);
1392
1393         if (!info || !dev_is_pci(info->dev))
1394                 return;
1395
1396         pdev = to_pci_dev(info->dev);
1397         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1398          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1399          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1400          * reserved, which should be set to 0.
1401          */
1402         if (!ecap_dit(info->iommu->ecap))
1403                 info->pfsid = 0;
1404         else {
1405                 struct pci_dev *pf_pdev;
1406
1407                 /* pdev will be returned if the device is not a VF */
1408                 pf_pdev = pci_physfn(pdev);
1409                 info->pfsid = pci_dev_id(pf_pdev);
1410         }
1411
1412 #ifdef CONFIG_INTEL_IOMMU_SVM
1413         /* The PCIe spec, in its wisdom, declares that the behaviour of
1414            the device is undefined if you enable PASID support after
1415            ATS support. So always enable PASID support on devices
1416            which have it, even if we can't yet know if we're ever
1417            going to use it. */
1418         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1419                 info->pasid_enabled = 1;
1420
1421         if (info->pri_supported &&
1422             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1423             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1424                 info->pri_enabled = 1;
1425 #endif
1426         if (!pdev->untrusted && info->ats_supported &&
1427             pci_ats_page_aligned(pdev) &&
1428             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1429                 info->ats_enabled = 1;
1430                 domain_update_iotlb(info->domain);
1431                 info->ats_qdep = pci_ats_queue_depth(pdev);
1432         }
1433 }
1434
1435 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1436 {
1437         struct pci_dev *pdev;
1438
1439         assert_spin_locked(&device_domain_lock);
1440
1441         if (!dev_is_pci(info->dev))
1442                 return;
1443
1444         pdev = to_pci_dev(info->dev);
1445
1446         if (info->ats_enabled) {
1447                 pci_disable_ats(pdev);
1448                 info->ats_enabled = 0;
1449                 domain_update_iotlb(info->domain);
1450         }
1451 #ifdef CONFIG_INTEL_IOMMU_SVM
1452         if (info->pri_enabled) {
1453                 pci_disable_pri(pdev);
1454                 info->pri_enabled = 0;
1455         }
1456         if (info->pasid_enabled) {
1457                 pci_disable_pasid(pdev);
1458                 info->pasid_enabled = 0;
1459         }
1460 #endif
1461 }
1462
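/*
 * Flush the device IOTLBs (ATS translation caches) of every ATS-enabled
 * device in the domain. The source-id is the device's bus/devfn BDF and
 * the invalidation queue depth is the value cached when ATS was enabled.
 */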
1463 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1464                                   u64 addr, unsigned mask)
1465 {
1466         u16 sid, qdep;
1467         unsigned long flags;
1468         struct device_domain_info *info;
1469
1470         if (!domain->has_iotlb_device)
1471                 return;
1472
1473         spin_lock_irqsave(&device_domain_lock, flags);
1474         list_for_each_entry(info, &domain->devices, link) {
1475                 if (!info->ats_enabled)
1476                         continue;
1477
1478                 sid = info->bus << 8 | info->devfn;
1479                 qdep = info->ats_qdep;
1480                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1481                                 qdep, addr, mask);
1482         }
1483         spin_unlock_irqrestore(&device_domain_lock, flags);
1484 }
1485
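/*
 * Page-selective-within-domain IOTLB invalidation. The hardware takes
 * an address/mask pair, so the page count is rounded up to a power of
 * two and encoded as mask = log2(pages), with the base address expected
 * to be naturally aligned to that size.
 */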
1486 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1487                                   struct dmar_domain *domain,
1488                                   unsigned long pfn, unsigned int pages,
1489                                   int ih, int map)
1490 {
1491         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1492         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1493         u16 did = domain->iommu_did[iommu->seq_id];
1494
1495         BUG_ON(pages == 0);
1496
1497         if (ih)
1498                 ih = 1 << 6;
1499         /*
1500          * Fall back to domain-selective flush if there is no PSI support or
1501          * the size is too big.
1502          * PSI requires the page size to be 2 ^ x, and the base address to be
1503          * naturally aligned to that size.
1504          */
1505         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1506                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1507                                                 DMA_TLB_DSI_FLUSH);
1508         else
1509                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1510                                                 DMA_TLB_PSI_FLUSH);
1511
1512         /*
1513          * In caching mode, changes of pages from non-present to present require
1514          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1515          */
1516         if (!cap_caching_mode(iommu->cap) || !map)
1517                 iommu_flush_dev_iotlb(domain, addr, mask);
1518 }
1519
1520 /* Notification for newly created mappings */
1521 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1522                                         struct dmar_domain *domain,
1523                                         unsigned long pfn, unsigned int pages)
1524 {
1525         /* It's a non-present to present mapping. Only flush if caching mode */
1526         if (cap_caching_mode(iommu->cap))
1527                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1528         else
1529                 iommu_flush_write_buffer(iommu);
1530 }
1531
1532 static void iommu_flush_iova(struct iova_domain *iovad)
1533 {
1534         struct dmar_domain *domain;
1535         int idx;
1536
1537         domain = container_of(iovad, struct dmar_domain, iovad);
1538
1539         for_each_domain_iommu(idx, domain) {
1540                 struct intel_iommu *iommu = g_iommus[idx];
1541                 u16 did = domain->iommu_did[iommu->seq_id];
1542
1543                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1544
1545                 if (!cap_caching_mode(iommu->cap))
1546                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1547                                               0, MAX_AGAW_PFN_WIDTH);
1548         }
1549 }
1550
1551 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1552 {
1553         u32 pmen;
1554         unsigned long flags;
1555
1556         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1557                 return;
1558
1559         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1560         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1561         pmen &= ~DMA_PMEN_EPM;
1562         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1563
1564         /* wait for the protected region status bit to clear */
1565         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1566                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1567
1568         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1569 }
1570
1571 static void iommu_enable_translation(struct intel_iommu *iommu)
1572 {
1573         u32 sts;
1574         unsigned long flags;
1575
1576         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1577         iommu->gcmd |= DMA_GCMD_TE;
1578         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1579
1580         /* Make sure the hardware completes it */
1581         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1582                       readl, (sts & DMA_GSTS_TES), sts);
1583
1584         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1585 }
1586
1587 static void iommu_disable_translation(struct intel_iommu *iommu)
1588 {
1589         u32 sts;
1590         unsigned long flag;
1591
1592         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1593         iommu->gcmd &= ~DMA_GCMD_TE;
1594         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1595
1596         /* Make sure the hardware completes it */
1597         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1598                       readl, (!(sts & DMA_GSTS_TES)), sts);
1599
1600         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1601 }
1602
1603 static int iommu_init_domains(struct intel_iommu *iommu)
1604 {
1605         u32 ndomains, nlongs;
1606         size_t size;
1607
1608         ndomains = cap_ndoms(iommu->cap);
1609         pr_debug("%s: Number of Domains supported <%d>\n",
1610                  iommu->name, ndomains);
1611         nlongs = BITS_TO_LONGS(ndomains);
1612
1613         spin_lock_init(&iommu->lock);
1614
1615         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1616         if (!iommu->domain_ids) {
1617                 pr_err("%s: Allocating domain id array failed\n",
1618                        iommu->name);
1619                 return -ENOMEM;
1620         }
1621
1622         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1623         iommu->domains = kzalloc(size, GFP_KERNEL);
1624
1625         if (iommu->domains) {
1626                 size = 256 * sizeof(struct dmar_domain *);
1627                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1628         }
1629
1630         if (!iommu->domains || !iommu->domains[0]) {
1631                 pr_err("%s: Allocating domain array failed\n",
1632                        iommu->name);
1633                 kfree(iommu->domain_ids);
1634                 kfree(iommu->domains);
1635                 iommu->domain_ids = NULL;
1636                 iommu->domains    = NULL;
1637                 return -ENOMEM;
1638         }
1639
1640         /*
1641          * If Caching mode is set, then invalid translations are tagged
1642          * with domain-id 0, hence we need to pre-allocate it. We also
1643          * use domain-id 0 as a marker for non-allocated domain-id, so
1644          * make sure it is not used for a real domain.
1645          */
1646         set_bit(0, iommu->domain_ids);
1647
1648         /*
1649          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1650          * entry for first-level or pass-through translation modes should
1651          * be programmed with a domain id different from those used for
1652          * second-level or nested translation. We reserve a domain id for
1653          * this purpose.
1654          */
1655         if (sm_supported(iommu))
1656                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1657
1658         return 0;
1659 }
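
/*
 * For illustration of the sizing above: with cap_ndoms() == 65536 the
 * iommu->domains array gets ALIGN(65536, 256) >> 8 == 256 slots of
 * 'struct dmar_domain **', yet only domains[0] (one chunk of 256 domain
 * pointers) is populated here; the other chunks are presumably filled in
 * on demand as domain ids in their ranges are handed out.
 */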
1660
1661 static void disable_dmar_iommu(struct intel_iommu *iommu)
1662 {
1663         struct device_domain_info *info, *tmp;
1664         unsigned long flags;
1665
1666         if (!iommu->domains || !iommu->domain_ids)
1667                 return;
1668
1669         spin_lock_irqsave(&device_domain_lock, flags);
1670         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1671                 if (info->iommu != iommu)
1672                         continue;
1673
1674                 if (!info->dev || !info->domain)
1675                         continue;
1676
1677                 __dmar_remove_one_dev_info(info);
1678         }
1679         spin_unlock_irqrestore(&device_domain_lock, flags);
1680
1681         if (iommu->gcmd & DMA_GCMD_TE)
1682                 iommu_disable_translation(iommu);
1683 }
1684
1685 static void free_dmar_iommu(struct intel_iommu *iommu)
1686 {
1687         if ((iommu->domains) && (iommu->domain_ids)) {
1688                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1689                 int i;
1690
1691                 for (i = 0; i < elems; i++)
1692                         kfree(iommu->domains[i]);
1693                 kfree(iommu->domains);
1694                 kfree(iommu->domain_ids);
1695                 iommu->domains = NULL;
1696                 iommu->domain_ids = NULL;
1697         }
1698
1699         g_iommus[iommu->seq_id] = NULL;
1700
1701         /* free context mapping */
1702         free_context_table(iommu);
1703
1704 #ifdef CONFIG_INTEL_IOMMU_SVM
1705         if (pasid_supported(iommu)) {
1706                 if (ecap_prs(iommu->ecap))
1707                         intel_svm_finish_prq(iommu);
1708         }
1709 #endif
1710 }
1711
1712 static struct dmar_domain *alloc_domain(int flags)
1713 {
1714         struct dmar_domain *domain;
1715
1716         domain = alloc_domain_mem();
1717         if (!domain)
1718                 return NULL;
1719
1720         memset(domain, 0, sizeof(*domain));
1721         domain->nid = NUMA_NO_NODE;
1722         domain->flags = flags;
1723         domain->has_iotlb_device = false;
1724         INIT_LIST_HEAD(&domain->devices);
1725
1726         return domain;
1727 }
1728
1729 /* Must be called with device_domain_lock and iommu->lock held */
1730 static int domain_attach_iommu(struct dmar_domain *domain,
1731                                struct intel_iommu *iommu)
1732 {
1733         unsigned long ndomains;
1734         int num;
1735
1736         assert_spin_locked(&device_domain_lock);
1737         assert_spin_locked(&iommu->lock);
1738
1739         domain->iommu_refcnt[iommu->seq_id] += 1;
1740         domain->iommu_count += 1;
1741         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1742                 ndomains = cap_ndoms(iommu->cap);
1743                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1744
1745                 if (num >= ndomains) {
1746                         pr_err("%s: No free domain ids\n", iommu->name);
1747                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1748                         domain->iommu_count -= 1;
1749                         return -ENOSPC;
1750                 }
1751
1752                 set_bit(num, iommu->domain_ids);
1753                 set_iommu_domain(iommu, num, domain);
1754
1755                 domain->iommu_did[iommu->seq_id] = num;
1756                 domain->nid                      = iommu->node;
1757
1758                 domain_update_iommu_cap(domain);
1759         }
1760
1761         return 0;
1762 }
1763
1764 static int domain_detach_iommu(struct dmar_domain *domain,
1765                                struct intel_iommu *iommu)
1766 {
1767         int num, count;
1768
1769         assert_spin_locked(&device_domain_lock);
1770         assert_spin_locked(&iommu->lock);
1771
1772         domain->iommu_refcnt[iommu->seq_id] -= 1;
1773         count = --domain->iommu_count;
1774         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1775                 num = domain->iommu_did[iommu->seq_id];
1776                 clear_bit(num, iommu->domain_ids);
1777                 set_iommu_domain(iommu, num, NULL);
1778
1779                 domain_update_iommu_cap(domain);
1780                 domain->iommu_did[iommu->seq_id] = 0;
1781         }
1782
1783         return count;
1784 }
1785
1786 static struct iova_domain reserved_iova_list;
1787 static struct lock_class_key reserved_rbtree_key;
1788
1789 static int dmar_init_reserved_ranges(void)
1790 {
1791         struct pci_dev *pdev = NULL;
1792         struct iova *iova;
1793         int i;
1794
1795         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796
1797         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1798                 &reserved_rbtree_key);
1799
1800         /* IOAPIC ranges shouldn't be accessed by DMA */
1801         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1802                 IOVA_PFN(IOAPIC_RANGE_END));
1803         if (!iova) {
1804                 pr_err("Reserve IOAPIC range failed\n");
1805                 return -ENODEV;
1806         }
1807
1808         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1809         for_each_pci_dev(pdev) {
1810                 struct resource *r;
1811
1812                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1813                         r = &pdev->resource[i];
1814                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815                                 continue;
1816                         iova = reserve_iova(&reserved_iova_list,
1817                                             IOVA_PFN(r->start),
1818                                             IOVA_PFN(r->end));
1819                         if (!iova) {
1820                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1821                                 return -ENODEV;
1822                         }
1823                 }
1824         }
1825         return 0;
1826 }
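
/*
 * For illustration, assuming 4KiB pages (PAGE_SHIFT == 12): the IOAPIC
 * reservation above covers IOVA_PFN(0xfee00000) == 0xfee00 through
 * IOVA_PFN(0xfeefffff) == 0xfeeff, i.e. the 1MiB interrupt/IOAPIC window,
 * so no IOVA from that range can ever be handed out for ordinary DMA.
 */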
1827
1828 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829 {
1830         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1831 }
1832
1833 static inline int guestwidth_to_adjustwidth(int gaw)
1834 {
1835         int agaw;
1836         int r = (gaw - 12) % 9;
1837
1838         if (r == 0)
1839                 agaw = gaw;
1840         else
1841                 agaw = gaw + 9 - r;
1842         if (agaw > 64)
1843                 agaw = 64;
1844         return agaw;
1845 }
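
/*
 * A worked example for the rounding above: the adjusted width is the guest
 * width rounded up to the next 12 + 9*n boundary, so gaw == 48 stays 48
 * ((48 - 12) % 9 == 0) while gaw == 40 becomes 48 ((40 - 12) % 9 == 1,
 * hence 40 + 9 - 1), and anything larger than 64 is clamped to 64.
 */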
1846
1847 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1848                        int guest_width)
1849 {
1850         int adjust_width, agaw;
1851         unsigned long sagaw;
1852         int err;
1853
1854         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855
1856         err = init_iova_flush_queue(&domain->iovad,
1857                                     iommu_flush_iova, iova_entry_free);
1858         if (err)
1859                 return err;
1860
1861         domain_reserve_special_ranges(domain);
1862
1863         /* calculate AGAW */
1864         if (guest_width > cap_mgaw(iommu->cap))
1865                 guest_width = cap_mgaw(iommu->cap);
1866         domain->gaw = guest_width;
1867         adjust_width = guestwidth_to_adjustwidth(guest_width);
1868         agaw = width_to_agaw(adjust_width);
1869         sagaw = cap_sagaw(iommu->cap);
1870         if (!test_bit(agaw, &sagaw)) {
1871                 /* hardware doesn't support it, choose a bigger one */
1872                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1873                 agaw = find_next_bit(&sagaw, 5, agaw);
1874                 if (agaw >= 5)
1875                         return -ENODEV;
1876         }
1877         domain->agaw = agaw;
1878
1879         if (ecap_coherent(iommu->ecap))
1880                 domain->iommu_coherency = 1;
1881         else
1882                 domain->iommu_coherency = 0;
1883
1884         if (ecap_sc_support(iommu->ecap))
1885                 domain->iommu_snooping = 1;
1886         else
1887                 domain->iommu_snooping = 0;
1888
1889         if (intel_iommu_superpage)
1890                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891         else
1892                 domain->iommu_superpage = 0;
1893
1894         domain->nid = iommu->node;
1895
1896         /* always allocate the top pgd */
1897         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1898         if (!domain->pgd)
1899                 return -ENOMEM;
1900         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1901         return 0;
1902 }
1903
1904 static void domain_exit(struct dmar_domain *domain)
1905 {
1906
1907         /* Remove associated devices and clear attached or cached domains */
1908         domain_remove_dev_info(domain);
1909
1910         /* destroy iovas */
1911         put_iova_domain(&domain->iovad);
1912
1913         if (domain->pgd) {
1914                 struct page *freelist;
1915
1916                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1917                 dma_free_pagelist(freelist);
1918         }
1919
1920         free_domain_mem(domain);
1921 }
1922
1923 /*
1924  * Get the PASID directory size for a scalable mode context entry.
1925  * A value of X in the PDTS field of a scalable mode context entry
1926  * indicates a PASID directory with 2^(X + 7) entries.
1927  */
1928 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1929 {
1930         int pds, max_pde;
1931
1932         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1933         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1934         if (pds < 7)
1935                 return 0;
1936
1937         return pds - 7;
1938 }
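
/*
 * A worked example: if max_pde works out to 0x4000 (a directory of 2^14
 * entries), find_first_bit() reports bit 14 and the function returns
 * 14 - 7 == 7; programming 7 into the PDTS field then means
 * 2^(7 + 7) == 2^14 directory entries, matching the comment above.
 */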
1939
1940 /*
1941  * Set the RID_PASID field of a scalable mode context entry. The
1942  * IOMMU hardware will use the PASID value set in this field for
1943  * translations of DMA requests without a PASID.
1944  */
1945 static inline void
1946 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1947 {
1948         context->hi |= pasid & ((1 << 20) - 1);
1949         context->hi |= (1 << 20);
1950 }
1951
1952 /*
1953  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1954  * entry.
1955  */
1956 static inline void context_set_sm_dte(struct context_entry *context)
1957 {
1958         context->lo |= (1 << 2);
1959 }
1960
1961 /*
1962  * Set the PRE (Page Request Enable) field of a scalable mode context
1963  * entry.
1964  */
1965 static inline void context_set_sm_pre(struct context_entry *context)
1966 {
1967         context->lo |= (1 << 4);
1968 }
1969
1970 /* Convert value to context PASID directory size field coding. */
1971 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1972
1973 static int domain_context_mapping_one(struct dmar_domain *domain,
1974                                       struct intel_iommu *iommu,
1975                                       struct pasid_table *table,
1976                                       u8 bus, u8 devfn)
1977 {
1978         u16 did = domain->iommu_did[iommu->seq_id];
1979         int translation = CONTEXT_TT_MULTI_LEVEL;
1980         struct device_domain_info *info = NULL;
1981         struct context_entry *context;
1982         unsigned long flags;
1983         int ret;
1984
1985         WARN_ON(did == 0);
1986
1987         if (hw_pass_through && domain_type_is_si(domain))
1988                 translation = CONTEXT_TT_PASS_THROUGH;
1989
1990         pr_debug("Set context mapping for %02x:%02x.%d\n",
1991                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1992
1993         BUG_ON(!domain->pgd);
1994
1995         spin_lock_irqsave(&device_domain_lock, flags);
1996         spin_lock(&iommu->lock);
1997
1998         ret = -ENOMEM;
1999         context = iommu_context_addr(iommu, bus, devfn, 1);
2000         if (!context)
2001                 goto out_unlock;
2002
2003         ret = 0;
2004         if (context_present(context))
2005                 goto out_unlock;
2006
2007         /*
2008          * For kdump cases, old valid entries may be cached due to the
2009          * in-flight DMA and the copied pgtable, but there is no unmapping
2010          * behaviour for them, thus we need an explicit cache flush for
2011          * the newly-mapped device. For kdump, at this point, the device
2012          * is supposed to have finished reset at its driver probe stage, so
2013          * no in-flight DMA will exist, and we don't need to worry about it
2014          * hereafter.
2015          */
2016         if (context_copied(context)) {
2017                 u16 did_old = context_domain_id(context);
2018
2019                 if (did_old < cap_ndoms(iommu->cap)) {
2020                         iommu->flush.flush_context(iommu, did_old,
2021                                                    (((u16)bus) << 8) | devfn,
2022                                                    DMA_CCMD_MASK_NOBIT,
2023                                                    DMA_CCMD_DEVICE_INVL);
2024                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2025                                                  DMA_TLB_DSI_FLUSH);
2026                 }
2027         }
2028
2029         context_clear_entry(context);
2030
2031         if (sm_supported(iommu)) {
2032                 unsigned long pds;
2033
2034                 WARN_ON(!table);
2035
2036                 /* Setup the PASID DIR pointer: */
2037                 pds = context_get_sm_pds(table);
2038                 context->lo = (u64)virt_to_phys(table->table) |
2039                                 context_pdts(pds);
2040
2041                 /* Setup the RID_PASID field: */
2042                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2043
2044                 /*
2045                  * Setup the Device-TLB enable bit and Page request
2046                  * Enable bit:
2047                  */
2048                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2049                 if (info && info->ats_supported)
2050                         context_set_sm_dte(context);
2051                 if (info && info->pri_supported)
2052                         context_set_sm_pre(context);
2053         } else {
2054                 struct dma_pte *pgd = domain->pgd;
2055                 int agaw;
2056
2057                 context_set_domain_id(context, did);
2058
2059                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2060                         /*
2061                          * Skip top levels of page tables for an iommu with a
2062                          * smaller agaw than the default. Unnecessary for PT mode.
2063                          */
2064                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2065                                 ret = -ENOMEM;
2066                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2067                                 if (!dma_pte_present(pgd))
2068                                         goto out_unlock;
2069                         }
2070
2071                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2072                         if (info && info->ats_supported)
2073                                 translation = CONTEXT_TT_DEV_IOTLB;
2074                         else
2075                                 translation = CONTEXT_TT_MULTI_LEVEL;
2076
2077                         context_set_address_root(context, virt_to_phys(pgd));
2078                         context_set_address_width(context, agaw);
2079                 } else {
2080                         /*
2081                          * In pass through mode, AW must be programmed to
2082                          * indicate the largest AGAW value supported by
2083                          * hardware. And ASR is ignored by hardware.
2084                          */
2085                         context_set_address_width(context, iommu->msagaw);
2086                 }
2087
2088                 context_set_translation_type(context, translation);
2089         }
2090
2091         context_set_fault_enable(context);
2092         context_set_present(context);
2093         domain_flush_cache(domain, context, sizeof(*context));
2094
2095         /*
2096          * It's a non-present to present mapping. If the hardware doesn't cache
2097          * non-present entries we only need to flush the write-buffer. If it
2098          * _does_ cache non-present entries, then it does so in the special
2099          * domain #0, which we have to flush:
2100          */
2101         if (cap_caching_mode(iommu->cap)) {
2102                 iommu->flush.flush_context(iommu, 0,
2103                                            (((u16)bus) << 8) | devfn,
2104                                            DMA_CCMD_MASK_NOBIT,
2105                                            DMA_CCMD_DEVICE_INVL);
2106                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2107         } else {
2108                 iommu_flush_write_buffer(iommu);
2109         }
2110         iommu_enable_dev_iotlb(info);
2111
2112         ret = 0;
2113
2114 out_unlock:
2115         spin_unlock(&iommu->lock);
2116         spin_unlock_irqrestore(&device_domain_lock, flags);
2117
2118         return ret;
2119 }
2120
2121 struct domain_context_mapping_data {
2122         struct dmar_domain *domain;
2123         struct intel_iommu *iommu;
2124         struct pasid_table *table;
2125 };
2126
2127 static int domain_context_mapping_cb(struct pci_dev *pdev,
2128                                      u16 alias, void *opaque)
2129 {
2130         struct domain_context_mapping_data *data = opaque;
2131
2132         return domain_context_mapping_one(data->domain, data->iommu,
2133                                           data->table, PCI_BUS_NUM(alias),
2134                                           alias & 0xff);
2135 }
2136
2137 static int
2138 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2139 {
2140         struct domain_context_mapping_data data;
2141         struct pasid_table *table;
2142         struct intel_iommu *iommu;
2143         u8 bus, devfn;
2144
2145         iommu = device_to_iommu(dev, &bus, &devfn);
2146         if (!iommu)
2147                 return -ENODEV;
2148
2149         table = intel_pasid_get_table(dev);
2150
2151         if (!dev_is_pci(dev))
2152                 return domain_context_mapping_one(domain, iommu, table,
2153                                                   bus, devfn);
2154
2155         data.domain = domain;
2156         data.iommu = iommu;
2157         data.table = table;
2158
2159         return pci_for_each_dma_alias(to_pci_dev(dev),
2160                                       &domain_context_mapping_cb, &data);
2161 }
2162
2163 static int domain_context_mapped_cb(struct pci_dev *pdev,
2164                                     u16 alias, void *opaque)
2165 {
2166         struct intel_iommu *iommu = opaque;
2167
2168         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2169 }
2170
2171 static int domain_context_mapped(struct device *dev)
2172 {
2173         struct intel_iommu *iommu;
2174         u8 bus, devfn;
2175
2176         iommu = device_to_iommu(dev, &bus, &devfn);
2177         if (!iommu)
2178                 return -ENODEV;
2179
2180         if (!dev_is_pci(dev))
2181                 return device_context_mapped(iommu, bus, devfn);
2182
2183         return !pci_for_each_dma_alias(to_pci_dev(dev),
2184                                        domain_context_mapped_cb, iommu);
2185 }
2186
2187 /* Returns the number of VT-d pages, but aligned to the MM page size */
2188 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2189                                             size_t size)
2190 {
2191         host_addr &= ~PAGE_MASK;
2192         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2193 }
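
/*
 * For illustration, with 4KiB pages on both the MM and VT-d side:
 * aligned_nrpages(0x1234, 0x2000) keeps the in-page offset 0x234, rounds
 * 0x234 + 0x2000 up to 0x3000 and returns 3, i.e. a buffer that straddles
 * page boundaries costs one extra VT-d page of mapping.
 */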
2194
2195 /* Return the largest possible superpage level for a given mapping */
2196 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2197                                           unsigned long iov_pfn,
2198                                           unsigned long phy_pfn,
2199                                           unsigned long pages)
2200 {
2201         int support, level = 1;
2202         unsigned long pfnmerge;
2203
2204         support = domain->iommu_superpage;
2205
2206         /* To use a large page, the virtual *and* physical addresses
2207            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2208            of them will mean we have to use smaller pages. So just
2209            merge them and check both at once. */
2210         pfnmerge = iov_pfn | phy_pfn;
2211
2212         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2213                 pages >>= VTD_STRIDE_SHIFT;
2214                 if (!pages)
2215                         break;
2216                 pfnmerge >>= VTD_STRIDE_SHIFT;
2217                 level++;
2218                 support--;
2219         }
2220         return level;
2221 }
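
/*
 * For illustration, with the usual 9-bit stride: if both iov_pfn and
 * phy_pfn have their low 9 bits clear (2MiB alignment) and pages >= 512,
 * one pass through the loop returns level 2 on hardware that supports a
 * single superpage level, so one 2MiB PTE can replace 512 4KiB PTEs.
 */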
2222
2223 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2224                             struct scatterlist *sg, unsigned long phys_pfn,
2225                             unsigned long nr_pages, int prot)
2226 {
2227         struct dma_pte *first_pte = NULL, *pte = NULL;
2228         phys_addr_t uninitialized_var(pteval);
2229         unsigned long sg_res = 0;
2230         unsigned int largepage_lvl = 0;
2231         unsigned long lvl_pages = 0;
2232
2233         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2234
2235         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2236                 return -EINVAL;
2237
2238         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2239
2240         if (!sg) {
2241                 sg_res = nr_pages;
2242                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2243         }
2244
2245         while (nr_pages > 0) {
2246                 uint64_t tmp;
2247
2248                 if (!sg_res) {
2249                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2250
2251                         sg_res = aligned_nrpages(sg->offset, sg->length);
2252                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2253                         sg->dma_length = sg->length;
2254                         pteval = (sg_phys(sg) - pgoff) | prot;
2255                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2256                 }
2257
2258                 if (!pte) {
2259                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2260
2261                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2262                         if (!pte)
2263                                 return -ENOMEM;
2264                         /* It is a large page */
2265                         if (largepage_lvl > 1) {
2266                                 unsigned long nr_superpages, end_pfn;
2267
2268                                 pteval |= DMA_PTE_LARGE_PAGE;
2269                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2270
2271                                 nr_superpages = sg_res / lvl_pages;
2272                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2273
2274                                 /*
2275                                  * Ensure that old small page tables are
2276                                  * removed to make room for superpage(s).
2277                                  * We're adding new large pages, so make sure
2278                                  * we don't remove their parent tables.
2279                                  */
2280                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2281                                                        largepage_lvl + 1);
2282                         } else {
2283                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2284                         }
2285
2286                 }
2287                 /* We don't need a lock here; nobody else
2288                  * touches the iova range.
2289                  */
2290                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2291                 if (tmp) {
2292                         static int dumps = 5;
2293                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2294                                 iov_pfn, tmp, (unsigned long long)pteval);
2295                         if (dumps) {
2296                                 dumps--;
2297                                 debug_dma_dump_mappings(NULL);
2298                         }
2299                         WARN_ON(1);
2300                 }
2301
2302                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2303
2304                 BUG_ON(nr_pages < lvl_pages);
2305                 BUG_ON(sg_res < lvl_pages);
2306
2307                 nr_pages -= lvl_pages;
2308                 iov_pfn += lvl_pages;
2309                 phys_pfn += lvl_pages;
2310                 pteval += lvl_pages * VTD_PAGE_SIZE;
2311                 sg_res -= lvl_pages;
2312
2313                 /* If the next PTE would be the first in a new page, then we
2314                    need to flush the cache on the entries we've just written.
2315                    And then we'll need to recalculate 'pte', so clear it and
2316                    let it get set again in the if (!pte) block above.
2317
2318                    If we're done (!nr_pages) we need to flush the cache too.
2319
2320                    Also if we've been setting superpages, we may need to
2321                    recalculate 'pte' and switch back to smaller pages for the
2322                    end of the mapping, if the trailing size is not enough to
2323                    use another superpage (i.e. sg_res < lvl_pages). */
2324                 pte++;
2325                 if (!nr_pages || first_pte_in_page(pte) ||
2326                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2327                         domain_flush_cache(domain, first_pte,
2328                                            (void *)pte - (void *)first_pte);
2329                         pte = NULL;
2330                 }
2331
2332                 if (!sg_res && nr_pages)
2333                         sg = sg_next(sg);
2334         }
2335         return 0;
2336 }
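
/*
 * A worked example of the superpage path above: mapping sg_res == 1024
 * contiguous pages at a 2MiB-aligned iov_pfn with largepage_lvl == 2 gives
 * lvl_pages == 512, nr_superpages == 2 and end_pfn == iov_pfn + 1023, so
 * any stale 4KiB page tables under those two 2MiB slots are freed before
 * the two large PTEs are written.
 */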
2337
2338 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2339                           struct scatterlist *sg, unsigned long phys_pfn,
2340                           unsigned long nr_pages, int prot)
2341 {
2342         int iommu_id, ret;
2343         struct intel_iommu *iommu;
2344
2345         /* Do the real mapping first */
2346         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2347         if (ret)
2348                 return ret;
2349
2350         for_each_domain_iommu(iommu_id, domain) {
2351                 iommu = g_iommus[iommu_id];
2352                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2353         }
2354
2355         return 0;
2356 }
2357
2358 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2359                                     struct scatterlist *sg, unsigned long nr_pages,
2360                                     int prot)
2361 {
2362         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2363 }
2364
2365 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2366                                      unsigned long phys_pfn, unsigned long nr_pages,
2367                                      int prot)
2368 {
2369         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2370 }
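
/*
 * A hypothetical caller (sketch only, names not from this file) would use
 * these helpers roughly as:
 *
 *	unsigned long nrpages = aligned_nrpages(paddr, size);
 *	int ret = domain_pfn_mapping(domain, iov_pfn,
 *				     paddr >> VTD_PAGE_SHIFT, nrpages,
 *				     DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * i.e. both sides are passed as VT-d page frame numbers, never as raw
 * bus or physical addresses.
 */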
2371
2372 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2373 {
2374         unsigned long flags;
2375         struct context_entry *context;
2376         u16 did_old;
2377
2378         if (!iommu)
2379                 return;
2380
2381         spin_lock_irqsave(&iommu->lock, flags);
2382         context = iommu_context_addr(iommu, bus, devfn, 0);
2383         if (!context) {
2384                 spin_unlock_irqrestore(&iommu->lock, flags);
2385                 return;
2386         }
2387         did_old = context_domain_id(context);
2388         context_clear_entry(context);
2389         __iommu_flush_cache(iommu, context, sizeof(*context));
2390         spin_unlock_irqrestore(&iommu->lock, flags);
2391         iommu->flush.flush_context(iommu,
2392                                    did_old,
2393                                    (((u16)bus) << 8) | devfn,
2394                                    DMA_CCMD_MASK_NOBIT,
2395                                    DMA_CCMD_DEVICE_INVL);
2396         iommu->flush.flush_iotlb(iommu,
2397                                  did_old,
2398                                  0,
2399                                  0,
2400                                  DMA_TLB_DSI_FLUSH);
2401 }
2402
2403 static inline void unlink_domain_info(struct device_domain_info *info)
2404 {
2405         assert_spin_locked(&device_domain_lock);
2406         list_del(&info->link);
2407         list_del(&info->global);
2408         if (info->dev)
2409                 info->dev->archdata.iommu = NULL;
2410 }
2411
2412 static void domain_remove_dev_info(struct dmar_domain *domain)
2413 {
2414         struct device_domain_info *info, *tmp;
2415         unsigned long flags;
2416
2417         spin_lock_irqsave(&device_domain_lock, flags);
2418         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2419                 __dmar_remove_one_dev_info(info);
2420         spin_unlock_irqrestore(&device_domain_lock, flags);
2421 }
2422
2423 /*
2424  * find_domain
2425  * Note: we use struct device->archdata.iommu to store the info
2426  */
2427 static struct dmar_domain *find_domain(struct device *dev)
2428 {
2429         struct device_domain_info *info;
2430
2431         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2432                 struct iommu_domain *domain;
2433
2434                 dev->archdata.iommu = NULL;
2435                 domain = iommu_get_domain_for_dev(dev);
2436                 if (domain)
2437                         intel_iommu_attach_device(domain, dev);
2438         }
2439
2440         /* No lock here; we assume no domain exit happens in the normal case */
2441         info = dev->archdata.iommu;
2442
2443         if (likely(info))
2444                 return info->domain;
2445         return NULL;
2446 }
2447
2448 static inline struct device_domain_info *
2449 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2450 {
2451         struct device_domain_info *info;
2452
2453         list_for_each_entry(info, &device_domain_list, global)
2454                 if (info->iommu->segment == segment && info->bus == bus &&
2455                     info->devfn == devfn)
2456                         return info;
2457
2458         return NULL;
2459 }
2460
2461 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2462                                                     int bus, int devfn,
2463                                                     struct device *dev,
2464                                                     struct dmar_domain *domain)
2465 {
2466         struct dmar_domain *found = NULL;
2467         struct device_domain_info *info;
2468         unsigned long flags;
2469         int ret;
2470
2471         info = alloc_devinfo_mem();
2472         if (!info)
2473                 return NULL;
2474
2475         info->bus = bus;
2476         info->devfn = devfn;
2477         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2478         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2479         info->ats_qdep = 0;
2480         info->dev = dev;
2481         info->domain = domain;
2482         info->iommu = iommu;
2483         info->pasid_table = NULL;
2484         info->auxd_enabled = 0;
2485         INIT_LIST_HEAD(&info->auxiliary_domains);
2486
2487         if (dev && dev_is_pci(dev)) {
2488                 struct pci_dev *pdev = to_pci_dev(info->dev);
2489
2490                 if (!pdev->untrusted &&
2491                     !pci_ats_disabled() &&
2492                     ecap_dev_iotlb_support(iommu->ecap) &&
2493                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2494                     dmar_find_matched_atsr_unit(pdev))
2495                         info->ats_supported = 1;
2496
2497                 if (sm_supported(iommu)) {
2498                         if (pasid_supported(iommu)) {
2499                                 int features = pci_pasid_features(pdev);
2500                                 if (features >= 0)
2501                                         info->pasid_supported = features | 1;
2502                         }
2503
2504                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2505                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2506                                 info->pri_supported = 1;
2507                 }
2508         }
2509
2510         spin_lock_irqsave(&device_domain_lock, flags);
2511         if (dev)
2512                 found = find_domain(dev);
2513
2514         if (!found) {
2515                 struct device_domain_info *info2;
2516                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2517                 if (info2) {
2518                         found      = info2->domain;
2519                         info2->dev = dev;
2520                 }
2521         }
2522
2523         if (found) {
2524                 spin_unlock_irqrestore(&device_domain_lock, flags);
2525                 free_devinfo_mem(info);
2526                 /* Caller must free the original domain */
2527                 return found;
2528         }
2529
2530         spin_lock(&iommu->lock);
2531         ret = domain_attach_iommu(domain, iommu);
2532         spin_unlock(&iommu->lock);
2533
2534         if (ret) {
2535                 spin_unlock_irqrestore(&device_domain_lock, flags);
2536                 free_devinfo_mem(info);
2537                 return NULL;
2538         }
2539
2540         list_add(&info->link, &domain->devices);
2541         list_add(&info->global, &device_domain_list);
2542         if (dev)
2543                 dev->archdata.iommu = info;
2544         spin_unlock_irqrestore(&device_domain_lock, flags);
2545
2546         /* PASID table is mandatory for a PCI device in scalable mode. */
2547         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2548                 ret = intel_pasid_alloc_table(dev);
2549                 if (ret) {
2550                         dev_err(dev, "PASID table allocation failed\n");
2551                         dmar_remove_one_dev_info(dev);
2552                         return NULL;
2553                 }
2554
2555                 /* Setup the PASID entry for requests without PASID: */
2556                 spin_lock(&iommu->lock);
2557                 if (hw_pass_through && domain_type_is_si(domain))
2558                         ret = intel_pasid_setup_pass_through(iommu, domain,
2559                                         dev, PASID_RID2PASID);
2560                 else
2561                         ret = intel_pasid_setup_second_level(iommu, domain,
2562                                         dev, PASID_RID2PASID);
2563                 spin_unlock(&iommu->lock);
2564                 if (ret) {
2565                         dev_err(dev, "Setup RID2PASID failed\n");
2566                         dmar_remove_one_dev_info(dev);
2567                         return NULL;
2568                 }
2569         }
2570
2571         if (dev && domain_context_mapping(domain, dev)) {
2572                 dev_err(dev, "Domain context map failed\n");
2573                 dmar_remove_one_dev_info(dev);
2574                 return NULL;
2575         }
2576
2577         return domain;
2578 }
2579
2580 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2581 {
2582         *(u16 *)opaque = alias;
2583         return 0;
2584 }
2585
2586 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2587 {
2588         struct device_domain_info *info;
2589         struct dmar_domain *domain = NULL;
2590         struct intel_iommu *iommu;
2591         u16 dma_alias;
2592         unsigned long flags;
2593         u8 bus, devfn;
2594
2595         iommu = device_to_iommu(dev, &bus, &devfn);
2596         if (!iommu)
2597                 return NULL;
2598
2599         if (dev_is_pci(dev)) {
2600                 struct pci_dev *pdev = to_pci_dev(dev);
2601
2602                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2603
2604                 spin_lock_irqsave(&device_domain_lock, flags);
2605                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2606                                                       PCI_BUS_NUM(dma_alias),
2607                                                       dma_alias & 0xff);
2608                 if (info) {
2609                         iommu = info->iommu;
2610                         domain = info->domain;
2611                 }
2612                 spin_unlock_irqrestore(&device_domain_lock, flags);
2613
2614                 /* DMA alias already has a domain, use it */
2615                 if (info)
2616                         goto out;
2617         }
2618
2619         /* Allocate and initialize new domain for the device */
2620         domain = alloc_domain(0);
2621         if (!domain)
2622                 return NULL;
2623         if (domain_init(domain, iommu, gaw)) {
2624                 domain_exit(domain);
2625                 return NULL;
2626         }
2627
2628 out:
2629         return domain;
2630 }
2631
2632 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2633                                               struct dmar_domain *domain)
2634 {
2635         struct intel_iommu *iommu;
2636         struct dmar_domain *tmp;
2637         u16 req_id, dma_alias;
2638         u8 bus, devfn;
2639
2640         iommu = device_to_iommu(dev, &bus, &devfn);
2641         if (!iommu)
2642                 return NULL;
2643
2644         req_id = ((u16)bus << 8) | devfn;
2645
2646         if (dev_is_pci(dev)) {
2647                 struct pci_dev *pdev = to_pci_dev(dev);
2648
2649                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650
2651                 /* register PCI DMA alias device */
2652                 if (req_id != dma_alias) {
2653                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2654                                         dma_alias & 0xff, NULL, domain);
2655
2656                         if (!tmp || tmp != domain)
2657                                 return tmp;
2658                 }
2659         }
2660
2661         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2662         if (!tmp || tmp != domain)
2663                 return tmp;
2664
2665         return domain;
2666 }
2667
2668 static int iommu_domain_identity_map(struct dmar_domain *domain,
2669                                      unsigned long long start,
2670                                      unsigned long long end)
2671 {
2672         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2673         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2674
2675         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2676                           dma_to_mm_pfn(last_vpfn))) {
2677                 pr_err("Reserving iova failed\n");
2678                 return -ENOMEM;
2679         }
2680
2681         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2682         /*
2683          * The RMRR range might overlap with the physical memory range,
2684          * so clear it first.
2685          */
2686         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2687
2688         return __domain_mapping(domain, first_vpfn, NULL,
2689                                 first_vpfn, last_vpfn - first_vpfn + 1,
2690                                 DMA_PTE_READ|DMA_PTE_WRITE);
2691 }
2692
2693 static int domain_prepare_identity_map(struct device *dev,
2694                                        struct dmar_domain *domain,
2695                                        unsigned long long start,
2696                                        unsigned long long end)
2697 {
2698         /* For _hardware_ passthrough, don't bother. But for software
2699            passthrough, we do it anyway -- it may indicate a memory
2700            range which is reserved in E820 and therefore didn't get set
2701            up in si_domain to start with. */
2702         if (domain == si_domain && hw_pass_through) {
2703                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2704                          start, end);
2705                 return 0;
2706         }
2707
2708         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2709
2710         if (end < start) {
2711                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2712                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2713                         dmi_get_system_info(DMI_BIOS_VENDOR),
2714                         dmi_get_system_info(DMI_BIOS_VERSION),
2715                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2716                 return -EIO;
2717         }
2718
2719         if (end >> agaw_to_width(domain->agaw)) {
2720                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2721                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2722                      agaw_to_width(domain->agaw),
2723                      dmi_get_system_info(DMI_BIOS_VENDOR),
2724                      dmi_get_system_info(DMI_BIOS_VERSION),
2725                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2726                 return -EIO;
2727         }
2728
2729         return iommu_domain_identity_map(domain, start, end);
2730 }
2731
2732 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2733
2734 static int __init si_domain_init(int hw)
2735 {
2736         struct dmar_rmrr_unit *rmrr;
2737         struct device *dev;
2738         int i, nid, ret;
2739
2740         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2741         if (!si_domain)
2742                 return -EFAULT;
2743
2744         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2745                 domain_exit(si_domain);
2746                 return -EFAULT;
2747         }
2748
2749         if (hw)
2750                 return 0;
2751
2752         for_each_online_node(nid) {
2753                 unsigned long start_pfn, end_pfn;
2754                 int i;
2755
2756                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2757                         ret = iommu_domain_identity_map(si_domain,
2758                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2759                         if (ret)
2760                                 return ret;
2761                 }
2762         }
2763
2764         /*
2765          * Normally we use DMA domains for devices which have RMRRs. But we
2766          * relax this requirement for graphics and USB devices. Identity map
2767          * the RMRRs for graphics and USB devices so that they can use the
2768          * si_domain.
2769          */
2770         for_each_rmrr_units(rmrr) {
2771                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2772                                           i, dev) {
2773                         unsigned long long start = rmrr->base_address;
2774                         unsigned long long end = rmrr->end_address;
2775
2776                         if (device_is_rmrr_locked(dev))
2777                                 continue;
2778
2779                         if (WARN_ON(end < start ||
2780                                     end >> agaw_to_width(si_domain->agaw)))
2781                                 continue;
2782
2783                         ret = iommu_domain_identity_map(si_domain, start, end);
2784                         if (ret)
2785                                 return ret;
2786                 }
2787         }
2788
2789         return 0;
2790 }
2791
2792 static int identity_mapping(struct device *dev)
2793 {
2794         struct device_domain_info *info;
2795
2796         info = dev->archdata.iommu;
2797         if (info && info != DUMMY_DEVICE_DOMAIN_INFO && info != DEFER_DEVICE_DOMAIN_INFO)
2798                 return (info->domain == si_domain);
2799
2800         return 0;
2801 }
2802
2803 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2804 {
2805         struct dmar_domain *ndomain;
2806         struct intel_iommu *iommu;
2807         u8 bus, devfn;
2808
2809         iommu = device_to_iommu(dev, &bus, &devfn);
2810         if (!iommu)
2811                 return -ENODEV;
2812
2813         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2814         if (ndomain != domain)
2815                 return -EBUSY;
2816
2817         return 0;
2818 }
2819
2820 static bool device_has_rmrr(struct device *dev)
2821 {
2822         struct dmar_rmrr_unit *rmrr;
2823         struct device *tmp;
2824         int i;
2825
2826         rcu_read_lock();
2827         for_each_rmrr_units(rmrr) {
2828                 /*
2829                  * Return TRUE if this RMRR contains the device that
2830                  * is passed in.
2831                  */
2832                 for_each_active_dev_scope(rmrr->devices,
2833                                           rmrr->devices_cnt, i, tmp)
2834                         if (tmp == dev ||
2835                             is_downstream_to_pci_bridge(dev, tmp)) {
2836                                 rcu_read_unlock();
2837                                 return true;
2838                         }
2839         }
2840         rcu_read_unlock();
2841         return false;
2842 }
2843
2844 /**
2845  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2846  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2847  * @dev: device handle
2848  *
2849  * We assume that PCI USB devices with RMRRs have them largely
2850  * for historical reasons and that the RMRR space is not actively used post
2851  * boot.  This exclusion may change if vendors begin to abuse it.
2852  *
2853  * The same exception is made for graphics devices, with the requirement that
2854  * any use of the RMRR regions will be torn down before assigning the device
2855  * to a guest.
2856  *
2857  * Return: true if the RMRR is relaxable, false otherwise
2858  */
2859 static bool device_rmrr_is_relaxable(struct device *dev)
2860 {
2861         struct pci_dev *pdev;
2862
2863         if (!dev_is_pci(dev))
2864                 return false;
2865
2866         pdev = to_pci_dev(dev);
2867         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2868                 return true;
2869         else
2870                 return false;
2871 }
2872
2873 /*
2874  * There are a couple of cases where we need to restrict the functionality of
2875  * devices associated with RMRRs.  The first is when evaluating a device for
2876  * identity mapping because problems exist when devices are moved in and out
2877  * of domains and their respective RMRR information is lost.  This means that
2878  * a device with associated RMRRs will never be in a "passthrough" domain.
2879  * The second is use of the device through the IOMMU API.  This interface
2880  * expects to have full control of the IOVA space for the device.  We cannot
2881  * satisfy both the requirement that RMRR access is maintained and have an
2882  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2883  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2884  * We therefore prevent devices associated with an RMRR from participating in
2885  * the IOMMU API, which eliminates them from device assignment.
2886  *
2887  * In both cases, devices which have relaxable RMRRs are not concerned by this
2888  * restriction. See device_rmrr_is_relaxable comment.
2889  */
2890 static bool device_is_rmrr_locked(struct device *dev)
2891 {
2892         if (!device_has_rmrr(dev))
2893                 return false;
2894
2895         if (device_rmrr_is_relaxable(dev))
2896                 return false;
2897
2898         return true;
2899 }
2900
2901 /*
2902  * Return the required default domain type for a specific device.
2903  *
2904  * @dev: the device in question
2906  *
2907  * Returns:
2908  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2909  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2910  *  - 0: both identity and dynamic domains work for this device
2911  */
2912 static int device_def_domain_type(struct device *dev)
2913 {
2914         if (dev_is_pci(dev)) {
2915                 struct pci_dev *pdev = to_pci_dev(dev);
2916
2917                 if (device_is_rmrr_locked(dev))
2918                         return IOMMU_DOMAIN_DMA;
2919
2920                 /*
2921                  * Prevent any device marked as untrusted from getting
2922                  * placed into the static identity mapping domain.
2923                  */
2924                 if (pdev->untrusted)
2925                         return IOMMU_DOMAIN_DMA;
2926
2927                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2928                         return IOMMU_DOMAIN_IDENTITY;
2929
2930                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2931                         return IOMMU_DOMAIN_IDENTITY;
2932
2933                 /*
2934                  * We want to start off with all devices in the 1:1 domain, and
2935                  * take them out later if we find they can't access all of memory.
2936                  *
2937                  * However, we can't do this for PCI devices behind bridges,
2938                  * because all PCI devices behind the same bridge will end up
2939                  * with the same source-id on their transactions.
2940                  *
2941                  * Practically speaking, we can't change things around for these
2942                  * devices at run-time, because we can't be sure there'll be no
2943                  * DMA transactions in flight for any of their siblings.
2944                  *
2945                  * So PCI devices (unless they're on the root bus) as well as
2946                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2947                  * the 1:1 domain, just in _case_ one of their siblings turns out
2948                  * not to be able to map all of memory.
2949                  */
2950                 if (!pci_is_pcie(pdev)) {
2951                         if (!pci_is_root_bus(pdev->bus))
2952                                 return IOMMU_DOMAIN_DMA;
2953                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2954                                 return IOMMU_DOMAIN_DMA;
2955                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2956                         return IOMMU_DOMAIN_DMA;
2957         } else {
2958                 if (device_has_rmrr(dev))
2959                         return IOMMU_DOMAIN_DMA;
2960         }
2961
2962         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2963                         IOMMU_DOMAIN_IDENTITY : 0;
2964 }
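
/*
 * For illustration of the bridge rules above: a conventional PCI device
 * sitting behind a PCIe-to-PCI bridge is not on the root bus, so it (and
 * the bridge itself, by the PCI_EXP_TYPE_PCI_BRIDGE check) is forced into
 * IOMMU_DOMAIN_DMA, because every device behind that bridge shares one
 * source-id and cannot be switched individually at run time.
 */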
2965
2966 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2967 {
2968         /*
2969          * Start from a sane iommu hardware state.
2970          * If queued invalidation was already initialized by us
2971          * (for example, while enabling interrupt-remapping) then
2972          * things are already rolling from a sane state.
2973          */
2974         if (!iommu->qi) {
2975                 /*
2976                  * Clear any previous faults.
2977                  */
2978                 dmar_fault(-1, iommu);
2979                 /*
2980                  * Disable queued invalidation if supported and already enabled
2981                  * before OS handover.
2982                  */
2983                 dmar_disable_qi(iommu);
2984         }
2985
2986         if (dmar_enable_qi(iommu)) {
2987                 /*
2988                  * Queued invalidation is not enabled; use register-based invalidation.
2989                  */
2990                 iommu->flush.flush_context = __iommu_flush_context;
2991                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2992                 pr_info("%s: Using Register based invalidation\n",
2993                         iommu->name);
2994         } else {
2995                 iommu->flush.flush_context = qi_flush_context;
2996                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2997                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2998         }
2999 }
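/*
 * Illustrative note (not part of the driver): after intel_iommu_init_qi()
 * callers invalidate through the function pointers without caring which
 * backend was selected, e.g. the global flush issued later in init_dmars():
 *
 *	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
 *	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
 *
 * The same call sites work whether the pointers resolve to the queued
 * (qi_flush_*) or the register-based (__iommu_flush_*) implementations.
 */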
3000
3001 static int copy_context_table(struct intel_iommu *iommu,
3002                               struct root_entry *old_re,
3003                               struct context_entry **tbl,
3004                               int bus, bool ext)
3005 {
3006         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3007         struct context_entry *new_ce = NULL, ce;
3008         struct context_entry *old_ce = NULL;
3009         struct root_entry re;
3010         phys_addr_t old_ce_phys;
3011
3012         tbl_idx = ext ? bus * 2 : bus;
3013         memcpy(&re, old_re, sizeof(re));
3014
3015         for (devfn = 0; devfn < 256; devfn++) {
3016                 /* First calculate the correct index */
3017                 idx = (ext ? devfn * 2 : devfn) % 256;
3018
3019                 if (idx == 0) {
3020                         /* First save what we may have and clean up */
3021                         if (new_ce) {
3022                                 tbl[tbl_idx] = new_ce;
3023                                 __iommu_flush_cache(iommu, new_ce,
3024                                                     VTD_PAGE_SIZE);
3025                                 pos = 1;
3026                         }
3027
3028                         if (old_ce)
3029                                 memunmap(old_ce);
3030
3031                         ret = 0;
3032                         if (devfn < 0x80)
3033                                 old_ce_phys = root_entry_lctp(&re);
3034                         else
3035                                 old_ce_phys = root_entry_uctp(&re);
3036
3037                         if (!old_ce_phys) {
3038                                 if (ext && devfn == 0) {
3039                                         /* No LCTP, try UCTP */
3040                                         devfn = 0x7f;
3041                                         continue;
3042                                 } else {
3043                                         goto out;
3044                                 }
3045                         }
3046
3047                         ret = -ENOMEM;
3048                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3049                                         MEMREMAP_WB);
3050                         if (!old_ce)
3051                                 goto out;
3052
3053                         new_ce = alloc_pgtable_page(iommu->node);
3054                         if (!new_ce)
3055                                 goto out_unmap;
3056
3057                         ret = 0;
3058                 }
3059
3060                 /* Now copy the context entry */
3061                 memcpy(&ce, old_ce + idx, sizeof(ce));
3062
3063                 if (!__context_present(&ce))
3064                         continue;
3065
3066                 did = context_domain_id(&ce);
3067                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3068                         set_bit(did, iommu->domain_ids);
3069
3070                 /*
3071                  * We need a marker for copied context entries. This
3072                  * marker needs to work for the old format as well as
3073                  * for extended context entries.
3074                  *
3075                  * Bit 67 of the context entry is used. In the old
3076                  * format this bit is available to software, in the
3077                  * extended format it is the PGE bit, but PGE is ignored
3078                  * by HW if PASIDs are disabled (and thus still
3079                  * available).
3080                  *
3081                  * So disable PASIDs first and then mark the entry
3082                  * copied. This means that we don't copy PASID
3083                  * translations from the old kernel, but this is fine as
3084                  * faults there are not fatal.
3085                  */
3086                 context_clear_pasid_enable(&ce);
3087                 context_set_copied(&ce);
3088
3089                 new_ce[idx] = ce;
3090         }
3091
3092         tbl[tbl_idx + pos] = new_ce;
3093
3094         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3095
3096 out_unmap:
3097         memunmap(old_ce);
3098
3099 out:
3100         return ret;
3101 }
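/*
 * Worked example (illustrative only) of the indexing used above for the
 * extended root/context format (ext == true): each bus owns two context
 * tables, so tbl_idx = bus * 2 selects the slot pair; entries for
 * devfn < 0x80 come from the lower context table pointer (root_entry_lctp)
 * and entries for devfn >= 0x80 from the upper one (root_entry_uctp), and
 * within a table idx = (devfn * 2) % 256.  For bus 3, devfn 0x81:
 *
 *	tbl_idx = 3 * 2 = 6;            // lower table stored at tbl[6]
 *	// devfn 0x81 >= 0x80, so the upper table is in use; once pos has
 *	// been set to 1 it ends up at tbl[tbl_idx + 1] = tbl[7].
 *	idx = (0x81 * 2) % 256 = 0x02;  // entry index within that table
 */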
3102
3103 static int copy_translation_tables(struct intel_iommu *iommu)
3104 {
3105         struct context_entry **ctxt_tbls;
3106         struct root_entry *old_rt;
3107         phys_addr_t old_rt_phys;
3108         int ctxt_table_entries;
3109         unsigned long flags;
3110         u64 rtaddr_reg;
3111         int bus, ret;
3112         bool new_ext, ext;
3113
3114         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3115         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3116         new_ext    = !!ecap_ecs(iommu->ecap);
3117
3118         /*
3119          * The RTT bit can only be changed when translation is disabled,
3120          * but disabling translation would open a window for data
3121          * corruption. So bail out and don't copy anything if we would
3122          * have to change the bit.
3123          */
3124         if (new_ext != ext)
3125                 return -EINVAL;
3126
3127         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3128         if (!old_rt_phys)
3129                 return -EINVAL;
3130
3131         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3132         if (!old_rt)
3133                 return -ENOMEM;
3134
3135         /* This is too big for the stack - allocate it from slab */
3136         ctxt_table_entries = ext ? 512 : 256;
3137         ret = -ENOMEM;
3138         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3139         if (!ctxt_tbls)
3140                 goto out_unmap;
3141
3142         for (bus = 0; bus < 256; bus++) {
3143                 ret = copy_context_table(iommu, &old_rt[bus],
3144                                          ctxt_tbls, bus, ext);
3145                 if (ret) {
3146                         pr_err("%s: Failed to copy context table for bus %d\n",
3147                                 iommu->name, bus);
3148                         continue;
3149                 }
3150         }
3151
3152         spin_lock_irqsave(&iommu->lock, flags);
3153
3154         /* Context tables are copied, now write them to the root_entry table */
3155         for (bus = 0; bus < 256; bus++) {
3156                 int idx = ext ? bus * 2 : bus;
3157                 u64 val;
3158
3159                 if (ctxt_tbls[idx]) {
3160                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3161                         iommu->root_entry[bus].lo = val;
3162                 }
3163
3164                 if (!ext || !ctxt_tbls[idx + 1])
3165                         continue;
3166
3167                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3168                 iommu->root_entry[bus].hi = val;
3169         }
3170
3171         spin_unlock_irqrestore(&iommu->lock, flags);
3172
3173         kfree(ctxt_tbls);
3174
3175         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3176
3177         ret = 0;
3178
3179 out_unmap:
3180         memunmap(old_rt);
3181
3182         return ret;
3183 }
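/*
 * Illustrative note (not part of the driver): the value written into each
 * root entry above is simply the physical address of the copied context
 * table with the present bit (bit 0) set:
 *
 *	val = virt_to_phys(ctxt_tbls[idx]) | 1;
 *	iommu->root_entry[bus].lo = val;	// lower context table
 *	// and, for the extended format only:
 *	iommu->root_entry[bus].hi = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
 */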
3184
3185 static int __init init_dmars(void)
3186 {
3187         struct dmar_drhd_unit *drhd;
3188         struct intel_iommu *iommu;
3189         int ret;
3190
3191         /*
3192          * for each drhd
3193          *    allocate root
3194          *    initialize and program root entry to not present
3195          * endfor
3196          */
3197         for_each_drhd_unit(drhd) {
3198                 /*
3199                  * No lock needed: this is only incremented in the
3200                  * single-threaded kernel __init code path; all other
3201                  * accesses are read-only.
3202                  */
3203                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3204                         g_num_of_iommus++;
3205                         continue;
3206                 }
3207                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3208         }
3209
3210         /* Preallocate enough resources for IOMMU hot-addition */
3211         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3212                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3213
3214         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3215                         GFP_KERNEL);
3216         if (!g_iommus) {
3217                 pr_err("Allocating global iommu array failed\n");
3218                 ret = -ENOMEM;
3219                 goto error;
3220         }
3221
3222         for_each_iommu(iommu, drhd) {
3223                 if (drhd->ignored) {
3224                         iommu_disable_translation(iommu);
3225                         continue;
3226                 }
3227
3228                 /*
3229                  * Find the max PASID size supported by each IOMMU in the
3230                  * system; the system-wide PASID table must be no bigger
3231                  * than the smallest supported size.
3232                  */
3233                 if (pasid_supported(iommu)) {
3234                         u32 temp = 2 << ecap_pss(iommu->ecap);
3235
3236                         intel_pasid_max_id = min_t(u32, temp,
3237                                                    intel_pasid_max_id);
3238                 }
3239
3240                 g_iommus[iommu->seq_id] = iommu;
3241
3242                 intel_iommu_init_qi(iommu);
3243
3244                 ret = iommu_init_domains(iommu);
3245                 if (ret)
3246                         goto free_iommu;
3247
3248                 init_translation_status(iommu);
3249
3250                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3251                         iommu_disable_translation(iommu);
3252                         clear_translation_pre_enabled(iommu);
3253                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3254                                 iommu->name);
3255                 }
3256
3257                 /*
3258                  * TBD:
3259                  * we could share the same root & context tables
3260                  * among all IOMMUs. This needs to be split out later.
3261                  */
3262                 ret = iommu_alloc_root_entry(iommu);
3263                 if (ret)
3264                         goto free_iommu;
3265
3266                 if (translation_pre_enabled(iommu)) {
3267                         pr_info("Translation already enabled - trying to copy translation structures\n");
3268
3269                         ret = copy_translation_tables(iommu);
3270                         if (ret) {
3271                                 /*
3272                                  * We found the IOMMU with translation
3273                                  * enabled - but failed to copy over the
3274                                  * old root-entry table. Try to proceed
3275                                  * by disabling translation now and
3276                                  * allocating a clean root-entry table.
3277                                  * This might cause DMAR faults, but
3278                                  * probably the dump will still succeed.
3279                                  */
3280                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3281                                        iommu->name);
3282                                 iommu_disable_translation(iommu);
3283                                 clear_translation_pre_enabled(iommu);
3284                         } else {
3285                                 pr_info("Copied translation tables from previous kernel for %s\n",
3286                                         iommu->name);
3287                         }
3288                 }
3289
3290                 if (!ecap_pass_through(iommu->ecap))
3291                         hw_pass_through = 0;
3292 #ifdef CONFIG_INTEL_IOMMU_SVM
3293                 if (pasid_supported(iommu))
3294                         intel_svm_init(iommu);
3295 #endif
3296         }
3297
3298         /*
3299          * Now that qi is enabled on all iommus, set the root entry and flush
3300          * caches. This is required on some Intel X58 chipsets, otherwise the
3301          * flush_context function will loop forever and the boot hangs.
3302          */
3303         for_each_active_iommu(iommu, drhd) {
3304                 iommu_flush_write_buffer(iommu);
3305                 iommu_set_root_entry(iommu);
3306                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3307                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3308         }
3309
3310         if (iommu_default_passthrough())
3311                 iommu_identity_mapping |= IDENTMAP_ALL;
3312
3313 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3314         dmar_map_gfx = 0;
3315 #endif
3316
3317         if (!dmar_map_gfx)
3318                 iommu_identity_mapping |= IDENTMAP_GFX;
3319
3320         check_tylersburg_isoch();
3321
3322         ret = si_domain_init(hw_pass_through);
3323         if (ret)
3324                 goto free_iommu;
3325
3326         /*
3327          * for each drhd
3328          *   enable fault log
3329          *   global invalidate context cache
3330          *   global invalidate iotlb
3331          *   enable translation
3332          */
3333         for_each_iommu(iommu, drhd) {
3334                 if (drhd->ignored) {
3335                         /*
3336                          * we always have to disable PMRs or DMA may fail on
3337                          * this device
3338                          */
3339                         if (force_on)
3340                                 iommu_disable_protect_mem_regions(iommu);
3341                         continue;
3342                 }
3343
3344                 iommu_flush_write_buffer(iommu);
3345
3346 #ifdef CONFIG_INTEL_IOMMU_SVM
3347                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3348                         /*
3349                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3350                          * could cause a lock race condition, so drop the lock here.
3351                          */
3352                         up_write(&dmar_global_lock);
3353                         ret = intel_svm_enable_prq(iommu);
3354                         down_write(&dmar_global_lock);
3355                         if (ret)
3356                                 goto free_iommu;
3357                 }
3358 #endif
3359                 ret = dmar_set_interrupt(iommu);
3360                 if (ret)
3361                         goto free_iommu;
3362         }
3363
3364         return 0;
3365
3366 free_iommu:
3367         for_each_active_iommu(iommu, drhd) {
3368                 disable_dmar_iommu(iommu);
3369                 free_dmar_iommu(iommu);
3370         }
3371
3372         kfree(g_iommus);
3373
3374 error:
3375         return ret;
3376 }
3377
3378 /* This takes a number of _MM_ pages, not VTD pages */
3379 static unsigned long intel_alloc_iova(struct device *dev,
3380                                      struct dmar_domain *domain,
3381                                      unsigned long nrpages, uint64_t dma_mask)
3382 {
3383         unsigned long iova_pfn;
3384
3385         /* Restrict dma_mask to the width that the iommu can handle */
3386         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3387         /* Ensure we reserve the whole size-aligned region */
3388         nrpages = __roundup_pow_of_two(nrpages);
3389
3390         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3391                 /*
3392                  * First try to allocate an IO virtual address within
3393                  * DMA_BIT_MASK(32); if that fails, then try allocating
3394                  * from the higher range.
3395                  */
3396                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3397                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3398                 if (iova_pfn)
3399                         return iova_pfn;
3400         }
3401         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3402                                    IOVA_PFN(dma_mask), true);
3403         if (unlikely(!iova_pfn)) {
3404                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3405                 return 0;
3406         }
3407
3408         return iova_pfn;
3409 }
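/*
 * Illustrative example (hypothetical numbers): for a request of 5 MM pages
 * with a 64-bit dma_mask on a 48-bit domain, intel_alloc_iova() would
 * behave roughly as follows:
 *
 *	dma_mask = min(DOMAIN_MAX_ADDR(48), DMA_BIT_MASK(64)); // clamp to 48 bits
 *	nrpages  = __roundup_pow_of_two(5);                    // reserve 8 pages
 *	// first attempt: allocate below 4 GiB (IOVA_PFN(DMA_BIT_MASK(32)))
 *	// fallback: allocate anywhere below the clamped dma_mask
 */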
3410
3411 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3412 {
3413         struct dmar_domain *domain, *tmp;
3414         struct dmar_rmrr_unit *rmrr;
3415         struct device *i_dev;
3416         int i, ret;
3417
3418         /* The device shouldn't be attached to any domain yet. */
3419         domain = find_domain(dev);
3420         if (domain)
3421                 return NULL;
3422
3423         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3424         if (!domain)
3425                 goto out;
3426
3427         /* We have a new domain - setup possible RMRRs for the device */
3428         rcu_read_lock();
3429         for_each_rmrr_units(rmrr) {
3430                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3431                                           i, i_dev) {
3432                         if (i_dev != dev)
3433                                 continue;
3434
3435                         ret = domain_prepare_identity_map(dev, domain,
3436                                                           rmrr->base_address,
3437                                                           rmrr->end_address);
3438                         if (ret)
3439                                 dev_err(dev, "Mapping reserved region failed\n");
3440                 }
3441         }
3442         rcu_read_unlock();
3443
3444         tmp = set_domain_for_dev(dev, domain);
3445         if (!tmp || domain != tmp) {
3446                 domain_exit(domain);
3447                 domain = tmp;
3448         }
3449
3450 out:
3451         if (!domain)
3452                 dev_err(dev, "Allocating domain failed\n");
3453         else
3454                 domain->domain.type = IOMMU_DOMAIN_DMA;
3455
3456         return domain;
3457 }
3458
3459 /* Check if the dev needs to go through the non-identity map and unmap process. */
3460 static bool iommu_need_mapping(struct device *dev)
3461 {
3462         int ret;
3463
3464         if (iommu_dummy(dev))
3465                 return false;
3466
3467         ret = identity_mapping(dev);
3468         if (ret) {
3469                 u64 dma_mask = *dev->dma_mask;
3470
3471                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3472                         dma_mask = dev->coherent_dma_mask;
3473
3474                 if (dma_mask >= dma_direct_get_required_mask(dev))
3475                         return false;
3476
3477                 /*
3478                  * The 32-bit DMA device is removed from si_domain and falls
3479                  * back to a non-identity mapping.
3480                  */
3481                 dmar_remove_one_dev_info(dev);
3482                 ret = iommu_request_dma_domain_for_dev(dev);
3483                 if (ret) {
3484                         struct iommu_domain *domain;
3485                         struct dmar_domain *dmar_domain;
3486
3487                         domain = iommu_get_domain_for_dev(dev);
3488                         if (domain) {
3489                                 dmar_domain = to_dmar_domain(domain);
3490                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3491                         }
3492                         dmar_remove_one_dev_info(dev);
3493                         get_private_domain_for_dev(dev);
3494                 }
3495
3496                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3497         }
3498
3499         return true;
3500 }
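/*
 * Illustrative summary (not part of the driver) of the decision made above
 * for a device currently in the identity map, where dma_mask is the
 * smaller of the device's streaming and coherent DMA masks:
 *
 *	if (dma_mask >= dma_direct_get_required_mask(dev))
 *		;	// keep the identity mapping, use direct DMA (returns false)
 *	else
 *		;	// device can't reach all memory: move it out of si_domain
 *			// into a DMA domain and translate (returns true)
 */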
3501
3502 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3503                                      size_t size, int dir, u64 dma_mask)
3504 {
3505         struct dmar_domain *domain;
3506         phys_addr_t start_paddr;
3507         unsigned long iova_pfn;
3508         int prot = 0;
3509         int ret;
3510         struct intel_iommu *iommu;
3511         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3512
3513         BUG_ON(dir == DMA_NONE);
3514
3515         domain = find_domain(dev);
3516         if (!domain)
3517                 return DMA_MAPPING_ERROR;
3518
3519         iommu = domain_get_iommu(domain);
3520         size = aligned_nrpages(paddr, size);
3521
3522         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3523         if (!iova_pfn)
3524                 goto error;
3525
3526         /*
3527          * Check if DMAR supports zero-length reads on write-only
3528          * mappings.
3529          */
3530         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3531                         !cap_zlr(iommu->cap))
3532                 prot |= DMA_PTE_READ;
3533         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3534                 prot |= DMA_PTE_WRITE;
3535         /*
3536          * paddr .. (paddr + size) might span partial pages, so we should map
3537          * the whole page.  Note: if two parts of one page are mapped
3538          * separately, we might have two guest_addrs mapping to the same host
3539          * paddr, but this is not a big problem.
3540          */
3541         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3542                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3543         if (ret)
3544                 goto error;
3545
3546         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3547         start_paddr += paddr & ~PAGE_MASK;
3548
3549         trace_map_single(dev, start_paddr, paddr, size << VTD_PAGE_SHIFT);
3550
3551         return start_paddr;
3552
3553 error:
3554         if (iova_pfn)
3555                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3556         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3557                 size, (unsigned long long)paddr, dir);
3558         return DMA_MAPPING_ERROR;
3559 }
3560
3561 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3562                                  unsigned long offset, size_t size,
3563                                  enum dma_data_direction dir,
3564                                  unsigned long attrs)
3565 {
3566         if (iommu_need_mapping(dev))
3567                 return __intel_map_single(dev, page_to_phys(page) + offset,
3568                                 size, dir, *dev->dma_mask);
3569         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3570 }
3571
3572 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3573                                      size_t size, enum dma_data_direction dir,
3574                                      unsigned long attrs)
3575 {
3576         if (iommu_need_mapping(dev))
3577                 return __intel_map_single(dev, phys_addr, size, dir,
3578                                 *dev->dma_mask);
3579         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3580 }
3581
3582 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3583 {
3584         struct dmar_domain *domain;
3585         unsigned long start_pfn, last_pfn;
3586         unsigned long nrpages;
3587         unsigned long iova_pfn;
3588         struct intel_iommu *iommu;
3589         struct page *freelist;
3590         struct pci_dev *pdev = NULL;
3591
3592         domain = find_domain(dev);
3593         BUG_ON(!domain);
3594
3595         iommu = domain_get_iommu(domain);
3596
3597         iova_pfn = IOVA_PFN(dev_addr);
3598
3599         nrpages = aligned_nrpages(dev_addr, size);
3600         start_pfn = mm_to_dma_pfn(iova_pfn);
3601         last_pfn = start_pfn + nrpages - 1;
3602
3603         if (dev_is_pci(dev))
3604                 pdev = to_pci_dev(dev);
3605
3606         freelist = domain_unmap(domain, start_pfn, last_pfn);
3607         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3608                         !has_iova_flush_queue(&domain->iovad)) {
3609                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3610                                       nrpages, !freelist, 0);
3611                 /* free iova */
3612                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3613                 dma_free_pagelist(freelist);
3614         } else {
3615                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3616                            (unsigned long)freelist);
3617                 /*
3618                  * Queue up the release of the unmap to save roughly the 1/6th of
3619                  * the CPU time otherwise used up by the IOTLB flush operation.
3620                  */
3621         }
3622
3623         trace_unmap_single(dev, dev_addr, size);
3624 }
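/*
 * Illustrative note (not part of the driver): the unmap path above flushes
 * synchronously only when it must.  Roughly:
 *
 *	if (intel_iommu_strict || (pdev && pdev->untrusted) ||
 *	    !has_iova_flush_queue(&domain->iovad))
 *		;	// flush the IOTLB now, then free the IOVA and page list
 *	else
 *		;	// defer via queue_iova(); the flush queue batches the
 *			// IOTLB invalidation and page freeing for later
 */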
3625
3626 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3627                              size_t size, enum dma_data_direction dir,
3628                              unsigned long attrs)
3629 {
3630         if (iommu_need_mapping(dev))
3631                 intel_unmap(dev, dev_addr, size);
3632         else
3633                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3634 }
3635
3636 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3637                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3638 {
3639         if (iommu_need_mapping(dev))
3640                 intel_unmap(dev, dev_addr, size);
3641 }
3642
3643 static void *intel_alloc_coherent(struct device *dev, size_t size,
3644                                   dma_addr_t *dma_handle, gfp_t flags,
3645                                   unsigned long attrs)
3646 {
3647         struct page *page = NULL;
3648         int order;
3649
3650         if (!iommu_need_mapping(dev))
3651                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3652
3653         size = PAGE_ALIGN(size);
3654         order = get_order(size);
3655
3656         if (gfpflags_allow_blocking(flags)) {
3657                 unsigned int count = size >> PAGE_SHIFT;
3658
3659                 page = dma_alloc_from_contiguous(dev, count, order,
3660                                                  flags & __GFP_NOWARN);
3661         }
3662
3663         if (!page)
3664                 page = alloc_pages(flags, order);
3665         if (!page)
3666                 return NULL;
3667         memset(page_address(page), 0, size);
3668
3669         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3670                                          DMA_BIDIRECTIONAL,
3671                                          dev->coherent_dma_mask);
3672         if (*dma_handle != DMA_MAPPING_ERROR)
3673                 return page_address(page);
3674         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3675                 __free_pages(page, order);
3676
3677         return NULL;
3678 }
3679
3680 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3681                                 dma_addr_t dma_handle, unsigned long attrs)
3682 {
3683         int order;
3684         struct page *page = virt_to_page(vaddr);
3685
3686         if (!iommu_need_mapping(dev))
3687                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3688
3689         size = PAGE_ALIGN(size);
3690         order = get_order(size);
3691
3692         intel_unmap(dev, dma_handle, size);
3693         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3694                 __free_pages(page, order);
3695 }
3696
3697 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3698                            int nelems, enum dma_data_direction dir,
3699                            unsigned long attrs)
3700 {
3701         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3702         unsigned long nrpages = 0;
3703         struct scatterlist *sg;
3704         int i;
3705
3706         if (!iommu_need_mapping(dev))
3707                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3708
3709         for_each_sg(sglist, sg, nelems, i) {
3710                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3711         }
3712
3713         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3714
3715         trace_unmap_sg(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3716 }
3717
3718 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3719                         enum dma_data_direction dir, unsigned long attrs)
3720 {
3721         int i;
3722         struct dmar_domain *domain;
3723         size_t size = 0;
3724         int prot = 0;
3725         unsigned long iova_pfn;
3726         int ret;
3727         struct scatterlist *sg;
3728         unsigned long start_vpfn;
3729         struct intel_iommu *iommu;
3730
3731         BUG_ON(dir == DMA_NONE);
3732         if (!iommu_need_mapping(dev))
3733                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3734
3735         domain = find_domain(dev);
3736         if (!domain)
3737                 return 0;
3738
3739         iommu = domain_get_iommu(domain);
3740
3741         for_each_sg(sglist, sg, nelems, i)
3742                 size += aligned_nrpages(sg->offset, sg->length);
3743
3744         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3745                                 *dev->dma_mask);
3746         if (!iova_pfn) {
3747                 sglist->dma_length = 0;
3748                 return 0;
3749         }
3750
3751         /*
3752          * Check if DMAR supports zero-length reads on write-only
3753          * mappings.
3754          */
3755         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3756                         !cap_zlr(iommu->cap))
3757                 prot |= DMA_PTE_READ;
3758         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3759                 prot |= DMA_PTE_WRITE;
3760
3761         start_vpfn = mm_to_dma_pfn(iova_pfn);
3762
3763         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3764         if (unlikely(ret)) {
3765                 dma_pte_free_pagetable(domain, start_vpfn,
3766                                        start_vpfn + size - 1,
3767                                        agaw_to_level(domain->agaw) + 1);
3768                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3769                 return 0;
3770         }
3771
3772         trace_map_sg(dev, iova_pfn << PAGE_SHIFT,
3773                      sg_phys(sglist), size << VTD_PAGE_SHIFT);
3774
3775         return nelems;
3776 }
3777
3778 static u64 intel_get_required_mask(struct device *dev)
3779 {
3780         if (!iommu_need_mapping(dev))
3781                 return dma_direct_get_required_mask(dev);
3782         return DMA_BIT_MASK(32);
3783 }
3784
3785 static const struct dma_map_ops intel_dma_ops = {
3786         .alloc = intel_alloc_coherent,
3787         .free = intel_free_coherent,
3788         .map_sg = intel_map_sg,
3789         .unmap_sg = intel_unmap_sg,
3790         .map_page = intel_map_page,
3791         .unmap_page = intel_unmap_page,
3792         .map_resource = intel_map_resource,
3793         .unmap_resource = intel_unmap_resource,
3794         .dma_supported = dma_direct_supported,
3795         .mmap = dma_common_mmap,
3796         .get_sgtable = dma_common_get_sgtable,
3797         .get_required_mask = intel_get_required_mask,
3798 };
3799
3800 static void
3801 bounce_sync_single(struct device *dev, dma_addr_t addr, size_t size,
3802                    enum dma_data_direction dir, enum dma_sync_target target)
3803 {
3804         struct dmar_domain *domain;
3805         phys_addr_t tlb_addr;
3806
3807         domain = find_domain(dev);
3808         if (WARN_ON(!domain))
3809                 return;
3810
3811         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, addr);
3812         if (is_swiotlb_buffer(tlb_addr))
3813                 swiotlb_tbl_sync_single(dev, tlb_addr, size, dir, target);
3814 }
3815
3816 static dma_addr_t
3817 bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size,
3818                   enum dma_data_direction dir, unsigned long attrs,
3819                   u64 dma_mask)
3820 {
3821         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3822         struct dmar_domain *domain;
3823         struct intel_iommu *iommu;
3824         unsigned long iova_pfn;
3825         unsigned long nrpages;
3826         phys_addr_t tlb_addr;
3827         int prot = 0;
3828         int ret;
3829
3830         domain = find_domain(dev);
3831         if (WARN_ON(dir == DMA_NONE || !domain))
3832                 return DMA_MAPPING_ERROR;
3833
3834         iommu = domain_get_iommu(domain);
3835         if (WARN_ON(!iommu))
3836                 return DMA_MAPPING_ERROR;
3837
3838         nrpages = aligned_nrpages(0, size);
3839         iova_pfn = intel_alloc_iova(dev, domain,
3840                                     dma_to_mm_pfn(nrpages), dma_mask);
3841         if (!iova_pfn)
3842                 return DMA_MAPPING_ERROR;
3843
3844         /*
3845          * Check if DMAR supports zero-length reads on write-only
3846          * mappings.
3847          */
3848         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3849                         !cap_zlr(iommu->cap))
3850                 prot |= DMA_PTE_READ;
3851         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3852                 prot |= DMA_PTE_WRITE;
3853
3854         /*
3855          * If both the physical buffer start address and size are
3856          * page aligned, we don't need to use a bounce page.
3857          */
3858         if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
3859                 tlb_addr = swiotlb_tbl_map_single(dev,
3860                                 __phys_to_dma(dev, io_tlb_start),
3861                                 paddr, size, aligned_size, dir, attrs);
3862                 if (tlb_addr == DMA_MAPPING_ERROR) {
3863                         goto swiotlb_error;
3864                 } else {
3865                         /* Cleanup the padding area. */
3866                         void *padding_start = phys_to_virt(tlb_addr);
3867                         size_t padding_size = aligned_size;
3868
3869                         if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
3870                             (dir == DMA_TO_DEVICE ||
3871                              dir == DMA_BIDIRECTIONAL)) {
3872                                 padding_start += size;
3873                                 padding_size -= size;
3874                         }
3875
3876                         memset(padding_start, 0, padding_size);
3877                 }
3878         } else {
3879                 tlb_addr = paddr;
3880         }
3881
3882         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3883                                  tlb_addr >> VTD_PAGE_SHIFT, nrpages, prot);
3884         if (ret)
3885                 goto mapping_error;
3886
3887         trace_bounce_map_single(dev, iova_pfn << PAGE_SHIFT, paddr, size);
3888
3889         return (phys_addr_t)iova_pfn << PAGE_SHIFT;
3890
3891 mapping_error:
3892         if (is_swiotlb_buffer(tlb_addr))
3893                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3894                                          aligned_size, dir, attrs);
3895 swiotlb_error:
3896         free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3897         dev_err(dev, "Device bounce map: %zx@%llx dir %d --- failed\n",
3898                 size, (unsigned long long)paddr, dir);
3899
3900         return DMA_MAPPING_ERROR;
3901 }
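/*
 * Illustrative example (hypothetical numbers): bounce_map_single() only
 * falls back to swiotlb when the buffer does not occupy whole VT-d pages:
 *
 *	paddr = 0x12340100, size = 0x200
 *	IS_ALIGNED(paddr | size, VTD_PAGE_SIZE) == false -> bounce via swiotlb
 *
 *	paddr = 0x12340000, size = 0x2000
 *	IS_ALIGNED(paddr | size, VTD_PAGE_SIZE) == true  -> map in place,
 *	tlb_addr = paddr, no bounce buffer needed
 */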
3902
3903 static void
3904 bounce_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
3905                     enum dma_data_direction dir, unsigned long attrs)
3906 {
3907         size_t aligned_size = ALIGN(size, VTD_PAGE_SIZE);
3908         struct dmar_domain *domain;
3909         phys_addr_t tlb_addr;
3910
3911         domain = find_domain(dev);
3912         if (WARN_ON(!domain))
3913                 return;
3914
3915         tlb_addr = intel_iommu_iova_to_phys(&domain->domain, dev_addr);
3916         if (WARN_ON(!tlb_addr))
3917                 return;
3918
3919         intel_unmap(dev, dev_addr, size);
3920         if (is_swiotlb_buffer(tlb_addr))
3921                 swiotlb_tbl_unmap_single(dev, tlb_addr, size,
3922                                          aligned_size, dir, attrs);
3923
3924         trace_bounce_unmap_single(dev, dev_addr, size);
3925 }
3926
3927 static dma_addr_t
3928 bounce_map_page(struct device *dev, struct page *page, unsigned long offset,
3929                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3930 {
3931         return bounce_map_single(dev, page_to_phys(page) + offset,
3932                                  size, dir, attrs, *dev->dma_mask);
3933 }
3934
3935 static dma_addr_t
3936 bounce_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
3937                     enum dma_data_direction dir, unsigned long attrs)
3938 {
3939         return bounce_map_single(dev, phys_addr, size,
3940                                  dir, attrs, *dev->dma_mask);
3941 }
3942
3943 static void
3944 bounce_unmap_page(struct device *dev, dma_addr_t dev_addr, size_t size,
3945                   enum dma_data_direction dir, unsigned long attrs)
3946 {
3947         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3948 }
3949
3950 static void
3951 bounce_unmap_resource(struct device *dev, dma_addr_t dev_addr, size_t size,
3952                       enum dma_data_direction dir, unsigned long attrs)
3953 {
3954         bounce_unmap_single(dev, dev_addr, size, dir, attrs);
3955 }
3956
3957 static void
3958 bounce_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3959                 enum dma_data_direction dir, unsigned long attrs)
3960 {
3961         struct scatterlist *sg;
3962         int i;
3963
3964         for_each_sg(sglist, sg, nelems, i)
3965                 bounce_unmap_page(dev, sg->dma_address,
3966                                   sg_dma_len(sg), dir, attrs);
3967 }
3968
3969 static int
3970 bounce_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3971               enum dma_data_direction dir, unsigned long attrs)
3972 {
3973         int i;
3974         struct scatterlist *sg;
3975
3976         for_each_sg(sglist, sg, nelems, i) {
3977                 sg->dma_address = bounce_map_page(dev, sg_page(sg),
3978                                                   sg->offset, sg->length,
3979                                                   dir, attrs);
3980                 if (sg->dma_address == DMA_MAPPING_ERROR)
3981                         goto out_unmap;
3982                 sg_dma_len(sg) = sg->length;
3983         }
3984
3985         return nelems;
3986
3987 out_unmap:
3988         bounce_unmap_sg(dev, sglist, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
3989         return 0;
3990 }
3991
3992 static void
3993 bounce_sync_single_for_cpu(struct device *dev, dma_addr_t addr,
3994                            size_t size, enum dma_data_direction dir)
3995 {
3996         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_CPU);
3997 }
3998
3999 static void
4000 bounce_sync_single_for_device(struct device *dev, dma_addr_t addr,
4001                               size_t size, enum dma_data_direction dir)
4002 {
4003         bounce_sync_single(dev, addr, size, dir, SYNC_FOR_DEVICE);
4004 }
4005
4006 static void
4007 bounce_sync_sg_for_cpu(struct device *dev, struct scatterlist *sglist,
4008                        int nelems, enum dma_data_direction dir)
4009 {
4010         struct scatterlist *sg;
4011         int i;
4012
4013         for_each_sg(sglist, sg, nelems, i)
4014                 bounce_sync_single(dev, sg_dma_address(sg),
4015                                    sg_dma_len(sg), dir, SYNC_FOR_CPU);
4016 }
4017
4018 static void
4019 bounce_sync_sg_for_device(struct device *dev, struct scatterlist *sglist,
4020                           int nelems, enum dma_data_direction dir)
4021 {
4022         struct scatterlist *sg;
4023         int i;
4024
4025         for_each_sg(sglist, sg, nelems, i)
4026                 bounce_sync_single(dev, sg_dma_address(sg),
4027                                    sg_dma_len(sg), dir, SYNC_FOR_DEVICE);
4028 }
4029
4030 static const struct dma_map_ops bounce_dma_ops = {
4031         .alloc                  = intel_alloc_coherent,
4032         .free                   = intel_free_coherent,
4033         .map_sg                 = bounce_map_sg,
4034         .unmap_sg               = bounce_unmap_sg,
4035         .map_page               = bounce_map_page,
4036         .unmap_page             = bounce_unmap_page,
4037         .sync_single_for_cpu    = bounce_sync_single_for_cpu,
4038         .sync_single_for_device = bounce_sync_single_for_device,
4039         .sync_sg_for_cpu        = bounce_sync_sg_for_cpu,
4040         .sync_sg_for_device     = bounce_sync_sg_for_device,
4041         .map_resource           = bounce_map_resource,
4042         .unmap_resource         = bounce_unmap_resource,
4043         .dma_supported          = dma_direct_supported,
4044 };
4045
4046 static inline int iommu_domain_cache_init(void)
4047 {
4048         int ret = 0;
4049
4050         iommu_domain_cache = kmem_cache_create("iommu_domain",
4051                                          sizeof(struct dmar_domain),
4052                                          0,
4053                                          SLAB_HWCACHE_ALIGN,
4055                                          NULL);
4056         if (!iommu_domain_cache) {
4057                 pr_err("Couldn't create iommu_domain cache\n");
4058                 ret = -ENOMEM;
4059         }
4060
4061         return ret;
4062 }
4063
4064 static inline int iommu_devinfo_cache_init(void)
4065 {
4066         int ret = 0;
4067
4068         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
4069                                          sizeof(struct device_domain_info),
4070                                          0,
4071                                          SLAB_HWCACHE_ALIGN,
4072                                          NULL);
4073         if (!iommu_devinfo_cache) {
4074                 pr_err("Couldn't create devinfo cache\n");
4075                 ret = -ENOMEM;
4076         }
4077
4078         return ret;
4079 }
4080
4081 static int __init iommu_init_mempool(void)
4082 {
4083         int ret;
4084         ret = iova_cache_get();
4085         if (ret)
4086                 return ret;
4087
4088         ret = iommu_domain_cache_init();
4089         if (ret)
4090                 goto domain_error;
4091
4092         ret = iommu_devinfo_cache_init();
4093         if (!ret)
4094                 return ret;
4095
4096         kmem_cache_destroy(iommu_domain_cache);
4097 domain_error:
4098         iova_cache_put();
4099
4100         return -ENOMEM;
4101 }
4102
4103 static void __init iommu_exit_mempool(void)
4104 {
4105         kmem_cache_destroy(iommu_devinfo_cache);
4106         kmem_cache_destroy(iommu_domain_cache);
4107         iova_cache_put();
4108 }
4109
4110 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
4111 {
4112         struct dmar_drhd_unit *drhd;
4113         u32 vtbar;
4114         int rc;
4115
4116         /* We know that this device on this chipset has its own IOMMU.
4117          * If we find it under a different IOMMU, then the BIOS is lying
4118          * to us. Hope that the IOMMU for this device is actually
4119          * disabled, and it needs no translation...
4120          */
4121         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4122         if (rc) {
4123                 /* "can't" happen */
4124                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4125                 return;
4126         }
4127         vtbar &= 0xffff0000;
4128
4129         /* we know that this iommu should be at offset 0xa000 from vtbar */
4130         drhd = dmar_find_matched_drhd_unit(pdev);
4131         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4132                             TAINT_FIRMWARE_WORKAROUND,
4133                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4134                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4135 }
4136 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
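/*
 * Illustrative example (hypothetical values) of the check performed by
 * quirk_ioat_snb_local_iommu(): with vtbar read as 0xfed91234 from config
 * offset 0xb0 of device 00.0, the masked base is
 *
 *	vtbar &= 0xffff0000;			// 0xfed90000
 *	// expected: drhd->reg_base_addr == vtbar + 0xa000 == 0xfed9a000
 *
 * Any other DRHD base address means the BIOS reported the wrong VT-d unit
 * for the QuickData device, and the device is marked as not translated.
 */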
4137
4138 static void __init init_no_remapping_devices(void)
4139 {
4140         struct dmar_drhd_unit *drhd;
4141         struct device *dev;
4142         int i;
4143
4144         for_each_drhd_unit(drhd) {
4145                 if (!drhd->include_all) {
4146                         for_each_active_dev_scope(drhd->devices,
4147                                                   drhd->devices_cnt, i, dev)
4148                                 break;
4149                         /* ignore DMAR unit if no devices exist */
4150                         if (i == drhd->devices_cnt)
4151                                 drhd->ignored = 1;
4152                 }
4153         }
4154
4155         for_each_active_drhd_unit(drhd) {
4156                 if (drhd->include_all)
4157                         continue;
4158
4159                 for_each_active_dev_scope(drhd->devices,
4160                                           drhd->devices_cnt, i, dev)
4161                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4162                                 break;
4163                 if (i < drhd->devices_cnt)
4164                         continue;
4165
4166                 /* This IOMMU has *only* gfx devices. If graphics mapping is
4167                    disabled, bypass it entirely. */
4168                 if (!dmar_map_gfx) {
4169                         drhd->ignored = 1;
4170                         for_each_active_dev_scope(drhd->devices,
4171                                                   drhd->devices_cnt, i, dev)
4172                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4173                 }
4174         }
4175 }
4176
4177 #ifdef CONFIG_SUSPEND
4178 static int init_iommu_hw(void)
4179 {
4180         struct dmar_drhd_unit *drhd;
4181         struct intel_iommu *iommu = NULL;
4182
4183         for_each_active_iommu(iommu, drhd)
4184                 if (iommu->qi)
4185                         dmar_reenable_qi(iommu);
4186
4187         for_each_iommu(iommu, drhd) {
4188                 if (drhd->ignored) {
4189                         /*
4190                          * we always have to disable PMRs or DMA may fail on
4191                          * this device
4192                          */
4193                         if (force_on)
4194                                 iommu_disable_protect_mem_regions(iommu);
4195                         continue;
4196                 }
4197
4198                 iommu_flush_write_buffer(iommu);
4199
4200                 iommu_set_root_entry(iommu);
4201
4202                 iommu->flush.flush_context(iommu, 0, 0, 0,
4203                                            DMA_CCMD_GLOBAL_INVL);
4204                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4205                 iommu_enable_translation(iommu);
4206                 iommu_disable_protect_mem_regions(iommu);
4207         }
4208
4209         return 0;
4210 }
4211
4212 static void iommu_flush_all(void)
4213 {
4214         struct dmar_drhd_unit *drhd;
4215         struct intel_iommu *iommu;
4216
4217         for_each_active_iommu(iommu, drhd) {
4218                 iommu->flush.flush_context(iommu, 0, 0, 0,
4219                                            DMA_CCMD_GLOBAL_INVL);
4220                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4221                                          DMA_TLB_GLOBAL_FLUSH);
4222         }
4223 }
4224
4225 static int iommu_suspend(void)
4226 {
4227         struct dmar_drhd_unit *drhd;
4228         struct intel_iommu *iommu = NULL;
4229         unsigned long flag;
4230
4231         for_each_active_iommu(iommu, drhd) {
4232                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4233                                                  GFP_ATOMIC);
4234                 if (!iommu->iommu_state)
4235                         goto nomem;
4236         }
4237
4238         iommu_flush_all();
4239
4240         for_each_active_iommu(iommu, drhd) {
4241                 iommu_disable_translation(iommu);
4242
4243                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4244
4245                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4246                         readl(iommu->reg + DMAR_FECTL_REG);
4247                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4248                         readl(iommu->reg + DMAR_FEDATA_REG);
4249                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4250                         readl(iommu->reg + DMAR_FEADDR_REG);
4251                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4252                         readl(iommu->reg + DMAR_FEUADDR_REG);
4253
4254                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4255         }
4256         return 0;
4257
4258 nomem:
4259         for_each_active_iommu(iommu, drhd)
4260                 kfree(iommu->iommu_state);
4261
4262         return -ENOMEM;
4263 }
4264
4265 static void iommu_resume(void)
4266 {
4267         struct dmar_drhd_unit *drhd;
4268         struct intel_iommu *iommu = NULL;
4269         unsigned long flag;
4270
4271         if (init_iommu_hw()) {
4272                 if (force_on)
4273                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4274                 else
4275                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4276                 return;
4277         }
4278
4279         for_each_active_iommu(iommu, drhd) {
4280
4281                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4282
4283                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4284                         iommu->reg + DMAR_FECTL_REG);
4285                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4286                         iommu->reg + DMAR_FEDATA_REG);
4287                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4288                         iommu->reg + DMAR_FEADDR_REG);
4289                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4290                         iommu->reg + DMAR_FEUADDR_REG);
4291
4292                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4293         }
4294
4295         for_each_active_iommu(iommu, drhd)
4296                 kfree(iommu->iommu_state);
4297 }
4298
4299 static struct syscore_ops iommu_syscore_ops = {
4300         .resume         = iommu_resume,
4301         .suspend        = iommu_suspend,
4302 };
4303
4304 static void __init init_iommu_pm_ops(void)
4305 {
4306         register_syscore_ops(&iommu_syscore_ops);
4307 }
4308
4309 #else
4310 static inline void init_iommu_pm_ops(void) {}
4311 #endif  /* CONFIG_SUSPEND */
4312
4313 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4314 {
4315         struct acpi_dmar_reserved_memory *rmrr;
4316         struct dmar_rmrr_unit *rmrru;
4317
4318         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4319         if (!rmrru)
4320                 goto out;
4321
4322         rmrru->hdr = header;
4323         rmrr = (struct acpi_dmar_reserved_memory *)header;
4324         rmrru->base_address = rmrr->base_address;
4325         rmrru->end_address = rmrr->end_address;
4326
4327         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4328                                 ((void *)rmrr) + rmrr->header.length,
4329                                 &rmrru->devices_cnt);
4330         if (rmrru->devices_cnt && rmrru->devices == NULL)
4331                 goto free_rmrru;
4332
4333         list_add(&rmrru->list, &dmar_rmrr_units);
4334
4335         return 0;
4336 free_rmrru:
4337         kfree(rmrru);
4338 out:
4339         return -ENOMEM;
4340 }
4341
4342 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4343 {
4344         struct dmar_atsr_unit *atsru;
4345         struct acpi_dmar_atsr *tmp;
4346
4347         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4348                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4349                 if (atsr->segment != tmp->segment)
4350                         continue;
4351                 if (atsr->header.length != tmp->header.length)
4352                         continue;
4353                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4354                         return atsru;
4355         }
4356
4357         return NULL;
4358 }
4359
4360 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4361 {
4362         struct acpi_dmar_atsr *atsr;
4363         struct dmar_atsr_unit *atsru;
4364
4365         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4366                 return 0;
4367
4368         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4369         atsru = dmar_find_atsr(atsr);
4370         if (atsru)
4371                 return 0;
4372
4373         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4374         if (!atsru)
4375                 return -ENOMEM;
4376
4377         /*
4378          * If memory is allocated from slab by ACPI _DSM method, we need to
4379          * copy the memory content because the memory buffer will be freed
4380          * on return.
4381          */
4382         atsru->hdr = (void *)(atsru + 1);
4383         memcpy(atsru->hdr, hdr, hdr->length);
4384         atsru->include_all = atsr->flags & 0x1;
4385         if (!atsru->include_all) {
4386                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4387                                 (void *)atsr + atsr->header.length,
4388                                 &atsru->devices_cnt);
4389                 if (atsru->devices_cnt && atsru->devices == NULL) {
4390                         kfree(atsru);
4391                         return -ENOMEM;
4392                 }
4393         }
4394
4395         list_add_rcu(&atsru->list, &dmar_atsr_units);
4396
4397         return 0;
4398 }
4399
4400 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4401 {
4402         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4403         kfree(atsru);
4404 }
4405
4406 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4407 {
4408         struct acpi_dmar_atsr *atsr;
4409         struct dmar_atsr_unit *atsru;
4410
4411         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4412         atsru = dmar_find_atsr(atsr);
4413         if (atsru) {
4414                 list_del_rcu(&atsru->list);
4415                 synchronize_rcu();
4416                 intel_iommu_free_atsr(atsru);
4417         }
4418
4419         return 0;
4420 }
4421
4422 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4423 {
4424         int i;
4425         struct device *dev;
4426         struct acpi_dmar_atsr *atsr;
4427         struct dmar_atsr_unit *atsru;
4428
4429         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4430         atsru = dmar_find_atsr(atsr);
4431         if (!atsru)
4432                 return 0;
4433
4434         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4435                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4436                                           i, dev)
4437                         return -EBUSY;
4438         }
4439
4440         return 0;
4441 }
4442
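/*
 * Bring a hot-added DMAR unit online: verify that it is compatible with the
 * features already in use (pass-through, snooping, super pages), allocate
 * its domain IDs and root entry, set up queued invalidation and the fault
 * interrupt, and finally enable translation with PMRs disabled.
 */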
4443 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4444 {
4445         int sp, ret;
4446         struct intel_iommu *iommu = dmaru->iommu;
4447
4448         if (g_iommus[iommu->seq_id])
4449                 return 0;
4450
4451         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4452                 pr_warn("%s: Doesn't support hardware pass through.\n",
4453                         iommu->name);
4454                 return -ENXIO;
4455         }
4456         if (!ecap_sc_support(iommu->ecap) &&
4457             domain_update_iommu_snooping(iommu)) {
4458                 pr_warn("%s: Doesn't support snooping.\n",
4459                         iommu->name);
4460                 return -ENXIO;
4461         }
4462         sp = domain_update_iommu_superpage(iommu) - 1;
4463         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4464                 pr_warn("%s: Doesn't support large page.\n",
4465                         iommu->name);
4466                 return -ENXIO;
4467         }
4468
4469         /*
4470          * Disable translation if already enabled prior to OS handover.
4471          */
4472         if (iommu->gcmd & DMA_GCMD_TE)
4473                 iommu_disable_translation(iommu);
4474
4475         g_iommus[iommu->seq_id] = iommu;
4476         ret = iommu_init_domains(iommu);
4477         if (ret == 0)
4478                 ret = iommu_alloc_root_entry(iommu);
4479         if (ret)
4480                 goto out;
4481
4482 #ifdef CONFIG_INTEL_IOMMU_SVM
4483         if (pasid_supported(iommu))
4484                 intel_svm_init(iommu);
4485 #endif
4486
4487         if (dmaru->ignored) {
4488                 /*
4489                  * We always have to disable PMRs or DMA may fail on this device.
4490                  */
4491                 if (force_on)
4492                         iommu_disable_protect_mem_regions(iommu);
4493                 return 0;
4494         }
4495
4496         intel_iommu_init_qi(iommu);
4497         iommu_flush_write_buffer(iommu);
4498
4499 #ifdef CONFIG_INTEL_IOMMU_SVM
4500         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4501                 ret = intel_svm_enable_prq(iommu);
4502                 if (ret)
4503                         goto disable_iommu;
4504         }
4505 #endif
4506         ret = dmar_set_interrupt(iommu);
4507         if (ret)
4508                 goto disable_iommu;
4509
4510         iommu_set_root_entry(iommu);
4511         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4512         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4513         iommu_enable_translation(iommu);
4514
4515         iommu_disable_protect_mem_regions(iommu);
4516         return 0;
4517
4518 disable_iommu:
4519         disable_dmar_iommu(iommu);
4520 out:
4521         free_dmar_iommu(iommu);
4522         return ret;
4523 }
4524
4525 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4526 {
4527         int ret = 0;
4528         struct intel_iommu *iommu = dmaru->iommu;
4529
4530         if (!intel_iommu_enabled)
4531                 return 0;
4532         if (iommu == NULL)
4533                 return -EINVAL;
4534
4535         if (insert) {
4536                 ret = intel_iommu_add(dmaru);
4537         } else {
4538                 disable_dmar_iommu(iommu);
4539                 free_dmar_iommu(iommu);
4540         }
4541
4542         return ret;
4543 }
4544
4545 static void intel_iommu_free_dmars(void)
4546 {
4547         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4548         struct dmar_atsr_unit *atsru, *atsr_n;
4549
4550         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4551                 list_del(&rmrru->list);
4552                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4553                 kfree(rmrru);
4554         }
4555
4556         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4557                 list_del(&atsru->list);
4558                 intel_iommu_free_atsr(atsru);
4559         }
4560 }
4561
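/*
 * Decide whether ATS may be used for @dev: walk up to its root port and
 * check it against the cached ATSR units for the device's PCI segment.
 * Returns 1 if ATS is allowed (integrated device, matching ATSR scope or
 * an "include all" ATSR), 0 otherwise.
 */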
4562 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4563 {
4564         int i, ret = 1;
4565         struct pci_bus *bus;
4566         struct pci_dev *bridge = NULL;
4567         struct device *tmp;
4568         struct acpi_dmar_atsr *atsr;
4569         struct dmar_atsr_unit *atsru;
4570
4571         dev = pci_physfn(dev);
4572         for (bus = dev->bus; bus; bus = bus->parent) {
4573                 bridge = bus->self;
4574                 /* If it's an integrated device, allow ATS */
4575                 if (!bridge)
4576                         return 1;
4577                 /* Connected via non-PCIe: no ATS */
4578                 if (!pci_is_pcie(bridge) ||
4579                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4580                         return 0;
4581                 /* If we found the root port, look it up in the ATSR */
4582                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4583                         break;
4584         }
4585
4586         rcu_read_lock();
4587         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4588                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4589                 if (atsr->segment != pci_domain_nr(dev->bus))
4590                         continue;
4591
4592                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4593                         if (tmp == &bridge->dev)
4594                                 goto out;
4595
4596                 if (atsru->include_all)
4597                         goto out;
4598         }
4599         ret = 0;
4600 out:
4601         rcu_read_unlock();
4602
4603         return ret;
4604 }
4605
4606 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4607 {
4608         int ret;
4609         struct dmar_rmrr_unit *rmrru;
4610         struct dmar_atsr_unit *atsru;
4611         struct acpi_dmar_atsr *atsr;
4612         struct acpi_dmar_reserved_memory *rmrr;
4613
4614         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4615                 return 0;
4616
4617         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4618                 rmrr = container_of(rmrru->hdr,
4619                                     struct acpi_dmar_reserved_memory, header);
4620                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4621                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4622                                 ((void *)rmrr) + rmrr->header.length,
4623                                 rmrr->segment, rmrru->devices,
4624                                 rmrru->devices_cnt);
4625                         if (ret < 0)
4626                                 return ret;
4627                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4628                         dmar_remove_dev_scope(info, rmrr->segment,
4629                                 rmrru->devices, rmrru->devices_cnt);
4630                 }
4631         }
4632
4633         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4634                 if (atsru->include_all)
4635                         continue;
4636
4637                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4638                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4639                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4640                                         (void *)atsr + atsr->header.length,
4641                                         atsr->segment, atsru->devices,
4642                                         atsru->devices_cnt);
4643                         if (ret > 0)
4644                                 break;
4645                         else if (ret < 0)
4646                                 return ret;
4647                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4648                         if (dmar_remove_dev_scope(info, atsr->segment,
4649                                         atsru->devices, atsru->devices_cnt))
4650                                 break;
4651                 }
4652         }
4653
4654         return 0;
4655 }
4656
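/*
 * Keep the si_domain identity map in sync with memory hotplug: newly
 * onlined ranges are added to the 1:1 map, while ranges that go away have
 * their mappings torn down, their IOVAs released and the IOTLBs flushed.
 */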
4657 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4658                                        unsigned long val, void *v)
4659 {
4660         struct memory_notify *mhp = v;
4661         unsigned long long start, end;
4662         unsigned long start_vpfn, last_vpfn;
4663
4664         switch (val) {
4665         case MEM_GOING_ONLINE:
4666                 start = mhp->start_pfn << PAGE_SHIFT;
4667                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4668                 if (iommu_domain_identity_map(si_domain, start, end)) {
4669                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4670                                 start, end);
4671                         return NOTIFY_BAD;
4672                 }
4673                 break;
4674
4675         case MEM_OFFLINE:
4676         case MEM_CANCEL_ONLINE:
4677                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4678                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4679                 while (start_vpfn <= last_vpfn) {
4680                         struct iova *iova;
4681                         struct dmar_drhd_unit *drhd;
4682                         struct intel_iommu *iommu;
4683                         struct page *freelist;
4684
4685                         iova = find_iova(&si_domain->iovad, start_vpfn);
4686                         if (iova == NULL) {
4687                                 pr_debug("Failed get IOVA for PFN %lx\n",
4688                                          start_vpfn);
4689                                 break;
4690                         }
4691
4692                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4693                                                      start_vpfn, last_vpfn);
4694                         if (iova == NULL) {
4695                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4696                                         start_vpfn, last_vpfn);
4697                                 return NOTIFY_BAD;
4698                         }
4699
4700                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4701                                                iova->pfn_hi);
4702
4703                         rcu_read_lock();
4704                         for_each_active_iommu(iommu, drhd)
4705                                 iommu_flush_iotlb_psi(iommu, si_domain,
4706                                         iova->pfn_lo, iova_size(iova),
4707                                         !freelist, 0);
4708                         rcu_read_unlock();
4709                         dma_free_pagelist(freelist);
4710
4711                         start_vpfn = iova->pfn_hi + 1;
4712                         free_iova_mem(iova);
4713                 }
4714                 break;
4715         }
4716
4717         return NOTIFY_OK;
4718 }
4719
4720 static struct notifier_block intel_iommu_memory_nb = {
4721         .notifier_call = intel_iommu_memory_notifier,
4722         .priority = 0
4723 };
4724
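/*
 * Release the per-CPU IOVA caches of every domain on every IOMMU for a CPU
 * that has gone offline (see intel_iommu_cpu_dead() below).
 */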
4725 static void free_all_cpu_cached_iovas(unsigned int cpu)
4726 {
4727         int i;
4728
4729         for (i = 0; i < g_num_of_iommus; i++) {
4730                 struct intel_iommu *iommu = g_iommus[i];
4731                 struct dmar_domain *domain;
4732                 int did;
4733
4734                 if (!iommu)
4735                         continue;
4736
4737                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4738                         domain = get_iommu_domain(iommu, (u16)did);
4739
4740                         if (!domain)
4741                                 continue;
4742                         free_cpu_cached_iovas(cpu, &domain->iovad);
4743                 }
4744         }
4745 }
4746
4747 static int intel_iommu_cpu_dead(unsigned int cpu)
4748 {
4749         free_all_cpu_cached_iovas(cpu);
4750         return 0;
4751 }
4752
4753 static void intel_disable_iommus(void)
4754 {
4755         struct intel_iommu *iommu = NULL;
4756         struct dmar_drhd_unit *drhd;
4757
4758         for_each_iommu(iommu, drhd)
4759                 iommu_disable_translation(iommu);
4760 }
4761
4762 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4763 {
4764         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4765
4766         return container_of(iommu_dev, struct intel_iommu, iommu);
4767 }
4768
4769 static ssize_t intel_iommu_show_version(struct device *dev,
4770                                         struct device_attribute *attr,
4771                                         char *buf)
4772 {
4773         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4774         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4775         return sprintf(buf, "%d:%d\n",
4776                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4777 }
4778 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4779
4780 static ssize_t intel_iommu_show_address(struct device *dev,
4781                                         struct device_attribute *attr,
4782                                         char *buf)
4783 {
4784         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4785         return sprintf(buf, "%llx\n", iommu->reg_phys);
4786 }
4787 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4788
4789 static ssize_t intel_iommu_show_cap(struct device *dev,
4790                                     struct device_attribute *attr,
4791                                     char *buf)
4792 {
4793         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4794         return sprintf(buf, "%llx\n", iommu->cap);
4795 }
4796 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4797
4798 static ssize_t intel_iommu_show_ecap(struct device *dev,
4799                                     struct device_attribute *attr,
4800                                     char *buf)
4801 {
4802         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4803         return sprintf(buf, "%llx\n", iommu->ecap);
4804 }
4805 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4806
4807 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4808                                       struct device_attribute *attr,
4809                                       char *buf)
4810 {
4811         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4812         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4813 }
4814 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4815
4816 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4817                                            struct device_attribute *attr,
4818                                            char *buf)
4819 {
4820         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4821         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4822                                                   cap_ndoms(iommu->cap)));
4823 }
4824 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4825
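/*
 * The attributes above are exposed through sysfs, typically under
 * /sys/class/iommu/dmar<N>/intel-iommu/, once the units are registered
 * via iommu_device_sysfs_add() in intel_iommu_init().
 */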
4826 static struct attribute *intel_iommu_attrs[] = {
4827         &dev_attr_version.attr,
4828         &dev_attr_address.attr,
4829         &dev_attr_cap.attr,
4830         &dev_attr_ecap.attr,
4831         &dev_attr_domains_supported.attr,
4832         &dev_attr_domains_used.attr,
4833         NULL,
4834 };
4835
4836 static struct attribute_group intel_iommu_group = {
4837         .name = "intel-iommu",
4838         .attrs = intel_iommu_attrs,
4839 };
4840
4841 const struct attribute_group *intel_iommu_groups[] = {
4842         &intel_iommu_group,
4843         NULL,
4844 };
4845
4846 static inline bool has_untrusted_dev(void)
4847 {
4848         struct pci_dev *pdev = NULL;
4849
4850         for_each_pci_dev(pdev)
4851                 if (pdev->untrusted)
4852                         return true;
4853
4854         return false;
4855 }
4856
4857 static int __init platform_optin_force_iommu(void)
4858 {
4859         if (!dmar_platform_optin() || no_platform_optin || !has_untrusted_dev())
4860                 return 0;
4861
4862         if (no_iommu || dmar_disabled)
4863                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4864
4865         /*
4866          * If Intel-IOMMU is disabled by default, we will apply identity
4867          * map for all devices except those marked as being untrusted.
4868          */
4869         if (dmar_disabled)
4870                 iommu_identity_mapping |= IDENTMAP_ALL;
4871
4872         dmar_disabled = 0;
4873         no_iommu = 0;
4874
4875         return 1;
4876 }
4877
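/*
 * Probe ACPI namespace devices listed in the DRHD device scopes: for every
 * physical node of such an ACPI device that is not yet in an IOMMU group,
 * install intel_iommu_ops on its bus and probe it.
 */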
4878 static int __init probe_acpi_namespace_devices(void)
4879 {
4880         struct dmar_drhd_unit *drhd;
4881         /* To avoid a -Wunused-but-set-variable warning. */
4882         struct intel_iommu *iommu __maybe_unused;
4883         struct device *dev;
4884         int i, ret = 0;
4885
4886         for_each_active_iommu(iommu, drhd) {
4887                 for_each_active_dev_scope(drhd->devices,
4888                                           drhd->devices_cnt, i, dev) {
4889                         struct acpi_device_physical_node *pn;
4890                         struct iommu_group *group;
4891                         struct acpi_device *adev;
4892
4893                         if (dev->bus != &acpi_bus_type)
4894                                 continue;
4895
4896                         adev = to_acpi_device(dev);
4897                         mutex_lock(&adev->physical_node_lock);
4898                         list_for_each_entry(pn,
4899                                             &adev->physical_node_list, node) {
4900                                 group = iommu_group_get(pn->dev);
4901                                 if (group) {
4902                                         iommu_group_put(group);
4903                                         continue;
4904                                 }
4905
4906                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4907                                 ret = iommu_probe_device(pn->dev);
4908                                 if (ret)
4909                                         break;
4910                         }
4911                         mutex_unlock(&adev->physical_node_lock);
4912
4913                         if (ret)
4914                                 return ret;
4915                 }
4916         }
4917
4918         return 0;
4919 }
4920
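/*
 * Entry point for DMA-remapping initialization: parse the DMAR table and
 * device scopes, set up the DMAR units via init_dmars(), register sysfs
 * entries and the bus, memory and CPU-hotplug notifiers, and finally
 * enable translation on every non-ignored IOMMU.
 */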
4921 int __init intel_iommu_init(void)
4922 {
4923         int ret = -ENODEV;
4924         struct dmar_drhd_unit *drhd;
4925         struct intel_iommu *iommu;
4926
4927         /*
4928          * Intel IOMMU is required for a TXT/tboot launch or platform
4929          * opt in, so enforce that.
4930          */
4931         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4932
4933         if (iommu_init_mempool()) {
4934                 if (force_on)
4935                         panic("tboot: Failed to initialize iommu memory\n");
4936                 return -ENOMEM;
4937         }
4938
4939         down_write(&dmar_global_lock);
4940         if (dmar_table_init()) {
4941                 if (force_on)
4942                         panic("tboot: Failed to initialize DMAR table\n");
4943                 goto out_free_dmar;
4944         }
4945
4946         if (dmar_dev_scope_init() < 0) {
4947                 if (force_on)
4948                         panic("tboot: Failed to initialize DMAR device scope\n");
4949                 goto out_free_dmar;
4950         }
4951
4952         up_write(&dmar_global_lock);
4953
4954         /*
4955          * The bus notifier takes the dmar_global_lock, so lockdep will
4956          * complain later when we register it under the lock.
4957          */
4958         dmar_register_bus_notifier();
4959
4960         down_write(&dmar_global_lock);
4961
4962         if (no_iommu || dmar_disabled) {
4963                 /*
4964                  * We exit the function here to ensure IOMMU's remapping and
4965                  * mempool aren't set up, which means that the IOMMU's PMRs
4966                  * won't be disabled via the call to init_dmars(). So disable
4967                  * them explicitly here. The PMRs were set up by tboot prior to
4968                  * calling SENTER, but the kernel is expected to reset/tear
4969                  * down the PMRs.
4970                  */
4971                 if (intel_iommu_tboot_noforce) {
4972                         for_each_iommu(iommu, drhd)
4973                                 iommu_disable_protect_mem_regions(iommu);
4974                 }
4975
4976                 /*
4977                  * Make sure the IOMMUs are switched off, even when we
4978                  * boot into a kexec kernel and the previous kernel left
4979                  * them enabled
4980                  */
4981                 intel_disable_iommus();
4982                 goto out_free_dmar;
4983         }
4984
4985         if (list_empty(&dmar_rmrr_units))
4986                 pr_info("No RMRR found\n");
4987
4988         if (list_empty(&dmar_atsr_units))
4989                 pr_info("No ATSR found\n");
4990
4991         if (dmar_init_reserved_ranges()) {
4992                 if (force_on)
4993                         panic("tboot: Failed to reserve iommu ranges\n");
4994                 goto out_free_reserved_range;
4995         }
4996
4997         if (dmar_map_gfx)
4998                 intel_iommu_gfx_mapped = 1;
4999
5000         init_no_remapping_devices();
5001
5002         ret = init_dmars();
5003         if (ret) {
5004                 if (force_on)
5005                         panic("tboot: Failed to initialize DMARs\n");
5006                 pr_err("Initialization failed\n");
5007                 goto out_free_reserved_range;
5008         }
5009         up_write(&dmar_global_lock);
5010
5011 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
5012         /*
5013          * If the system has no untrusted device or the user has decided
5014          * to disable the bounce page mechanism, we don't need swiotlb.
5015          * Mark it here so that the pre-allocated bounce pages are
5016          * released later.
5017          */
5018         if (!has_untrusted_dev() || intel_no_bounce)
5019                 swiotlb = 0;
5020 #endif
5021         dma_ops = &intel_dma_ops;
5022
5023         init_iommu_pm_ops();
5024
5025         for_each_active_iommu(iommu, drhd) {
5026                 iommu_device_sysfs_add(&iommu->iommu, NULL,
5027                                        intel_iommu_groups,
5028                                        "%s", iommu->name);
5029                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
5030                 iommu_device_register(&iommu->iommu);
5031         }
5032
5033         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
5034         if (si_domain && !hw_pass_through)
5035                 register_memory_notifier(&intel_iommu_memory_nb);
5036         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
5037                           intel_iommu_cpu_dead);
5038
5039         down_read(&dmar_global_lock);
5040         if (probe_acpi_namespace_devices())
5041                 pr_warn("ACPI name space devices didn't probe correctly\n");
5042         up_read(&dmar_global_lock);
5043
5044         /* Finally, we enable the DMA remapping hardware. */
5045         for_each_iommu(iommu, drhd) {
5046                 if (!drhd->ignored && !translation_pre_enabled(iommu))
5047                         iommu_enable_translation(iommu);
5048
5049                 iommu_disable_protect_mem_regions(iommu);
5050         }
5051         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
5052
5053         intel_iommu_enabled = 1;
5054         intel_iommu_debugfs_init();
5055
5056         return 0;
5057
5058 out_free_reserved_range:
5059         put_iova_domain(&reserved_iova_list);
5060 out_free_dmar:
5061         intel_iommu_free_dmars();
5062         up_write(&dmar_global_lock);
5063         iommu_exit_mempool();
5064         return ret;
5065 }
5066
5067 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
5068 {
5069         struct intel_iommu *iommu = opaque;
5070
5071         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
5072         return 0;
5073 }
5074
5075 /*
5076  * NB - intel-iommu lacks any sort of reference counting for the users of
5077  * dependent devices.  If multiple endpoints have intersecting dependent
5078  * devices, unbinding the driver from any one of them will possibly leave
5079  * the others unable to operate.
5080  */
5081 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
5082 {
5083         if (!iommu || !dev || !dev_is_pci(dev))
5084                 return;
5085
5086         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
5087 }
5088
5089 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
5090 {
5091         struct dmar_domain *domain;
5092         struct intel_iommu *iommu;
5093         unsigned long flags;
5094
5095         assert_spin_locked(&device_domain_lock);
5096
5097         if (WARN_ON(!info))
5098                 return;
5099
5100         iommu = info->iommu;
5101         domain = info->domain;
5102
5103         if (info->dev) {
5104                 if (dev_is_pci(info->dev) && sm_supported(iommu))
5105                         intel_pasid_tear_down_entry(iommu, info->dev,
5106                                         PASID_RID2PASID);
5107
5108                 iommu_disable_dev_iotlb(info);
5109                 domain_context_clear(iommu, info->dev);
5110                 intel_pasid_free_table(info->dev);
5111         }
5112
5113         unlink_domain_info(info);
5114
5115         spin_lock_irqsave(&iommu->lock, flags);
5116         domain_detach_iommu(domain, iommu);
5117         spin_unlock_irqrestore(&iommu->lock, flags);
5118
5119         /* free the private domain */
5120         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
5121             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
5122             list_empty(&domain->devices))
5123                 domain_exit(info->domain);
5124
5125         free_devinfo_mem(info);
5126 }
5127
5128 static void dmar_remove_one_dev_info(struct device *dev)
5129 {
5130         struct device_domain_info *info;
5131         unsigned long flags;
5132
5133         spin_lock_irqsave(&device_domain_lock, flags);
5134         info = dev->archdata.iommu;
5135         if (info)
5136                 __dmar_remove_one_dev_info(info);
5137         spin_unlock_irqrestore(&device_domain_lock, flags);
5138 }
5139
5140 static int md_domain_init(struct dmar_domain *domain, int guest_width)
5141 {
5142         int adjust_width;
5143
5144         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
5145         domain_reserve_special_ranges(domain);
5146
5147         /* calculate AGAW */
5148         domain->gaw = guest_width;
5149         adjust_width = guestwidth_to_adjustwidth(guest_width);
5150         domain->agaw = width_to_agaw(adjust_width);
5151
5152         domain->iommu_coherency = 0;
5153         domain->iommu_snooping = 0;
5154         domain->iommu_superpage = 0;
5155         domain->max_addr = 0;
5156
5157         /* always allocate the top pgd */
5158         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
5159         if (!domain->pgd)
5160                 return -ENOMEM;
5161         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
5162         return 0;
5163 }
5164
5165 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
5166 {
5167         struct dmar_domain *dmar_domain;
5168         struct iommu_domain *domain;
5169
5170         switch (type) {
5171         case IOMMU_DOMAIN_DMA:
5172         /* fallthrough */
5173         case IOMMU_DOMAIN_UNMANAGED:
5174                 dmar_domain = alloc_domain(0);
5175                 if (!dmar_domain) {
5176                         pr_err("Can't allocate dmar_domain\n");
5177                         return NULL;
5178                 }
5179                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
5180                         pr_err("Domain initialization failed\n");
5181                         domain_exit(dmar_domain);
5182                         return NULL;
5183                 }
5184
5185                 if (type == IOMMU_DOMAIN_DMA &&
5186                     init_iova_flush_queue(&dmar_domain->iovad,
5187                                           iommu_flush_iova, iova_entry_free)) {
5188                         pr_warn("iova flush queue initialization failed\n");
5189                         intel_iommu_strict = 1;
5190                 }
5191
5192                 domain_update_iommu_cap(dmar_domain);
5193
5194                 domain = &dmar_domain->domain;
5195                 domain->geometry.aperture_start = 0;
5196                 domain->geometry.aperture_end   =
5197                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
5198                 domain->geometry.force_aperture = true;
5199
5200                 return domain;
5201         case IOMMU_DOMAIN_IDENTITY:
5202                 return &si_domain->domain;
5203         default:
5204                 return NULL;
5205         }
5206
5207         return NULL;
5208 }
5209
5210 static void intel_iommu_domain_free(struct iommu_domain *domain)
5211 {
5212         if (domain != &si_domain->domain)
5213                 domain_exit(to_dmar_domain(domain));
5214 }
5215
5216 /*
5217  * Check whether a @domain could be attached to the @dev through the
5218  * aux-domain attach/detach APIs.
5219  */
5220 static inline bool
5221 is_aux_domain(struct device *dev, struct iommu_domain *domain)
5222 {
5223         struct device_domain_info *info = dev->archdata.iommu;
5224
5225         return info && info->auxd_enabled &&
5226                         domain->type == IOMMU_DOMAIN_UNMANAGED;
5227 }
5228
5229 static void auxiliary_link_device(struct dmar_domain *domain,
5230                                   struct device *dev)
5231 {
5232         struct device_domain_info *info = dev->archdata.iommu;
5233
5234         assert_spin_locked(&device_domain_lock);
5235         if (WARN_ON(!info))
5236                 return;
5237
5238         domain->auxd_refcnt++;
5239         list_add(&domain->auxd, &info->auxiliary_domains);
5240 }
5241
5242 static void auxiliary_unlink_device(struct dmar_domain *domain,
5243                                     struct device *dev)
5244 {
5245         struct device_domain_info *info = dev->archdata.iommu;
5246
5247         assert_spin_locked(&device_domain_lock);
5248         if (WARN_ON(!info))
5249                 return;
5250
5251         list_del(&domain->auxd);
5252         domain->auxd_refcnt--;
5253
5254         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5255                 intel_pasid_free_id(domain->default_pasid);
5256 }
5257
5258 static int aux_domain_add_dev(struct dmar_domain *domain,
5259                               struct device *dev)
5260 {
5261         int ret;
5262         u8 bus, devfn;
5263         unsigned long flags;
5264         struct intel_iommu *iommu;
5265
5266         iommu = device_to_iommu(dev, &bus, &devfn);
5267         if (!iommu)
5268                 return -ENODEV;
5269
5270         if (domain->default_pasid <= 0) {
5271                 int pasid;
5272
5273                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5274                                              pci_max_pasids(to_pci_dev(dev)),
5275                                              GFP_KERNEL);
5276                 if (pasid <= 0) {
5277                         pr_err("Can't allocate default pasid\n");
5278                         return -ENODEV;
5279                 }
5280                 domain->default_pasid = pasid;
5281         }
5282
5283         spin_lock_irqsave(&device_domain_lock, flags);
5284         /*
5285          * iommu->lock must be held to attach domain to iommu and setup the
5286          * pasid entry for second level translation.
5287          */
5288         spin_lock(&iommu->lock);
5289         ret = domain_attach_iommu(domain, iommu);
5290         if (ret)
5291                 goto attach_failed;
5292
5293         /* Set up the PASID entry for mediated devices. */
5294         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5295                                              domain->default_pasid);
5296         if (ret)
5297                 goto table_failed;
5298         spin_unlock(&iommu->lock);
5299
5300         auxiliary_link_device(domain, dev);
5301
5302         spin_unlock_irqrestore(&device_domain_lock, flags);
5303
5304         return 0;
5305
5306 table_failed:
5307         domain_detach_iommu(domain, iommu);
5308 attach_failed:
5309         spin_unlock(&iommu->lock);
5310         spin_unlock_irqrestore(&device_domain_lock, flags);
5311         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5312                 intel_pasid_free_id(domain->default_pasid);
5313
5314         return ret;
5315 }
5316
5317 static void aux_domain_remove_dev(struct dmar_domain *domain,
5318                                   struct device *dev)
5319 {
5320         struct device_domain_info *info;
5321         struct intel_iommu *iommu;
5322         unsigned long flags;
5323
5324         if (!is_aux_domain(dev, &domain->domain))
5325                 return;
5326
5327         spin_lock_irqsave(&device_domain_lock, flags);
5328         info = dev->archdata.iommu;
5329         iommu = info->iommu;
5330
5331         auxiliary_unlink_device(domain, dev);
5332
5333         spin_lock(&iommu->lock);
5334         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5335         domain_detach_iommu(domain, iommu);
5336         spin_unlock(&iommu->lock);
5337
5338         spin_unlock_irqrestore(&device_domain_lock, flags);
5339 }
5340
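/*
 * Check that @dev's IOMMU can address everything already mapped in the
 * domain and, if the IOMMU supports fewer page-table levels than the
 * domain currently uses, strip the unused top levels of the page table.
 */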
5341 static int prepare_domain_attach_device(struct iommu_domain *domain,
5342                                         struct device *dev)
5343 {
5344         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5345         struct intel_iommu *iommu;
5346         int addr_width;
5347         u8 bus, devfn;
5348
5349         iommu = device_to_iommu(dev, &bus, &devfn);
5350         if (!iommu)
5351                 return -ENODEV;
5352
5353         /* check if this iommu agaw is sufficient for max mapped address */
5354         addr_width = agaw_to_width(iommu->agaw);
5355         if (addr_width > cap_mgaw(iommu->cap))
5356                 addr_width = cap_mgaw(iommu->cap);
5357
5358         if (dmar_domain->max_addr > (1LL << addr_width)) {
5359                 dev_err(dev, "%s: iommu width (%d) is not "
5360                         "sufficient for the mapped address (%llx)\n",
5361                         __func__, addr_width, dmar_domain->max_addr);
5362                 return -EFAULT;
5363         }
5364         dmar_domain->gaw = addr_width;
5365
5366         /*
5367          * Knock out extra levels of page tables if necessary
5368          */
5369         while (iommu->agaw < dmar_domain->agaw) {
5370                 struct dma_pte *pte;
5371
5372                 pte = dmar_domain->pgd;
5373                 if (dma_pte_present(pte)) {
5374                         dmar_domain->pgd = (struct dma_pte *)
5375                                 phys_to_virt(dma_pte_addr(pte));
5376                         free_pgtable_page(pte);
5377                 }
5378                 dmar_domain->agaw--;
5379         }
5380
5381         return 0;
5382 }
5383
5384 static int intel_iommu_attach_device(struct iommu_domain *domain,
5385                                      struct device *dev)
5386 {
5387         int ret;
5388
5389         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5390             device_is_rmrr_locked(dev)) {
5391                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5392                 return -EPERM;
5393         }
5394
5395         if (is_aux_domain(dev, domain))
5396                 return -EPERM;
5397
5398         /* normally dev is not mapped */
5399         if (unlikely(domain_context_mapped(dev))) {
5400                 struct dmar_domain *old_domain;
5401
5402                 old_domain = find_domain(dev);
5403                 if (old_domain)
5404                         dmar_remove_one_dev_info(dev);
5405         }
5406
5407         ret = prepare_domain_attach_device(domain, dev);
5408         if (ret)
5409                 return ret;
5410
5411         return domain_add_dev_info(to_dmar_domain(domain), dev);
5412 }
5413
5414 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5415                                          struct device *dev)
5416 {
5417         int ret;
5418
5419         if (!is_aux_domain(dev, domain))
5420                 return -EPERM;
5421
5422         ret = prepare_domain_attach_device(domain, dev);
5423         if (ret)
5424                 return ret;
5425
5426         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5427 }
5428
5429 static void intel_iommu_detach_device(struct iommu_domain *domain,
5430                                       struct device *dev)
5431 {
5432         dmar_remove_one_dev_info(dev);
5433 }
5434
5435 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5436                                           struct device *dev)
5437 {
5438         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5439 }
5440
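/*
 * Map [iova, iova + size) to hpa for the IOMMU core: IOMMU_READ/WRITE/CACHE
 * are translated into the corresponding DMA PTE bits, the domain's max_addr
 * is grown if needed, and the range is installed via domain_pfn_mapping().
 */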
5441 static int intel_iommu_map(struct iommu_domain *domain,
5442                            unsigned long iova, phys_addr_t hpa,
5443                            size_t size, int iommu_prot)
5444 {
5445         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5446         u64 max_addr;
5447         int prot = 0;
5448         int ret;
5449
5450         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5451                 return -EINVAL;
5452
5453         if (iommu_prot & IOMMU_READ)
5454                 prot |= DMA_PTE_READ;
5455         if (iommu_prot & IOMMU_WRITE)
5456                 prot |= DMA_PTE_WRITE;
5457         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5458                 prot |= DMA_PTE_SNP;
5459
5460         max_addr = iova + size;
5461         if (dmar_domain->max_addr < max_addr) {
5462                 u64 end;
5463
5464                 /* check if minimum agaw is sufficient for mapped address */
5465                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5466                 if (end < max_addr) {
5467                         pr_err("%s: iommu width (%d) is not "
5468                                "sufficient for the mapped address (%llx)\n",
5469                                __func__, dmar_domain->gaw, max_addr);
5470                         return -EFAULT;
5471                 }
5472                 dmar_domain->max_addr = max_addr;
5473         }
5474         /* Round up size to the next multiple of VTD_PAGE_SIZE, if it and
5475            the low bits of hpa would take us onto the next page. */
5476         size = aligned_nrpages(hpa, size);
5477         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5478                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5479         return ret;
5480 }
5481
5482 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5483                                 unsigned long iova, size_t size,
5484                                 struct iommu_iotlb_gather *gather)
5485 {
5486         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5487         struct page *freelist = NULL;
5488         unsigned long start_pfn, last_pfn;
5489         unsigned int npages;
5490         int iommu_id, level = 0;
5491
5492         /* Cope with horrid API which requires us to unmap more than the
5493            size argument if it happens to be a large-page mapping. */
5494         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5495         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5496                 return 0;
5497
5498         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5499                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5500
5501         start_pfn = iova >> VTD_PAGE_SHIFT;
5502         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5503
5504         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5505
5506         npages = last_pfn - start_pfn + 1;
5507
5508         for_each_domain_iommu(iommu_id, dmar_domain)
5509                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5510                                       start_pfn, npages, !freelist, 0);
5511
5512         dma_free_pagelist(freelist);
5513
5514         if (dmar_domain->max_addr == iova + size)
5515                 dmar_domain->max_addr = iova;
5516
5517         return size;
5518 }
5519
5520 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5521                                             dma_addr_t iova)
5522 {
5523         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5524         struct dma_pte *pte;
5525         int level = 0;
5526         u64 phys = 0;
5527
5528         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5529                 return 0;
5530
5531         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5532         if (pte)
5533                 phys = dma_pte_addr(pte);
5534
5535         return phys;
5536 }
5537
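/* True only if every active IOMMU advertises scalable mode support. */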
5538 static inline bool scalable_mode_support(void)
5539 {
5540         struct dmar_drhd_unit *drhd;
5541         struct intel_iommu *iommu;
5542         bool ret = true;
5543
5544         rcu_read_lock();
5545         for_each_active_iommu(iommu, drhd) {
5546                 if (!sm_supported(iommu)) {
5547                         ret = false;
5548                         break;
5549                 }
5550         }
5551         rcu_read_unlock();
5552
5553         return ret;
5554 }
5555
5556 static inline bool iommu_pasid_support(void)
5557 {
5558         struct dmar_drhd_unit *drhd;
5559         struct intel_iommu *iommu;
5560         bool ret = true;
5561
5562         rcu_read_lock();
5563         for_each_active_iommu(iommu, drhd) {
5564                 if (!pasid_supported(iommu)) {
5565                         ret = false;
5566                         break;
5567                 }
5568         }
5569         rcu_read_unlock();
5570
5571         return ret;
5572 }
5573
5574 static bool intel_iommu_capable(enum iommu_cap cap)
5575 {
5576         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5577                 return domain_update_iommu_snooping(NULL) == 1;
5578         if (cap == IOMMU_CAP_INTR_REMAP)
5579                 return irq_remapping_enabled == 1;
5580
5581         return false;
5582 }
5583
5584 static int intel_iommu_add_device(struct device *dev)
5585 {
5586         struct dmar_domain *dmar_domain;
5587         struct iommu_domain *domain;
5588         struct intel_iommu *iommu;
5589         struct iommu_group *group;
5590         u8 bus, devfn;
5591         int ret;
5592
5593         iommu = device_to_iommu(dev, &bus, &devfn);
5594         if (!iommu)
5595                 return -ENODEV;
5596
5597         iommu_device_link(&iommu->iommu, dev);
5598
5599         if (translation_pre_enabled(iommu))
5600                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5601
5602         group = iommu_group_get_for_dev(dev);
5603
5604         if (IS_ERR(group))
5605                 return PTR_ERR(group);
5606
5607         iommu_group_put(group);
5608
5609         domain = iommu_get_domain_for_dev(dev);
5610         dmar_domain = to_dmar_domain(domain);
5611         if (domain->type == IOMMU_DOMAIN_DMA) {
5612                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5613                         ret = iommu_request_dm_for_dev(dev);
5614                         if (ret) {
5615                                 dmar_remove_one_dev_info(dev);
5616                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5617                                 domain_add_dev_info(si_domain, dev);
5618                                 dev_info(dev,
5619                                          "Device uses a private identity domain.\n");
5620                         }
5621                 }
5622         } else {
5623                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5624                         ret = iommu_request_dma_domain_for_dev(dev);
5625                         if (ret) {
5626                                 dmar_remove_one_dev_info(dev);
5627                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5628                                 if (!get_private_domain_for_dev(dev)) {
5629                                         dev_warn(dev,
5630                                                  "Failed to get a private domain.\n");
5631                                         return -ENOMEM;
5632                                 }
5633
5634                                 dev_info(dev,
5635                                          "Device uses a private dma domain.\n");
5636                         }
5637                 }
5638         }
5639
5640         if (device_needs_bounce(dev)) {
5641                 dev_info(dev, "Use Intel IOMMU bounce page dma_ops\n");
5642                 set_dma_ops(dev, &bounce_dma_ops);
5643         }
5644
5645         return 0;
5646 }
5647
5648 static void intel_iommu_remove_device(struct device *dev)
5649 {
5650         struct intel_iommu *iommu;
5651         u8 bus, devfn;
5652
5653         iommu = device_to_iommu(dev, &bus, &devfn);
5654         if (!iommu)
5655                 return;
5656
5657         dmar_remove_one_dev_info(dev);
5658
5659         iommu_group_remove_device(dev);
5660
5661         iommu_device_unlink(&iommu->iommu, dev);
5662
5663         if (device_needs_bounce(dev))
5664                 set_dma_ops(dev, NULL);
5665 }
5666
5667 static void intel_iommu_get_resv_regions(struct device *device,
5668                                          struct list_head *head)
5669 {
5670         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5671         struct iommu_resv_region *reg;
5672         struct dmar_rmrr_unit *rmrr;
5673         struct device *i_dev;
5674         int i;
5675
5676         down_read(&dmar_global_lock);
5677         for_each_rmrr_units(rmrr) {
5678                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5679                                           i, i_dev) {
5680                         struct iommu_resv_region *resv;
5681                         enum iommu_resv_type type;
5682                         size_t length;
5683
5684                         if (i_dev != device &&
5685                             !is_downstream_to_pci_bridge(device, i_dev))
5686                                 continue;
5687
5688                         length = rmrr->end_address - rmrr->base_address + 1;
5689
5690                         type = device_rmrr_is_relaxable(device) ?
5691                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5692
5693                         resv = iommu_alloc_resv_region(rmrr->base_address,
5694                                                        length, prot, type);
5695                         if (!resv)
5696                                 break;
5697
5698                         list_add_tail(&resv->list, head);
5699                 }
5700         }
5701         up_read(&dmar_global_lock);
5702
5703 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5704         if (dev_is_pci(device)) {
5705                 struct pci_dev *pdev = to_pci_dev(device);
5706
5707                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5708                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5709                                                       IOMMU_RESV_DIRECT);
5710                         if (reg)
5711                                 list_add_tail(&reg->list, head);
5712                 }
5713         }
5714 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5715
5716         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5717                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5718                                       0, IOMMU_RESV_MSI);
5719         if (!reg)
5720                 return;
5721         list_add_tail(&reg->list, head);
5722 }
5723
5724 static void intel_iommu_put_resv_regions(struct device *dev,
5725                                          struct list_head *head)
5726 {
5727         struct iommu_resv_region *entry, *next;
5728
5729         list_for_each_entry_safe(entry, next, head, list)
5730                 kfree(entry);
5731 }
5732
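/*
 * Turn on PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache if it changed) and enable PASID/ATS on
 * the device itself via iommu_enable_dev_iotlb().
 */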
5733 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5734 {
5735         struct device_domain_info *info;
5736         struct context_entry *context;
5737         struct dmar_domain *domain;
5738         unsigned long flags;
5739         u64 ctx_lo;
5740         int ret;
5741
5742         domain = find_domain(dev);
5743         if (!domain)
5744                 return -EINVAL;
5745
5746         spin_lock_irqsave(&device_domain_lock, flags);
5747         spin_lock(&iommu->lock);
5748
5749         ret = -EINVAL;
5750         info = dev->archdata.iommu;
5751         if (!info || !info->pasid_supported)
5752                 goto out;
5753
5754         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5755         if (WARN_ON(!context))
5756                 goto out;
5757
5758         ctx_lo = context[0].lo;
5759
5760         if (!(ctx_lo & CONTEXT_PASIDE)) {
5761                 ctx_lo |= CONTEXT_PASIDE;
5762                 context[0].lo = ctx_lo;
5763                 wmb();
5764                 iommu->flush.flush_context(iommu,
5765                                            domain->iommu_did[iommu->seq_id],
5766                                            PCI_DEVID(info->bus, info->devfn),
5767                                            DMA_CCMD_MASK_NOBIT,
5768                                            DMA_CCMD_DEVICE_INVL);
5769         }
5770
5771         /* Enable PASID support in the device, if it wasn't already */
5772         if (!info->pasid_enabled)
5773                 iommu_enable_dev_iotlb(info);
5774
5775         ret = 0;
5776
5777  out:
5778         spin_unlock(&iommu->lock);
5779         spin_unlock_irqrestore(&device_domain_lock, flags);
5780
5781         return ret;
5782 }
5783
5784 static void intel_iommu_apply_resv_region(struct device *dev,
5785                                           struct iommu_domain *domain,
5786                                           struct iommu_resv_region *region)
5787 {
5788         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5789         unsigned long start, end;
5790
5791         start = IOVA_PFN(region->start);
5792         end   = IOVA_PFN(region->start + region->length - 1);
5793
5794         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5795 }
5796
5797 #ifdef CONFIG_INTEL_IOMMU_SVM
5798 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5799 {
5800         struct intel_iommu *iommu;
5801         u8 bus, devfn;
5802
5803         if (iommu_dummy(dev)) {
5804                 dev_warn(dev,
5805                          "No IOMMU translation for device; cannot enable SVM\n");
5806                 return NULL;
5807         }
5808
5809         iommu = device_to_iommu(dev, &bus, &devfn);
5810         if (!iommu) {
5811                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5812                 return NULL;
5813         }
5814
5815         return iommu;
5816 }
5817 #endif /* CONFIG_INTEL_IOMMU_SVM */
5818
5819 static int intel_iommu_enable_auxd(struct device *dev)
5820 {
5821         struct device_domain_info *info;
5822         struct intel_iommu *iommu;
5823         unsigned long flags;
5824         u8 bus, devfn;
5825         int ret;
5826
5827         iommu = device_to_iommu(dev, &bus, &devfn);
5828         if (!iommu || dmar_disabled)
5829                 return -EINVAL;
5830
5831         if (!sm_supported(iommu) || !pasid_supported(iommu))
5832                 return -EINVAL;
5833
5834         ret = intel_iommu_enable_pasid(iommu, dev);
5835         if (ret)
5836                 return -ENODEV;
5837
5838         spin_lock_irqsave(&device_domain_lock, flags);
5839         info = dev->archdata.iommu;
5840         info->auxd_enabled = 1;
5841         spin_unlock_irqrestore(&device_domain_lock, flags);
5842
5843         return 0;
5844 }
5845
5846 static int intel_iommu_disable_auxd(struct device *dev)
5847 {
5848         struct device_domain_info *info;
5849         unsigned long flags;
5850
5851         spin_lock_irqsave(&device_domain_lock, flags);
5852         info = dev->archdata.iommu;
5853         if (!WARN_ON(!info))
5854                 info->auxd_enabled = 0;
5855         spin_unlock_irqrestore(&device_domain_lock, flags);
5856
5857         return 0;
5858 }
5859
5860 /*
5861  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5862  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5863  * specification so that system software and tools can detect endpoint
5864  * devices supporting Intel Scalable I/O Virtualization without any host
5865  * driver dependency.
5866  *
5867  * Returns the offset of the matching extended capability structure within
5868  * the device's PCI configuration space, or 0 if the device does not support it.
5869  */
5870 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5871 {
5872         int pos;
5873         u16 vendor, id;
5874
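        /*
         * 0x23 is the Designated Vendor-Specific Extended Capability ID;
         * the words at offsets 4 and 8 of the capability hold the DVSEC
         * vendor ID and DVSEC ID, and ID 5 with the Intel vendor ID marks
         * the Scalable IOV capability described above.
         */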
5875         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5876         while (pos) {
5877                 pci_read_config_word(pdev, pos + 4, &vendor);
5878                 pci_read_config_word(pdev, pos + 8, &id);
5879                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5880                         return pos;
5881
5882                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5883         }
5884
5885         return 0;
5886 }
5887
5888 static bool
5889 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5890 {
5891         if (feat == IOMMU_DEV_FEAT_AUX) {
5892                 int ret;
5893
5894                 if (!dev_is_pci(dev) || dmar_disabled ||
5895                     !scalable_mode_support() || !iommu_pasid_support())
5896                         return false;
5897
5898                 ret = pci_pasid_features(to_pci_dev(dev));
5899                 if (ret < 0)
5900                         return false;
5901
5902                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5903         }
5904
5905         return false;
5906 }
5907
5908 static int
5909 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5910 {
5911         if (feat == IOMMU_DEV_FEAT_AUX)
5912                 return intel_iommu_enable_auxd(dev);
5913
5914         return -ENODEV;
5915 }
5916
5917 static int
5918 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5919 {
5920         if (feat == IOMMU_DEV_FEAT_AUX)
5921                 return intel_iommu_disable_auxd(dev);
5922
5923         return -ENODEV;
5924 }
5925
5926 static bool
5927 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5928 {
5929         struct device_domain_info *info = dev->archdata.iommu;
5930
5931         if (feat == IOMMU_DEV_FEAT_AUX)
5932                 return scalable_mode_support() && info && info->auxd_enabled;
5933
5934         return false;
5935 }
5936
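/*
 * Returns the default PASID associated with the aux domain, or -EINVAL
 * if none has been allocated for it yet.
 */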
5937 static int
5938 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5939 {
5940         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5941
5942         return dmar_domain->default_pasid > 0 ?
5943                         dmar_domain->default_pasid : -EINVAL;
5944 }
5945
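/*
 * The IOMMU core checks this before attaching a device; a device whose
 * archdata still holds the DEFER_DEVICE_DOMAIN_INFO marker has its
 * attachment deferred rather than performed at this point.
 */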
5946 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5947                                            struct device *dev)
5948 {
5949         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5950 }
5951
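/* The iommu_ops callbacks this driver exposes to the generic IOMMU core. */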
5952 const struct iommu_ops intel_iommu_ops = {
5953         .capable                = intel_iommu_capable,
5954         .domain_alloc           = intel_iommu_domain_alloc,
5955         .domain_free            = intel_iommu_domain_free,
5956         .attach_dev             = intel_iommu_attach_device,
5957         .detach_dev             = intel_iommu_detach_device,
5958         .aux_attach_dev         = intel_iommu_aux_attach_device,
5959         .aux_detach_dev         = intel_iommu_aux_detach_device,
5960         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5961         .map                    = intel_iommu_map,
5962         .unmap                  = intel_iommu_unmap,
5963         .iova_to_phys           = intel_iommu_iova_to_phys,
5964         .add_device             = intel_iommu_add_device,
5965         .remove_device          = intel_iommu_remove_device,
5966         .get_resv_regions       = intel_iommu_get_resv_regions,
5967         .put_resv_regions       = intel_iommu_put_resv_regions,
5968         .apply_resv_region      = intel_iommu_apply_resv_region,
5969         .device_group           = pci_device_group,
5970         .dev_has_feat           = intel_iommu_dev_has_feat,
5971         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5972         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5973         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5974         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5975         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5976 };
5977
5978 static void quirk_iommu_igfx(struct pci_dev *dev)
5979 {
5980         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5981         dmar_map_gfx = 0;
5982 }
5983
5984 /* G4x/GM45 integrated gfx dmar support is totally busted. */
5985 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
5986 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
5987 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
5988 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
5989 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
5990 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
5991 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
5992
5993 /* Broadwell igfx malfunctions with dmar */
5994 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
5995 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
5996 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
5997 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
5998 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
5999 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
6000 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
6001 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
6002 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
6003 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
6004 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
6005 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
6006 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
6007 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
6008 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
6009 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
6010 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
6011 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
6012 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
6013 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
6014 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
6015 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
6016 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
6017 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
6018
6019 static void quirk_iommu_rwbf(struct pci_dev *dev)
6020 {
6021         /*
6022          * Mobile 4 Series Chipset neglects to set RWBF capability,
6023          * but needs it. Same seems to hold for the desktop versions.
6024          */
6025         pci_info(dev, "Forcing write-buffer flush capability\n");
6026         rwbf_quirk = 1;
6027 }
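/*
 * With rwbf_quirk set the driver behaves as if the RWBF capability bit
 * were reported, i.e. it flushes the write buffer before invalidations.
 */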
6028
6029 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
6030 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
6031 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
6032 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
6033 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
6034 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
6035 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
6036
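/*
 * GGC is the graphics memory control register (config offset 0x52) on the
 * host bridges quirked below; bits 11:8 report the stolen-memory size the
 * BIOS reserved for the GTT, with the high bit of that field
 * (GGC_MEMORY_VT_ENABLED) marking the VT-enabled (shadow GTT) variants.
 */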
6037 #define GGC 0x52
6038 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
6039 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
6040 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
6041 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
6042 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
6043 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
6044 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
6045 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
6046
6047 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
6048 {
6049         unsigned short ggc;
6050
6051         if (pci_read_config_word(dev, GGC, &ggc))
6052                 return;
6053
6054         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
6055                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
6056                 dmar_map_gfx = 0;
6057         } else if (dmar_map_gfx) {
6058                 /* we have to ensure the gfx device is idle before we flush */
6059                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
6060                 intel_iommu_strict = 1;
6061         }
6062 }
6063 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
6064 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
6065 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
6066 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
6067
6068 /* On Tylersburg chipsets, some BIOSes have been known to enable the
6069    ISOCH DMAR unit for the Azalia sound device, but not give it any
6070    TLB entries, which causes it to deadlock. Check for that.  We do
6071    this in a function called from init_dmars(), instead of in a PCI
6072    quirk, because we don't want to print the obnoxious "BIOS broken"
6073    message if VT-d is actually disabled.
6074 */
6075 static void __init check_tylersburg_isoch(void)
6076 {
6077         struct pci_dev *pdev;
6078         uint32_t vtisochctrl;
6079
6080         /* If there's no Azalia in the system anyway, forget it. */
6081         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
6082         if (!pdev)
6083                 return;
6084         pci_dev_put(pdev);
6085
6086         /* System Management Registers. Might be hidden, in which case
6087            we can't do the sanity check. But that's OK, because the
6088            known-broken BIOSes _don't_ actually hide it, so far. */
6089         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
6090         if (!pdev)
6091                 return;
6092
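        /* Offset 0x188 holds the VT isoch control settings (vtisochctrl) examined below. */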
6093         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
6094                 pci_dev_put(pdev);
6095                 return;
6096         }
6097
6098         pci_dev_put(pdev);
6099
6100         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
6101         if (vtisochctrl & 1)
6102                 return;
6103
6104         /* Drop all bits other than the number of TLB entries */
6105         vtisochctrl &= 0x1c;
6106
6107         /* If we have the recommended number of TLB entries (16), fine. */
6108         if (vtisochctrl == 0x10)
6109                 return;
6110
6111         /* Zero TLB entries? You get to ride the short bus to school. */
6112         if (!vtisochctrl) {
6113                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
6114                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
6115                      dmi_get_system_info(DMI_BIOS_VENDOR),
6116                      dmi_get_system_info(DMI_BIOS_VERSION),
6117                      dmi_get_system_info(DMI_PRODUCT_VERSION));
6118                 iommu_identity_mapping |= IDENTMAP_AZALIA;
6119                 return;
6120         }
6121
6122         pr_warn("The recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
6123                 vtisochctrl);
6124 }