drivers/iommu/intel-iommu.c
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
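/*
 * For example, with the default 57-bit address width this caps a domain
 * at 2^45 - 1 page frames, i.e. 2^57 - 1 bytes of IOVA space.
 */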
76
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN          (1)
79
80 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is a power-of-two number of 4KiB pages and that the
94  * mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are a power-of-two multiple of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
103
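/*
 * AGAW arithmetic used by the helpers below: an AGAW value of n
 * corresponds to a (30 + 9*n)-bit address width and an (n + 2)-level
 * page table, e.g. AGAW 2 -> 48-bit width -> 4 levels, and the default
 * 57-bit width maps to AGAW 3 with 5 levels.
 */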
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
148
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150    are never going to work. */
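/*
 * With 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12) the two
 * conversions below are identity operations; the shift only matters
 * when the CPU page size is larger than the 4KiB VT-d page.
 */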
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176  * set to 1 to panic the kernel if VT-d cannot be enabled successfully
177  * (used when the kernel is launched with TXT)
178  */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
182
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
184
185 /*
186  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
187  * if marked present.
188  */
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
190 {
191         if (!(re->lo & 1))
192                 return 0;
193
194         return re->lo & VTD_PAGE_MASK;
195 }
196
197 /*
198  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
202 {
203         if (!(re->hi & 1))
204                 return 0;
205
206         return re->hi & VTD_PAGE_MASK;
207 }
208
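/*
 * The helpers below manipulate individual fields of a context entry:
 * in the low word, bit 0 is the present bit, bits 3:2 hold the
 * translation type, bit 11 the PASID-enable flag and the page-aligned
 * upper bits the address root; in the high word, bits 2:0 carry the
 * address width, bit 3 the "copied" marker and bits 23:8 the domain id.
 */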
209 static inline void context_clear_pasid_enable(struct context_entry *context)
210 {
211         context->lo &= ~(1ULL << 11);
212 }
213
214 static inline bool context_pasid_enabled(struct context_entry *context)
215 {
216         return !!(context->lo & (1ULL << 11));
217 }
218
219 static inline void context_set_copied(struct context_entry *context)
220 {
221         context->hi |= (1ull << 3);
222 }
223
224 static inline bool context_copied(struct context_entry *context)
225 {
226         return !!(context->hi & (1ULL << 3));
227 }
228
229 static inline bool __context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233
234 bool context_present(struct context_entry *context)
235 {
236         return context_pasid_enabled(context) ?
237              __context_present(context) :
238              __context_present(context) && !context_copied(context);
239 }
240
241 static inline void context_set_present(struct context_entry *context)
242 {
243         context->lo |= 1;
244 }
245
246 static inline void context_set_fault_enable(struct context_entry *context)
247 {
248         context->lo &= (((u64)-1) << 2) | 1;
249 }
250
251 static inline void context_set_translation_type(struct context_entry *context,
252                                                 unsigned long value)
253 {
254         context->lo &= (((u64)-1) << 4) | 3;
255         context->lo |= (value & 3) << 2;
256 }
257
258 static inline void context_set_address_root(struct context_entry *context,
259                                             unsigned long value)
260 {
261         context->lo &= ~VTD_PAGE_MASK;
262         context->lo |= value & VTD_PAGE_MASK;
263 }
264
265 static inline void context_set_address_width(struct context_entry *context,
266                                              unsigned long value)
267 {
268         context->hi |= value & 7;
269 }
270
271 static inline void context_set_domain_id(struct context_entry *context,
272                                          unsigned long value)
273 {
274         context->hi |= (value & ((1 << 16) - 1)) << 8;
275 }
276
277 static inline int context_domain_id(struct context_entry *c)
278 {
279         return((c->hi >> 8) & 0xffff);
280 }
281
282 static inline void context_clear_entry(struct context_entry *context)
283 {
284         context->lo = 0;
285         context->hi = 0;
286 }
287
288 /*
289  * This domain is a static identity mapping domain.
290  *      1. This domain creates a static 1:1 mapping to all usable memory.
291  *      2. It maps to each iommu if successful.
292  *      3. Each iommu maps to this domain if successful.
293  */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
296
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
299
300 /*
301  * This is a DMA domain allocated through the iommu domain allocation
302  * interface. But one or more devices belonging to this domain have
303  * been chosen to use a private domain. We should avoid using the
304  * map/unmap/iova_to_phys APIs on it.
305  */
306 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319 };
320
321 struct dmar_atsr_unit {
322         struct list_head list;          /* list of ATSR units */
323         struct acpi_dmar_header *hdr;   /* ACPI header */
324         struct dmar_dev_scope *devices; /* target devices */
325         int devices_cnt;                /* target device count */
326         u8 include_all:1;               /* include all ports */
327 };
328
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331
332 #define for_each_rmrr_units(rmrr) \
333         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334
335 /* number of IOMMUs, used to size the g_iommus array */
336 static int g_num_of_iommus;
337
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
343                                  struct device *dev);
344 static int domain_detach_iommu(struct dmar_domain *domain,
345                                struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
348                                      struct device *dev);
349
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
352 #else
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355
356 int intel_iommu_sm;
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
359
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
365
366 #define IDENTMAP_ALL            1
367 #define IDENTMAP_GFX            2
368 #define IDENTMAP_AZALIA         4
369
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
377
378 /*
379  * Iterate over elements in device_domain_list and call the specified
380  * callback @fn against each element.
381  */
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383                                      void *data), void *data)
384 {
385         int ret = 0;
386         unsigned long flags;
387         struct device_domain_info *info;
388
389         spin_lock_irqsave(&device_domain_lock, flags);
390         list_for_each_entry(info, &device_domain_list, global) {
391                 ret = fn(info, data);
392                 if (ret) {
393                         spin_unlock_irqrestore(&device_domain_lock, flags);
394                         return ret;
395                 }
396         }
397         spin_unlock_irqrestore(&device_domain_lock, flags);
398
399         return 0;
400 }
401
402 const struct iommu_ops intel_iommu_ops;
403
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
405 {
406         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 }
408
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
410 {
411         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 }
413
414 static void init_translation_status(struct intel_iommu *iommu)
415 {
416         u32 gsts;
417
418         gsts = readl(iommu->reg + DMAR_GSTS_REG);
419         if (gsts & DMA_GSTS_TES)
420                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 }
422
423 /* Convert generic 'struct iommu_domain' to private 'struct dmar_domain' */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
425 {
426         return container_of(dom, struct dmar_domain, domain);
427 }
428
429 static int __init intel_iommu_setup(char *str)
430 {
431         if (!str)
432                 return -EINVAL;
433         while (*str) {
434                 if (!strncmp(str, "on", 2)) {
435                         dmar_disabled = 0;
436                         pr_info("IOMMU enabled\n");
437                 } else if (!strncmp(str, "off", 3)) {
438                         dmar_disabled = 1;
439                         no_platform_optin = 1;
440                         pr_info("IOMMU disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         pr_info("Disable GFX device mapping\n");
444                 } else if (!strncmp(str, "forcedac", 8)) {
445                         pr_info("Forcing DAC for PCI devices\n");
446                         dmar_forcedac = 1;
447                 } else if (!strncmp(str, "strict", 6)) {
448                         pr_info("Disable batched IOTLB flush\n");
449                         intel_iommu_strict = 1;
450                 } else if (!strncmp(str, "sp_off", 6)) {
451                         pr_info("Disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 } else if (!strncmp(str, "sm_on", 5)) {
454                         pr_info("Intel-IOMMU: scalable mode supported\n");
455                         intel_iommu_sm = 1;
456                 } else if (!strncmp(str, "tboot_noforce", 13)) {
457                         printk(KERN_INFO
458                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459                         intel_iommu_tboot_noforce = 1;
460                 }
461
462                 str += strcspn(str, ",");
463                 while (*str == ',')
464                         str++;
465         }
466         return 0;
467 }
468 __setup("intel_iommu=", intel_iommu_setup);
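/* Options are comma separated, e.g. "intel_iommu=on,sm_on,strict". */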
469
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
472
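/*
 * Per-IOMMU domain pointers live in a two-level table: the high byte of
 * the domain id selects a lazily allocated array of 256 pointers, the
 * low byte indexes within that array.
 */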
473 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
474 {
475         struct dmar_domain **domains;
476         int idx = did >> 8;
477
478         domains = iommu->domains[idx];
479         if (!domains)
480                 return NULL;
481
482         return domains[did & 0xff];
483 }
484
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486                              struct dmar_domain *domain)
487 {
488         struct dmar_domain **domains;
489         int idx = did >> 8;
490
491         if (!iommu->domains[idx]) {
492                 size_t size = 256 * sizeof(struct dmar_domain *);
493                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494         }
495
496         domains = iommu->domains[idx];
497         if (WARN_ON(!domains))
498                 return;
499         else
500                 domains[did & 0xff] = domain;
501 }
502
503 void *alloc_pgtable_page(int node)
504 {
505         struct page *page;
506         void *vaddr = NULL;
507
508         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
509         if (page)
510                 vaddr = page_address(page);
511         return vaddr;
512 }
513
514 void free_pgtable_page(void *vaddr)
515 {
516         free_page((unsigned long)vaddr);
517 }
518
519 static inline void *alloc_domain_mem(void)
520 {
521         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 }
523
524 static void free_domain_mem(void *vaddr)
525 {
526         kmem_cache_free(iommu_domain_cache, vaddr);
527 }
528
529 static inline void *alloc_devinfo_mem(void)
530 {
531         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 }
533
534 static inline void free_devinfo_mem(void *vaddr)
535 {
536         kmem_cache_free(iommu_devinfo_cache, vaddr);
537 }
538
539 static inline int domain_type_is_si(struct dmar_domain *domain)
540 {
541         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 }
543
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
545                                        unsigned long pfn)
546 {
547         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
548
549         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 }
551
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
553 {
554         unsigned long sagaw;
555         int agaw = -1;
556
557         sagaw = cap_sagaw(iommu->cap);
558         for (agaw = width_to_agaw(max_gaw);
559              agaw >= 0; agaw--) {
560                 if (test_bit(agaw, &sagaw))
561                         break;
562         }
563
564         return agaw;
565 }
566
567 /*
568  * Calculate max SAGAW for each iommu.
569  */
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
571 {
572         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
573 }
574
575 /*
576  * Calculate agaw for each iommu.
577  * "SAGAW" may differ across iommus; use a default agaw, and fall back
578  * to a smaller supported agaw for iommus that don't support the default.
579  */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
581 {
582         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 }
584
585 /* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
587 {
588         int iommu_id;
589
590         /* si_domain and vm domain should not get here. */
591         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592                 return NULL;
593
594         for_each_domain_iommu(iommu_id, domain)
595                 break;
596
597         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598                 return NULL;
599
600         return g_iommus[iommu_id];
601 }
602
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 {
605         struct dmar_drhd_unit *drhd;
606         struct intel_iommu *iommu;
607         bool found = false;
608         int i;
609
610         domain->iommu_coherency = 1;
611
612         for_each_domain_iommu(i, domain) {
613                 found = true;
614                 if (!ecap_coherent(g_iommus[i]->ecap)) {
615                         domain->iommu_coherency = 0;
616                         break;
617                 }
618         }
619         if (found)
620                 return;
621
622         /* No hardware attached; use lowest common denominator */
623         rcu_read_lock();
624         for_each_active_iommu(iommu, drhd) {
625                 if (!ecap_coherent(iommu->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         rcu_read_unlock();
631 }
632
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         int ret = 1;
638
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (iommu != skip) {
642                         if (!ecap_sc_support(iommu->ecap)) {
643                                 ret = 0;
644                                 break;
645                         }
646                 }
647         }
648         rcu_read_unlock();
649
650         return ret;
651 }
652
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
654 {
655         struct dmar_drhd_unit *drhd;
656         struct intel_iommu *iommu;
657         int mask = 0xf;
658
659         if (!intel_iommu_superpage) {
660                 return 0;
661         }
662
663         /* set iommu_superpage to the smallest common denominator */
664         rcu_read_lock();
665         for_each_active_iommu(iommu, drhd) {
666                 if (iommu != skip) {
667                         mask &= cap_super_page_val(iommu->cap);
668                         if (!mask)
669                                 break;
670                 }
671         }
672         rcu_read_unlock();
673
674         return fls(mask);
675 }
676
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
679 {
680         domain_update_iommu_coherency(domain);
681         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 }
684
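/*
 * Look up (and optionally allocate) the context entry for bus/devfn.
 * In scalable mode the root entry is split: root->lo covers devfn
 * 0x00-0x7f and root->hi covers devfn 0x80-0xff, with each device
 * taking two slots of the table (hence the devfn *= 2 below).
 */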
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686                                          u8 devfn, int alloc)
687 {
688         struct root_entry *root = &iommu->root_entry[bus];
689         struct context_entry *context;
690         u64 *entry;
691
692         entry = &root->lo;
693         if (sm_supported(iommu)) {
694                 if (devfn >= 0x80) {
695                         devfn -= 0x80;
696                         entry = &root->hi;
697                 }
698                 devfn *= 2;
699         }
700         if (*entry & 1)
701                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
702         else {
703                 unsigned long phy_addr;
704                 if (!alloc)
705                         return NULL;
706
707                 context = alloc_pgtable_page(iommu->node);
708                 if (!context)
709                         return NULL;
710
711                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712                 phy_addr = virt_to_phys((void *)context);
713                 *entry = phy_addr | 1;
714                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
715         }
716         return &context[devfn];
717 }
718
719 static int iommu_dummy(struct device *dev)
720 {
721         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
722 }
723
724 /**
725  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726  *                               sub-hierarchy of a candidate PCI-PCI bridge
727  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728  * @bridge: the candidate PCI-PCI bridge
729  *
730  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731  */
732 static bool
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
734 {
735         struct pci_dev *pdev, *pbridge;
736
737         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738                 return false;
739
740         pdev = to_pci_dev(dev);
741         pbridge = to_pci_dev(bridge);
742
743         if (pbridge->subordinate &&
744             pbridge->subordinate->number <= pdev->bus->number &&
745             pbridge->subordinate->busn_res.end >= pdev->bus->number)
746                 return true;
747
748         return false;
749 }
750
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
752 {
753         struct dmar_drhd_unit *drhd = NULL;
754         struct intel_iommu *iommu;
755         struct device *tmp;
756         struct pci_dev *pdev = NULL;
757         u16 segment = 0;
758         int i;
759
760         if (iommu_dummy(dev))
761                 return NULL;
762
763         if (dev_is_pci(dev)) {
764                 struct pci_dev *pf_pdev;
765
766                 pdev = to_pci_dev(dev);
767
768 #ifdef CONFIG_X86
769                 /* VMD child devices currently cannot be handled individually */
770                 if (is_vmd(pdev->bus))
771                         return NULL;
772 #endif
773
774                 /* VFs aren't listed in scope tables; we need to look up
775                  * the PF instead to find the IOMMU. */
776                 pf_pdev = pci_physfn(pdev);
777                 dev = &pf_pdev->dev;
778                 segment = pci_domain_nr(pdev->bus);
779         } else if (has_acpi_companion(dev))
780                 dev = &ACPI_COMPANION(dev)->dev;
781
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (pdev && segment != drhd->segment)
785                         continue;
786
787                 for_each_active_dev_scope(drhd->devices,
788                                           drhd->devices_cnt, i, tmp) {
789                         if (tmp == dev) {
790                                 /* For a VF use its original BDF# not that of the PF
791                                  * which we used for the IOMMU lookup. Strictly speaking
792                                  * we could do this for all PCI devices; we only need to
793                                  * get the BDF# from the scope table for ACPI matches. */
794                                 if (pdev && pdev->is_virtfn)
795                                         goto got_pdev;
796
797                                 *bus = drhd->devices[i].bus;
798                                 *devfn = drhd->devices[i].devfn;
799                                 goto out;
800                         }
801
802                         if (is_downstream_to_pci_bridge(dev, tmp))
803                                 goto got_pdev;
804                 }
805
806                 if (pdev && drhd->include_all) {
807                 got_pdev:
808                         *bus = pdev->bus->number;
809                         *devfn = pdev->devfn;
810                         goto out;
811                 }
812         }
813         iommu = NULL;
814  out:
815         rcu_read_unlock();
816
817         return iommu;
818 }
819
820 static void domain_flush_cache(struct dmar_domain *domain,
821                                void *addr, int size)
822 {
823         if (!domain->iommu_coherency)
824                 clflush_cache_range(addr, size);
825 }
826
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829         struct context_entry *context;
830         int ret = 0;
831         unsigned long flags;
832
833         spin_lock_irqsave(&iommu->lock, flags);
834         context = iommu_context_addr(iommu, bus, devfn, 0);
835         if (context)
836                 ret = context_present(context);
837         spin_unlock_irqrestore(&iommu->lock, flags);
838         return ret;
839 }
840
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843         int i;
844         unsigned long flags;
845         struct context_entry *context;
846
847         spin_lock_irqsave(&iommu->lock, flags);
848         if (!iommu->root_entry) {
849                 goto out;
850         }
851         for (i = 0; i < ROOT_ENTRY_NR; i++) {
852                 context = iommu_context_addr(iommu, i, 0, 0);
853                 if (context)
854                         free_pgtable_page(context);
855
856                 if (!sm_supported(iommu))
857                         continue;
858
859                 context = iommu_context_addr(iommu, i, 0x80, 0);
860                 if (context)
861                         free_pgtable_page(context);
862
863         }
864         free_pgtable_page(iommu->root_entry);
865         iommu->root_entry = NULL;
866 out:
867         spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871                                       unsigned long pfn, int *target_level)
872 {
873         struct dma_pte *parent, *pte;
874         int level = agaw_to_level(domain->agaw);
875         int offset;
876
877         BUG_ON(!domain->pgd);
878
879         if (!domain_pfn_supported(domain, pfn))
880                 /* Address beyond IOMMU's addressing capabilities. */
881                 return NULL;
882
883         parent = domain->pgd;
884
885         while (1) {
886                 void *tmp_page;
887
888                 offset = pfn_level_offset(pfn, level);
889                 pte = &parent[offset];
890                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
891                         break;
892                 if (level == *target_level)
893                         break;
894
895                 if (!dma_pte_present(pte)) {
896                         uint64_t pteval;
897
898                         tmp_page = alloc_pgtable_page(domain->nid);
899
900                         if (!tmp_page)
901                                 return NULL;
902
903                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905                         if (cmpxchg64(&pte->val, 0ULL, pteval))
906                                 /* Someone else set it while we were thinking; use theirs. */
907                                 free_pgtable_page(tmp_page);
908                         else
909                                 domain_flush_cache(domain, pte, sizeof(*pte));
910                 }
911                 if (level == 1)
912                         break;
913
914                 parent = phys_to_virt(dma_pte_addr(pte));
915                 level--;
916         }
917
918         if (!*target_level)
919                 *target_level = level;
920
921         return pte;
922 }
923
924 /* return address's pte at specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
926                                          unsigned long pfn,
927                                          int level, int *large_page)
928 {
929         struct dma_pte *parent, *pte;
930         int total = agaw_to_level(domain->agaw);
931         int offset;
932
933         parent = domain->pgd;
934         while (level <= total) {
935                 offset = pfn_level_offset(pfn, total);
936                 pte = &parent[offset];
937                 if (level == total)
938                         return pte;
939
940                 if (!dma_pte_present(pte)) {
941                         *large_page = total;
942                         break;
943                 }
944
945                 if (dma_pte_superpage(pte)) {
946                         *large_page = total;
947                         return pte;
948                 }
949
950                 parent = phys_to_virt(dma_pte_addr(pte));
951                 total--;
952         }
953         return NULL;
954 }
955
956 /* clear last level pte; a tlb flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958                                 unsigned long start_pfn,
959                                 unsigned long last_pfn)
960 {
961         unsigned int large_page;
962         struct dma_pte *first_pte, *pte;
963
964         BUG_ON(!domain_pfn_supported(domain, start_pfn));
965         BUG_ON(!domain_pfn_supported(domain, last_pfn));
966         BUG_ON(start_pfn > last_pfn);
967
968         /* we don't need lock here; nobody else touches the iova range */
969         do {
970                 large_page = 1;
971                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
972                 if (!pte) {
973                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974                         continue;
975                 }
976                 do {
977                         dma_clear_pte(pte);
978                         start_pfn += lvl_to_nr_pages(large_page);
979                         pte++;
980                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
981
982                 domain_flush_cache(domain, first_pte,
983                                    (void *)pte - (void *)first_pte);
984
985         } while (start_pfn && start_pfn <= last_pfn);
986 }
987
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989                                int retain_level, struct dma_pte *pte,
990                                unsigned long pfn, unsigned long start_pfn,
991                                unsigned long last_pfn)
992 {
993         pfn = max(start_pfn, pfn);
994         pte = &pte[pfn_level_offset(pfn, level)];
995
996         do {
997                 unsigned long level_pfn;
998                 struct dma_pte *level_pte;
999
1000                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001                         goto next;
1002
1003                 level_pfn = pfn & level_mask(level);
1004                 level_pte = phys_to_virt(dma_pte_addr(pte));
1005
1006                 if (level > 2) {
1007                         dma_pte_free_level(domain, level - 1, retain_level,
1008                                            level_pte, level_pfn, start_pfn,
1009                                            last_pfn);
1010                 }
1011
1012                 /*
1013                  * Free the page table if we're below the level we want to
1014                  * retain and the range covers the entire table.
1015                  */
1016                 if (level < retain_level && !(start_pfn > level_pfn ||
1017                       last_pfn < level_pfn + level_size(level) - 1)) {
1018                         dma_clear_pte(pte);
1019                         domain_flush_cache(domain, pte, sizeof(*pte));
1020                         free_pgtable_page(level_pte);
1021                 }
1022 next:
1023                 pfn += level_size(level);
1024         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1025 }
1026
1027 /*
1028  * clear last level (leaf) ptes and free page table pages below the
1029  * level we wish to keep intact.
1030  */
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032                                    unsigned long start_pfn,
1033                                    unsigned long last_pfn,
1034                                    int retain_level)
1035 {
1036         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038         BUG_ON(start_pfn > last_pfn);
1039
1040         dma_pte_clear_range(domain, start_pfn, last_pfn);
1041
1042         /* We don't need lock here; nobody else touches the iova range */
1043         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044                            domain->pgd, 0, start_pfn, last_pfn);
1045
1046         /* free pgd */
1047         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048                 free_pgtable_page(domain->pgd);
1049                 domain->pgd = NULL;
1050         }
1051 }
1052
1053 /* When a page at a given level is being unlinked from its parent, we don't
1054    need to *modify* it at all. All we need to do is make a list of all the
1055    pages which can be freed just as soon as we've flushed the IOTLB and we
1056    know the hardware page-walk will no longer touch them.
1057    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1058    be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060                                             int level, struct dma_pte *pte,
1061                                             struct page *freelist)
1062 {
1063         struct page *pg;
1064
1065         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066         pg->freelist = freelist;
1067         freelist = pg;
1068
1069         if (level == 1)
1070                 return freelist;
1071
1072         pte = page_address(pg);
1073         do {
1074                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075                         freelist = dma_pte_list_pagetables(domain, level - 1,
1076                                                            pte, freelist);
1077                 pte++;
1078         } while (!first_pte_in_page(pte));
1079
1080         return freelist;
1081 }
1082
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084                                         struct dma_pte *pte, unsigned long pfn,
1085                                         unsigned long start_pfn,
1086                                         unsigned long last_pfn,
1087                                         struct page *freelist)
1088 {
1089         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1090
1091         pfn = max(start_pfn, pfn);
1092         pte = &pte[pfn_level_offset(pfn, level)];
1093
1094         do {
1095                 unsigned long level_pfn;
1096
1097                 if (!dma_pte_present(pte))
1098                         goto next;
1099
1100                 level_pfn = pfn & level_mask(level);
1101
1102                 /* If range covers entire pagetable, free it */
1103                 if (start_pfn <= level_pfn &&
1104                     last_pfn >= level_pfn + level_size(level) - 1) {
1105                         /* These subordinate page tables are going away entirely. Don't
1106                            bother to clear them; we're just going to *free* them. */
1107                         if (level > 1 && !dma_pte_superpage(pte))
1108                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1109
1110                         dma_clear_pte(pte);
1111                         if (!first_pte)
1112                                 first_pte = pte;
1113                         last_pte = pte;
1114                 } else if (level > 1) {
1115                         /* Recurse down into a level that isn't *entirely* obsolete */
1116                         freelist = dma_pte_clear_level(domain, level - 1,
1117                                                        phys_to_virt(dma_pte_addr(pte)),
1118                                                        level_pfn, start_pfn, last_pfn,
1119                                                        freelist);
1120                 }
1121 next:
1122                 pfn += level_size(level);
1123         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124
1125         if (first_pte)
1126                 domain_flush_cache(domain, first_pte,
1127                                    (void *)++last_pte - (void *)first_pte);
1128
1129         return freelist;
1130 }
1131
1132 /* We can't just free the pages because the IOMMU may still be walking
1133    the page tables, and may have cached the intermediate levels. The
1134    pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136                                  unsigned long start_pfn,
1137                                  unsigned long last_pfn)
1138 {
1139         struct page *freelist;
1140
1141         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143         BUG_ON(start_pfn > last_pfn);
1144
1145         /* we don't need lock here; nobody else touches the iova range */
1146         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1148
1149         /* free pgd */
1150         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151                 struct page *pgd_page = virt_to_page(domain->pgd);
1152                 pgd_page->freelist = freelist;
1153                 freelist = pgd_page;
1154
1155                 domain->pgd = NULL;
1156         }
1157
1158         return freelist;
1159 }
1160
1161 static void dma_free_pagelist(struct page *freelist)
1162 {
1163         struct page *pg;
1164
1165         while ((pg = freelist)) {
1166                 freelist = pg->freelist;
1167                 free_pgtable_page(page_address(pg));
1168         }
1169 }
1170
1171 static void iova_entry_free(unsigned long data)
1172 {
1173         struct page *freelist = (struct page *)data;
1174
1175         dma_free_pagelist(freelist);
1176 }
1177
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1180 {
1181         struct root_entry *root;
1182         unsigned long flags;
1183
1184         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1185         if (!root) {
1186                 pr_err("Allocating root entry for %s failed\n",
1187                         iommu->name);
1188                 return -ENOMEM;
1189         }
1190
1191         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1192
1193         spin_lock_irqsave(&iommu->lock, flags);
1194         iommu->root_entry = root;
1195         spin_unlock_irqrestore(&iommu->lock, flags);
1196
1197         return 0;
1198 }
1199
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1201 {
1202         u64 addr;
1203         u32 sts;
1204         unsigned long flag;
1205
1206         addr = virt_to_phys(iommu->root_entry);
1207         if (sm_supported(iommu))
1208                 addr |= DMA_RTADDR_SMT;
1209
1210         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1212
1213         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1214
1215         /* Make sure hardware completes it */
1216         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                       readl, (sts & DMA_GSTS_RTPS), sts);
1218
1219         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 }
1221
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 {
1224         u32 val;
1225         unsigned long flag;
1226
1227         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228                 return;
1229
1230         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1232
1233         /* Make sure hardware completes it */
1234         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235                       readl, (!(val & DMA_GSTS_WBFS)), val);
1236
1237         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 }
1239
1240 /* return value determines if we need a write buffer flush */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242                                   u16 did, u16 source_id, u8 function_mask,
1243                                   u64 type)
1244 {
1245         u64 val = 0;
1246         unsigned long flag;
1247
1248         switch (type) {
1249         case DMA_CCMD_GLOBAL_INVL:
1250                 val = DMA_CCMD_GLOBAL_INVL;
1251                 break;
1252         case DMA_CCMD_DOMAIN_INVL:
1253                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1254                 break;
1255         case DMA_CCMD_DEVICE_INVL:
1256                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258                 break;
1259         default:
1260                 BUG();
1261         }
1262         val |= DMA_CCMD_ICC;
1263
1264         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1266
1267         /* Make sure hardware completes it */
1268         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1270
1271         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 }
1273
1274 /* return value determines if we need a write buffer flush */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276                                 u64 addr, unsigned int size_order, u64 type)
1277 {
1278         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279         u64 val = 0, val_iva = 0;
1280         unsigned long flag;
1281
1282         switch (type) {
1283         case DMA_TLB_GLOBAL_FLUSH:
1284                 /* global flush doesn't need to set IVA_REG */
1285                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1286                 break;
1287         case DMA_TLB_DSI_FLUSH:
1288                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1289                 break;
1290         case DMA_TLB_PSI_FLUSH:
1291                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292                 /* IH bit is passed in as part of address */
1293                 val_iva = size_order | addr;
1294                 break;
1295         default:
1296                 BUG();
1297         }
1298         /* Note: set drain read/write */
1299 #if 0
1300         /*
1301          * This is probably meant to be extra safe. It looks like we can
1302          * ignore it without any impact.
1303          */
1304         if (cap_read_drain(iommu->cap))
1305                 val |= DMA_TLB_READ_DRAIN;
1306 #endif
1307         if (cap_write_drain(iommu->cap))
1308                 val |= DMA_TLB_WRITE_DRAIN;
1309
1310         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311         /* Note: Only uses first TLB reg currently */
1312         if (val_iva)
1313                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1315
1316         /* Make sure hardware completes it */
1317         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1319
1320         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1321
1322         /* check IOTLB invalidation granularity */
1323         if (DMA_TLB_IAIG(val) == 0)
1324                 pr_err("Flush IOTLB failed\n");
1325         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327                         (unsigned long long)DMA_TLB_IIRG(type),
1328                         (unsigned long long)DMA_TLB_IAIG(val));
1329 }
1330
1331 static struct device_domain_info *
1332 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333                          u8 bus, u8 devfn)
1334 {
1335         struct device_domain_info *info;
1336
1337         assert_spin_locked(&device_domain_lock);
1338
1339         if (!iommu->qi)
1340                 return NULL;
1341
1342         list_for_each_entry(info, &domain->devices, link)
1343                 if (info->iommu == iommu && info->bus == bus &&
1344                     info->devfn == devfn) {
1345                         if (info->ats_supported && info->dev)
1346                                 return info;
1347                         break;
1348                 }
1349
1350         return NULL;
1351 }
1352
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1354 {
1355         struct device_domain_info *info;
1356         bool has_iotlb_device = false;
1357
1358         assert_spin_locked(&device_domain_lock);
1359
1360         list_for_each_entry(info, &domain->devices, link) {
1361                 struct pci_dev *pdev;
1362
1363                 if (!info->dev || !dev_is_pci(info->dev))
1364                         continue;
1365
1366                 pdev = to_pci_dev(info->dev);
1367                 if (pdev->ats_enabled) {
1368                         has_iotlb_device = true;
1369                         break;
1370                 }
1371         }
1372
1373         domain->has_iotlb_device = has_iotlb_device;
1374 }
1375
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1377 {
1378         struct pci_dev *pdev;
1379
1380         assert_spin_locked(&device_domain_lock);
1381
1382         if (!info || !dev_is_pci(info->dev))
1383                 return;
1384
1385         pdev = to_pci_dev(info->dev);
1386         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1387          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1388          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1389          * reserved, which should be set to 0.
1390          */
1391         if (!ecap_dit(info->iommu->ecap))
1392                 info->pfsid = 0;
1393         else {
1394                 struct pci_dev *pf_pdev;
1395
1396                 /* pdev will be returned if the device is not a VF */
1397                 pf_pdev = pci_physfn(pdev);
1398                 info->pfsid = pci_dev_id(pf_pdev);
1399         }
1400
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
1402         /* The PCIe spec, in its wisdom, declares that the behaviour of
1403            the device if you enable PASID support after ATS support is
1404            undefined. So always enable PASID support on devices which
1405            have it, even if we can't yet know if we're ever going to
1406            use it. */
1407         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408                 info->pasid_enabled = 1;
1409
1410         if (info->pri_supported &&
1411             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1412             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413                 info->pri_enabled = 1;
1414 #endif
1415         if (!pdev->untrusted && info->ats_supported &&
1416             pci_ats_page_aligned(pdev) &&
1417             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418                 info->ats_enabled = 1;
1419                 domain_update_iotlb(info->domain);
1420                 info->ats_qdep = pci_ats_queue_depth(pdev);
1421         }
1422 }
1423
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1425 {
1426         struct pci_dev *pdev;
1427
1428         assert_spin_locked(&device_domain_lock);
1429
1430         if (!dev_is_pci(info->dev))
1431                 return;
1432
1433         pdev = to_pci_dev(info->dev);
1434
1435         if (info->ats_enabled) {
1436                 pci_disable_ats(pdev);
1437                 info->ats_enabled = 0;
1438                 domain_update_iotlb(info->domain);
1439         }
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441         if (info->pri_enabled) {
1442                 pci_disable_pri(pdev);
1443                 info->pri_enabled = 0;
1444         }
1445         if (info->pasid_enabled) {
1446                 pci_disable_pasid(pdev);
1447                 info->pasid_enabled = 0;
1448         }
1449 #endif
1450 }
1451
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453                                   u64 addr, unsigned mask)
1454 {
1455         u16 sid, qdep;
1456         unsigned long flags;
1457         struct device_domain_info *info;
1458
1459         if (!domain->has_iotlb_device)
1460                 return;
1461
1462         spin_lock_irqsave(&device_domain_lock, flags);
1463         list_for_each_entry(info, &domain->devices, link) {
1464                 if (!info->ats_enabled)
1465                         continue;
1466
1467                 sid = info->bus << 8 | info->devfn;
1468                 qdep = info->ats_qdep;
1469                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470                                 qdep, addr, mask);
1471         }
1472         spin_unlock_irqrestore(&device_domain_lock, flags);
1473 }
1474
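/*
 * Page-selective invalidation rounds the request up to a power-of-two
 * number of pages, e.g. flushing 5 pages yields mask = 3 (8 pages); if
 * the hardware cannot handle that mask, a domain-selective flush is
 * used instead.
 */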
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476                                   struct dmar_domain *domain,
1477                                   unsigned long pfn, unsigned int pages,
1478                                   int ih, int map)
1479 {
1480         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482         u16 did = domain->iommu_did[iommu->seq_id];
1483
1484         BUG_ON(pages == 0);
1485
1486         if (ih)
1487                 ih = 1 << 6;
1488         /*
1489          * Fall back to a domain-selective flush if there is no PSI support
1490          * or the size is too big.
1491          * PSI requires the page size to be 2^x, and the base address to be
1492          * naturally aligned to that size.
1493          */
1494         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496                                                 DMA_TLB_DSI_FLUSH);
1497         else
1498                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1499                                                 DMA_TLB_PSI_FLUSH);
1500
1501         /*
1502          * In caching mode, changes of pages from non-present to present require
1503          * flush. However, device IOTLB doesn't need to be flushed in this case.
1504          */
1505         if (!cap_caching_mode(iommu->cap) || !map)
1506                 iommu_flush_dev_iotlb(domain, addr, mask);
1507 }
1508
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511                                         struct dmar_domain *domain,
1512                                         unsigned long pfn, unsigned int pages)
1513 {
1514         /* It's a non-present to present mapping. Only flush if caching mode is enabled. */
1515         if (cap_caching_mode(iommu->cap))
1516                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1517         else
1518                 iommu_flush_write_buffer(iommu);
1519 }
1520
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1522 {
1523         struct dmar_domain *domain;
1524         int idx;
1525
1526         domain = container_of(iovad, struct dmar_domain, iovad);
1527
1528         for_each_domain_iommu(idx, domain) {
1529                 struct intel_iommu *iommu = g_iommus[idx];
1530                 u16 did = domain->iommu_did[iommu->seq_id];
1531
1532                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1533
1534                 if (!cap_caching_mode(iommu->cap))
1535                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536                                               0, MAX_AGAW_PFN_WIDTH);
1537         }
1538 }
1539
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 {
1542         u32 pmen;
1543         unsigned long flags;
1544
1545         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546                 return;
1547
1548         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550         pmen &= ~DMA_PMEN_EPM;
1551         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1552
1553         /* wait for the protected region status bit to clear */
1554         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1556
1557         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 }
1559
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 {
1562         u32 sts;
1563         unsigned long flags;
1564
1565         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566         iommu->gcmd |= DMA_GCMD_TE;
1567         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1568
1569         /* Make sure hardware completes it */
1570         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571                       readl, (sts & DMA_GSTS_TES), sts);
1572
1573         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 }
1575
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1577 {
1578         u32 sts;
1579         unsigned long flag;
1580
1581         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582         iommu->gcmd &= ~DMA_GCMD_TE;
1583         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585         /* Make sure hardware completes it */
1586         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                       readl, (!(sts & DMA_GSTS_TES)), sts);
1588
1589         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 }
1591
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1593 {
1594         u32 ndomains, nlongs;
1595         size_t size;
1596
1597         ndomains = cap_ndoms(iommu->cap);
1598         pr_debug("%s: Number of Domains supported <%d>\n",
1599                  iommu->name, ndomains);
1600         nlongs = BITS_TO_LONGS(ndomains);
1601
1602         spin_lock_init(&iommu->lock);
1603
1604         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605         if (!iommu->domain_ids) {
1606                 pr_err("%s: Allocating domain id array failed\n",
1607                        iommu->name);
1608                 return -ENOMEM;
1609         }
1610
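        /*
         * iommu->domains is a two-level array: each second-level chunk
         * holds 256 domain pointers, so ALIGN(ndomains, 256) / 256
         * first-level slots are needed.  Only chunk 0 is pre-allocated
         * here.
         */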
1611         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612         iommu->domains = kzalloc(size, GFP_KERNEL);
1613
1614         if (iommu->domains) {
1615                 size = 256 * sizeof(struct dmar_domain *);
1616                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617         }
1618
1619         if (!iommu->domains || !iommu->domains[0]) {
1620                 pr_err("%s: Allocating domain array failed\n",
1621                        iommu->name);
1622                 kfree(iommu->domain_ids);
1623                 kfree(iommu->domains);
1624                 iommu->domain_ids = NULL;
1625                 iommu->domains    = NULL;
1626                 return -ENOMEM;
1627         }
1628
1629         /*
1630          * If Caching mode is set, then invalid translations are tagged
1631          * with domain-id 0, hence we need to pre-allocate it. We also
1632          * use domain-id 0 as a marker for non-allocated domain-id, so
1633          * make sure it is not used for a real domain.
1634          */
1635         set_bit(0, iommu->domain_ids);
1636
1637         /*
1638          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1639          * entry for first-level or pass-through translation modes should
1640          * be programmed with a domain id different from those used for
1641          * second-level or nested translation. We reserve a domain id for
1642          * this purpose.
1643          */
1644         if (sm_supported(iommu))
1645                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646
1647         return 0;
1648 }
1649
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1651 {
1652         struct device_domain_info *info, *tmp;
1653         unsigned long flags;
1654
1655         if (!iommu->domains || !iommu->domain_ids)
1656                 return;
1657
1658         spin_lock_irqsave(&device_domain_lock, flags);
1659         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660                 if (info->iommu != iommu)
1661                         continue;
1662
1663                 if (!info->dev || !info->domain)
1664                         continue;
1665
1666                 __dmar_remove_one_dev_info(info);
1667         }
1668         spin_unlock_irqrestore(&device_domain_lock, flags);
1669
1670         if (iommu->gcmd & DMA_GCMD_TE)
1671                 iommu_disable_translation(iommu);
1672 }
1673
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1675 {
1676         if ((iommu->domains) && (iommu->domain_ids)) {
1677                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678                 int i;
1679
1680                 for (i = 0; i < elems; i++)
1681                         kfree(iommu->domains[i]);
1682                 kfree(iommu->domains);
1683                 kfree(iommu->domain_ids);
1684                 iommu->domains = NULL;
1685                 iommu->domain_ids = NULL;
1686         }
1687
1688         g_iommus[iommu->seq_id] = NULL;
1689
1690         /* free context mapping */
1691         free_context_table(iommu);
1692
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694         if (pasid_supported(iommu)) {
1695                 if (ecap_prs(iommu->ecap))
1696                         intel_svm_finish_prq(iommu);
1697         }
1698 #endif
1699 }
1700
1701 static struct dmar_domain *alloc_domain(int flags)
1702 {
1703         struct dmar_domain *domain;
1704
1705         domain = alloc_domain_mem();
1706         if (!domain)
1707                 return NULL;
1708
1709         memset(domain, 0, sizeof(*domain));
1710         domain->nid = NUMA_NO_NODE;
1711         domain->flags = flags;
1712         domain->has_iotlb_device = false;
1713         INIT_LIST_HEAD(&domain->devices);
1714
1715         return domain;
1716 }
1717
1718 /* Must be called with iommu->lock */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720                                struct intel_iommu *iommu)
1721 {
1722         unsigned long ndomains;
1723         int num;
1724
1725         assert_spin_locked(&device_domain_lock);
1726         assert_spin_locked(&iommu->lock);
1727
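        /*
         * Bump the per-IOMMU reference; on the first attachment to this
         * IOMMU also allocate a domain id (DID) from its bitmap, publish
         * the domain in the IOMMU's domain table and refresh the domain's
         * aggregate capabilities.
         */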
1728         domain->iommu_refcnt[iommu->seq_id] += 1;
1729         domain->iommu_count += 1;
1730         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731                 ndomains = cap_ndoms(iommu->cap);
1732                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1733
1734                 if (num >= ndomains) {
1735                         pr_err("%s: No free domain ids\n", iommu->name);
1736                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1737                         domain->iommu_count -= 1;
1738                         return -ENOSPC;
1739                 }
1740
1741                 set_bit(num, iommu->domain_ids);
1742                 set_iommu_domain(iommu, num, domain);
1743
1744                 domain->iommu_did[iommu->seq_id] = num;
1745                 domain->nid                      = iommu->node;
1746
1747                 domain_update_iommu_cap(domain);
1748         }
1749
1750         return 0;
1751 }
1752
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754                                struct intel_iommu *iommu)
1755 {
1756         int num, count;
1757
1758         assert_spin_locked(&device_domain_lock);
1759         assert_spin_locked(&iommu->lock);
1760
1761         domain->iommu_refcnt[iommu->seq_id] -= 1;
1762         count = --domain->iommu_count;
1763         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764                 num = domain->iommu_did[iommu->seq_id];
1765                 clear_bit(num, iommu->domain_ids);
1766                 set_iommu_domain(iommu, num, NULL);
1767
1768                 domain_update_iommu_cap(domain);
1769                 domain->iommu_did[iommu->seq_id] = 0;
1770         }
1771
1772         return count;
1773 }
1774
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1777
1778 static int dmar_init_reserved_ranges(void)
1779 {
1780         struct pci_dev *pdev = NULL;
1781         struct iova *iova;
1782         int i;
1783
1784         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1785
1786         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787                 &reserved_rbtree_key);
1788
1789         /* IOAPIC ranges shouldn't be accessed by DMA */
1790         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791                 IOVA_PFN(IOAPIC_RANGE_END));
1792         if (!iova) {
1793                 pr_err("Reserve IOAPIC range failed\n");
1794                 return -ENODEV;
1795         }
1796
1797         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798         for_each_pci_dev(pdev) {
1799                 struct resource *r;
1800
1801                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802                         r = &pdev->resource[i];
1803                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1804                                 continue;
1805                         iova = reserve_iova(&reserved_iova_list,
1806                                             IOVA_PFN(r->start),
1807                                             IOVA_PFN(r->end));
1808                         if (!iova) {
1809                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1810                                 return -ENODEV;
1811                         }
1812                 }
1813         }
1814         return 0;
1815 }
1816
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1818 {
1819         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 }
1821
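/*
 * Round the guest address width up to the nearest width the page tables can
 * express: 12 bits of page offset plus a whole number of 9-bit levels,
 * capped at 64 bits.
 */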
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1823 {
1824         int agaw;
1825         int r = (gaw - 12) % 9;
1826
1827         if (r == 0)
1828                 agaw = gaw;
1829         else
1830                 agaw = gaw + 9 - r;
1831         if (agaw > 64)
1832                 agaw = 64;
1833         return agaw;
1834 }
1835
1836 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837                        int guest_width)
1838 {
1839         int adjust_width, agaw;
1840         unsigned long sagaw;
1841         int err;
1842
1843         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1844
1845         err = init_iova_flush_queue(&domain->iovad,
1846                                     iommu_flush_iova, iova_entry_free);
1847         if (err)
1848                 return err;
1849
1850         domain_reserve_special_ranges(domain);
1851
1852         /* calculate AGAW */
1853         if (guest_width > cap_mgaw(iommu->cap))
1854                 guest_width = cap_mgaw(iommu->cap);
1855         domain->gaw = guest_width;
1856         adjust_width = guestwidth_to_adjustwidth(guest_width);
1857         agaw = width_to_agaw(adjust_width);
1858         sagaw = cap_sagaw(iommu->cap);
1859         if (!test_bit(agaw, &sagaw)) {
1860                 /* hardware doesn't support it, choose a bigger one */
1861                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1862                 agaw = find_next_bit(&sagaw, 5, agaw);
1863                 if (agaw >= 5)
1864                         return -ENODEV;
1865         }
1866         domain->agaw = agaw;
1867
1868         if (ecap_coherent(iommu->ecap))
1869                 domain->iommu_coherency = 1;
1870         else
1871                 domain->iommu_coherency = 0;
1872
1873         if (ecap_sc_support(iommu->ecap))
1874                 domain->iommu_snooping = 1;
1875         else
1876                 domain->iommu_snooping = 0;
1877
1878         if (intel_iommu_superpage)
1879                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1880         else
1881                 domain->iommu_superpage = 0;
1882
1883         domain->nid = iommu->node;
1884
1885         /* always allocate the top pgd */
1886         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887         if (!domain->pgd)
1888                 return -ENOMEM;
1889         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1890         return 0;
1891 }
1892
1893 static void domain_exit(struct dmar_domain *domain)
1894 {
1895
1896         /* Remove associated devices and clear attached or cached domains */
1897         domain_remove_dev_info(domain);
1898
1899         /* destroy iovas */
1900         put_iova_domain(&domain->iovad);
1901
1902         if (domain->pgd) {
1903                 struct page *freelist;
1904
1905                 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1906                 dma_free_pagelist(freelist);
1907         }
1908
1909         free_domain_mem(domain);
1910 }
1911
1912 /*
1913  * Get the PASID directory size for a scalable mode context entry.
1914  * Value of X in the PDTS field of a scalable mode context entry
1915  * indicates PASID directory with 2^(X + 7) entries.
1916  */
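/*
 * For example, with 64 PASIDs per directory entry a 2^20-entry PASID space
 * needs 2^14 directory entries; find_first_bit() then returns 14, pds is
 * 14 - 7 = 7, and hardware decodes that back to 2^(7 + 7) = 2^14 entries.
 */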
1917 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1918 {
1919         int pds, max_pde;
1920
1921         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1922         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1923         if (pds < 7)
1924                 return 0;
1925
1926         return pds - 7;
1927 }
1928
1929 /*
1930  * Set the RID_PASID field of a scalable mode context entry. The
1931  * IOMMU hardware will use the PASID value set in this field to
1932  * translate DMA requests that do not carry a PASID.
1933  */
1934 static inline void
1935 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1936 {
1937         context->hi |= pasid & ((1 << 20) - 1);
1938         context->hi |= (1 << 20);
1939 }
1940
1941 /*
1942  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1943  * entry.
1944  */
1945 static inline void context_set_sm_dte(struct context_entry *context)
1946 {
1947         context->lo |= (1 << 2);
1948 }
1949
1950 /*
1951  * Set the PRE (Page Request Enable) field of a scalable mode context
1952  * entry.
1953  */
1954 static inline void context_set_sm_pre(struct context_entry *context)
1955 {
1956         context->lo |= (1 << 4);
1957 }
1958
1959 /* Convert value to context PASID directory size field coding. */
1960 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1961
1962 static int domain_context_mapping_one(struct dmar_domain *domain,
1963                                       struct intel_iommu *iommu,
1964                                       struct pasid_table *table,
1965                                       u8 bus, u8 devfn)
1966 {
1967         u16 did = domain->iommu_did[iommu->seq_id];
1968         int translation = CONTEXT_TT_MULTI_LEVEL;
1969         struct device_domain_info *info = NULL;
1970         struct context_entry *context;
1971         unsigned long flags;
1972         int ret;
1973
1974         WARN_ON(did == 0);
1975
1976         if (hw_pass_through && domain_type_is_si(domain))
1977                 translation = CONTEXT_TT_PASS_THROUGH;
1978
1979         pr_debug("Set context mapping for %02x:%02x.%d\n",
1980                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1981
1982         BUG_ON(!domain->pgd);
1983
1984         spin_lock_irqsave(&device_domain_lock, flags);
1985         spin_lock(&iommu->lock);
1986
1987         ret = -ENOMEM;
1988         context = iommu_context_addr(iommu, bus, devfn, 1);
1989         if (!context)
1990                 goto out_unlock;
1991
1992         ret = 0;
1993         if (context_present(context))
1994                 goto out_unlock;
1995
1996         /*
1997          * For kdump cases, old valid entries may be cached due to the
1998          * in-flight DMA and copied pgtable, but there is no unmapping
1999          * behaviour for them, thus we need an explicit cache flush for
2000          * the newly-mapped device. For kdump, at this point, the device
2001          * is supposed to have finished reset at its driver probe stage,
2002          * so no in-flight DMA will exist, and we don't need to worry
2003          * about it hereafter.
2004          */
2005         if (context_copied(context)) {
2006                 u16 did_old = context_domain_id(context);
2007
2008                 if (did_old < cap_ndoms(iommu->cap)) {
2009                         iommu->flush.flush_context(iommu, did_old,
2010                                                    (((u16)bus) << 8) | devfn,
2011                                                    DMA_CCMD_MASK_NOBIT,
2012                                                    DMA_CCMD_DEVICE_INVL);
2013                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2014                                                  DMA_TLB_DSI_FLUSH);
2015                 }
2016         }
2017
2018         context_clear_entry(context);
2019
2020         if (sm_supported(iommu)) {
2021                 unsigned long pds;
2022
2023                 WARN_ON(!table);
2024
2025                 /* Setup the PASID DIR pointer: */
2026                 pds = context_get_sm_pds(table);
2027                 context->lo = (u64)virt_to_phys(table->table) |
2028                                 context_pdts(pds);
2029
2030                 /* Setup the RID_PASID field: */
2031                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2032
2033                 /*
2034                  * Setup the Device-TLB enable bit and Page request
2035                  * Enable bit:
2036                  */
2037                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2038                 if (info && info->ats_supported)
2039                         context_set_sm_dte(context);
2040                 if (info && info->pri_supported)
2041                         context_set_sm_pre(context);
2042         } else {
2043                 struct dma_pte *pgd = domain->pgd;
2044                 int agaw;
2045
2046                 context_set_domain_id(context, did);
2047
2048                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2049                         /*
2050                          * Skip top levels of page tables for IOMMUs whose agaw
2051                          * is less than the default. Unnecessary for PT mode.
2052                          */
2053                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2054                                 ret = -ENOMEM;
2055                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2056                                 if (!dma_pte_present(pgd))
2057                                         goto out_unlock;
2058                         }
2059
2060                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2061                         if (info && info->ats_supported)
2062                                 translation = CONTEXT_TT_DEV_IOTLB;
2063                         else
2064                                 translation = CONTEXT_TT_MULTI_LEVEL;
2065
2066                         context_set_address_root(context, virt_to_phys(pgd));
2067                         context_set_address_width(context, agaw);
2068                 } else {
2069                         /*
2070                          * In pass-through mode, AW must be programmed to
2071                          * indicate the largest AGAW value supported by
2072                          * hardware; ASR is ignored by hardware.
2073                          */
2074                         context_set_address_width(context, iommu->msagaw);
2075                 }
2076
2077                 context_set_translation_type(context, translation);
2078         }
2079
2080         context_set_fault_enable(context);
2081         context_set_present(context);
2082         domain_flush_cache(domain, context, sizeof(*context));
2083
2084         /*
2085          * It's a non-present to present mapping. If hardware doesn't cache
2086          * non-present entries we only need to flush the write-buffer. If it
2087          * _does_ cache non-present entries, then it does so in the special
2088          * domain #0, which we have to flush:
2089          */
2090         if (cap_caching_mode(iommu->cap)) {
2091                 iommu->flush.flush_context(iommu, 0,
2092                                            (((u16)bus) << 8) | devfn,
2093                                            DMA_CCMD_MASK_NOBIT,
2094                                            DMA_CCMD_DEVICE_INVL);
2095                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2096         } else {
2097                 iommu_flush_write_buffer(iommu);
2098         }
2099         iommu_enable_dev_iotlb(info);
2100
2101         ret = 0;
2102
2103 out_unlock:
2104         spin_unlock(&iommu->lock);
2105         spin_unlock_irqrestore(&device_domain_lock, flags);
2106
2107         return ret;
2108 }
2109
2110 struct domain_context_mapping_data {
2111         struct dmar_domain *domain;
2112         struct intel_iommu *iommu;
2113         struct pasid_table *table;
2114 };
2115
2116 static int domain_context_mapping_cb(struct pci_dev *pdev,
2117                                      u16 alias, void *opaque)
2118 {
2119         struct domain_context_mapping_data *data = opaque;
2120
2121         return domain_context_mapping_one(data->domain, data->iommu,
2122                                           data->table, PCI_BUS_NUM(alias),
2123                                           alias & 0xff);
2124 }
2125
2126 static int
2127 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2128 {
2129         struct domain_context_mapping_data data;
2130         struct pasid_table *table;
2131         struct intel_iommu *iommu;
2132         u8 bus, devfn;
2133
2134         iommu = device_to_iommu(dev, &bus, &devfn);
2135         if (!iommu)
2136                 return -ENODEV;
2137
2138         table = intel_pasid_get_table(dev);
2139
2140         if (!dev_is_pci(dev))
2141                 return domain_context_mapping_one(domain, iommu, table,
2142                                                   bus, devfn);
2143
2144         data.domain = domain;
2145         data.iommu = iommu;
2146         data.table = table;
2147
2148         return pci_for_each_dma_alias(to_pci_dev(dev),
2149                                       &domain_context_mapping_cb, &data);
2150 }
2151
2152 static int domain_context_mapped_cb(struct pci_dev *pdev,
2153                                     u16 alias, void *opaque)
2154 {
2155         struct intel_iommu *iommu = opaque;
2156
2157         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2158 }
2159
2160 static int domain_context_mapped(struct device *dev)
2161 {
2162         struct intel_iommu *iommu;
2163         u8 bus, devfn;
2164
2165         iommu = device_to_iommu(dev, &bus, &devfn);
2166         if (!iommu)
2167                 return -ENODEV;
2168
2169         if (!dev_is_pci(dev))
2170                 return device_context_mapped(iommu, bus, devfn);
2171
2172         return !pci_for_each_dma_alias(to_pci_dev(dev),
2173                                        domain_context_mapped_cb, iommu);
2174 }
2175
2176 /* Returns a number of VTD pages, but aligned to MM page size */
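/*
 * For example, with 4KiB MM pages an offset of 0x100 and a size of 0x2000
 * cover bytes 0x100 through 0x20ff; PAGE_ALIGN() rounds that up to 0x3000,
 * which is 3 VT-d pages.
 */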
2177 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2178                                             size_t size)
2179 {
2180         host_addr &= ~PAGE_MASK;
2181         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2182 }
2183
2184 /* Return largest possible superpage level for a given mapping */
2185 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2186                                           unsigned long iov_pfn,
2187                                           unsigned long phy_pfn,
2188                                           unsigned long pages)
2189 {
2190         int support, level = 1;
2191         unsigned long pfnmerge;
2192
2193         support = domain->iommu_superpage;
2194
2195         /* To use a large page, the virtual *and* physical addresses
2196            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2197            of them will mean we have to use smaller pages. So just
2198            merge them and check both at once. */
2199         pfnmerge = iov_pfn | phy_pfn;
2200
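        /* Each extra level covers VTD_STRIDE_SHIFT (9) more address bits,
           so keep promoting to a larger page size while both PFNs stay
           aligned, enough pages remain, and the hardware advertises a
           further superpage size. */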
2201         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2202                 pages >>= VTD_STRIDE_SHIFT;
2203                 if (!pages)
2204                         break;
2205                 pfnmerge >>= VTD_STRIDE_SHIFT;
2206                 level++;
2207                 support--;
2208         }
2209         return level;
2210 }
2211
2212 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2213                             struct scatterlist *sg, unsigned long phys_pfn,
2214                             unsigned long nr_pages, int prot)
2215 {
2216         struct dma_pte *first_pte = NULL, *pte = NULL;
2217         phys_addr_t uninitialized_var(pteval);
2218         unsigned long sg_res = 0;
2219         unsigned int largepage_lvl = 0;
2220         unsigned long lvl_pages = 0;
2221
2222         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2223
2224         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2225                 return -EINVAL;
2226
2227         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2228
2229         if (!sg) {
2230                 sg_res = nr_pages;
2231                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2232         }
2233
2234         while (nr_pages > 0) {
2235                 uint64_t tmp;
2236
2237                 if (!sg_res) {
2238                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2239
2240                         sg_res = aligned_nrpages(sg->offset, sg->length);
2241                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2242                         sg->dma_length = sg->length;
2243                         pteval = (sg_phys(sg) - pgoff) | prot;
2244                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2245                 }
2246
2247                 if (!pte) {
2248                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2249
2250                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2251                         if (!pte)
2252                                 return -ENOMEM;
2253                         /* It is a large page */
2254                         if (largepage_lvl > 1) {
2255                                 unsigned long nr_superpages, end_pfn;
2256
2257                                 pteval |= DMA_PTE_LARGE_PAGE;
2258                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2259
2260                                 nr_superpages = sg_res / lvl_pages;
2261                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2262
2263                                 /*
2264                                  * Ensure that old small page tables are
2265                                  * removed to make room for superpage(s).
2266                                  * We're adding new large pages, so make sure
2267                                  * we don't remove their parent tables.
2268                                  */
2269                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2270                                                        largepage_lvl + 1);
2271                         } else {
2272                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2273                         }
2274
2275                 }
2276                 /* We don't need a lock here; nobody else
2277                  * touches this iova range.
2278                  */
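                /*
                 * cmpxchg64_local() both installs the new PTE and catches
                 * the error case where a PTE was already present
                 * (tmp != 0), which is reported below.
                 */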
2279                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2280                 if (tmp) {
2281                         static int dumps = 5;
2282                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2283                                 iov_pfn, tmp, (unsigned long long)pteval);
2284                         if (dumps) {
2285                                 dumps--;
2286                                 debug_dma_dump_mappings(NULL);
2287                         }
2288                         WARN_ON(1);
2289                 }
2290
2291                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2292
2293                 BUG_ON(nr_pages < lvl_pages);
2294                 BUG_ON(sg_res < lvl_pages);
2295
2296                 nr_pages -= lvl_pages;
2297                 iov_pfn += lvl_pages;
2298                 phys_pfn += lvl_pages;
2299                 pteval += lvl_pages * VTD_PAGE_SIZE;
2300                 sg_res -= lvl_pages;
2301
2302                 /* If the next PTE would be the first in a new page, then we
2303                    need to flush the cache on the entries we've just written.
2304                    And then we'll need to recalculate 'pte', so clear it and
2305                    let it get set again in the if (!pte) block above.
2306
2307                    If we're done (!nr_pages) we need to flush the cache too.
2308
2309                    Also if we've been setting superpages, we may need to
2310                    recalculate 'pte' and switch back to smaller pages for the
2311                    end of the mapping, if the trailing size is not enough to
2312                    use another superpage (i.e. sg_res < lvl_pages). */
2313                 pte++;
2314                 if (!nr_pages || first_pte_in_page(pte) ||
2315                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2316                         domain_flush_cache(domain, first_pte,
2317                                            (void *)pte - (void *)first_pte);
2318                         pte = NULL;
2319                 }
2320
2321                 if (!sg_res && nr_pages)
2322                         sg = sg_next(sg);
2323         }
2324         return 0;
2325 }
2326
2327 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2328                           struct scatterlist *sg, unsigned long phys_pfn,
2329                           unsigned long nr_pages, int prot)
2330 {
2331         int iommu_id, ret;
2332         struct intel_iommu *iommu;
2333
2334         /* Do the real mapping first */
2335         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2336         if (ret)
2337                 return ret;
2338
2339         for_each_domain_iommu(iommu_id, domain) {
2340                 iommu = g_iommus[iommu_id];
2341                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2342         }
2343
2344         return 0;
2345 }
2346
2347 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2348                                     struct scatterlist *sg, unsigned long nr_pages,
2349                                     int prot)
2350 {
2351         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2352 }
2353
2354 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2355                                      unsigned long phys_pfn, unsigned long nr_pages,
2356                                      int prot)
2357 {
2358         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2359 }
2360
2361 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2362 {
2363         unsigned long flags;
2364         struct context_entry *context;
2365         u16 did_old;
2366
2367         if (!iommu)
2368                 return;
2369
2370         spin_lock_irqsave(&iommu->lock, flags);
2371         context = iommu_context_addr(iommu, bus, devfn, 0);
2372         if (!context) {
2373                 spin_unlock_irqrestore(&iommu->lock, flags);
2374                 return;
2375         }
2376         did_old = context_domain_id(context);
2377         context_clear_entry(context);
2378         __iommu_flush_cache(iommu, context, sizeof(*context));
2379         spin_unlock_irqrestore(&iommu->lock, flags);
2380         iommu->flush.flush_context(iommu,
2381                                    did_old,
2382                                    (((u16)bus) << 8) | devfn,
2383                                    DMA_CCMD_MASK_NOBIT,
2384                                    DMA_CCMD_DEVICE_INVL);
2385         iommu->flush.flush_iotlb(iommu,
2386                                  did_old,
2387                                  0,
2388                                  0,
2389                                  DMA_TLB_DSI_FLUSH);
2390 }
2391
2392 static inline void unlink_domain_info(struct device_domain_info *info)
2393 {
2394         assert_spin_locked(&device_domain_lock);
2395         list_del(&info->link);
2396         list_del(&info->global);
2397         if (info->dev)
2398                 info->dev->archdata.iommu = NULL;
2399 }
2400
2401 static void domain_remove_dev_info(struct dmar_domain *domain)
2402 {
2403         struct device_domain_info *info, *tmp;
2404         unsigned long flags;
2405
2406         spin_lock_irqsave(&device_domain_lock, flags);
2407         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2408                 __dmar_remove_one_dev_info(info);
2409         spin_unlock_irqrestore(&device_domain_lock, flags);
2410 }
2411
2412 /*
2413  * find_domain
2414  * Note: we use struct device->archdata.iommu to store the info
2415  */
2416 static struct dmar_domain *find_domain(struct device *dev)
2417 {
2418         struct device_domain_info *info;
2419
2420         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2421                 struct iommu_domain *domain;
2422
2423                 dev->archdata.iommu = NULL;
2424                 domain = iommu_get_domain_for_dev(dev);
2425                 if (domain)
2426                         intel_iommu_attach_device(domain, dev);
2427         }
2428
2429         /* No lock here; assumes no domain exit in the normal case */
2430         info = dev->archdata.iommu;
2431
2432         if (likely(info))
2433                 return info->domain;
2434         return NULL;
2435 }
2436
2437 static inline struct device_domain_info *
2438 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2439 {
2440         struct device_domain_info *info;
2441
2442         list_for_each_entry(info, &device_domain_list, global)
2443                 if (info->iommu->segment == segment && info->bus == bus &&
2444                     info->devfn == devfn)
2445                         return info;
2446
2447         return NULL;
2448 }
2449
2450 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2451                                                     int bus, int devfn,
2452                                                     struct device *dev,
2453                                                     struct dmar_domain *domain)
2454 {
2455         struct dmar_domain *found = NULL;
2456         struct device_domain_info *info;
2457         unsigned long flags;
2458         int ret;
2459
2460         info = alloc_devinfo_mem();
2461         if (!info)
2462                 return NULL;
2463
2464         info->bus = bus;
2465         info->devfn = devfn;
2466         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2467         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2468         info->ats_qdep = 0;
2469         info->dev = dev;
2470         info->domain = domain;
2471         info->iommu = iommu;
2472         info->pasid_table = NULL;
2473         info->auxd_enabled = 0;
2474         INIT_LIST_HEAD(&info->auxiliary_domains);
2475
2476         if (dev && dev_is_pci(dev)) {
2477                 struct pci_dev *pdev = to_pci_dev(info->dev);
2478
2479                 if (!pdev->untrusted &&
2480                     !pci_ats_disabled() &&
2481                     ecap_dev_iotlb_support(iommu->ecap) &&
2482                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2483                     dmar_find_matched_atsr_unit(pdev))
2484                         info->ats_supported = 1;
2485
2486                 if (sm_supported(iommu)) {
2487                         if (pasid_supported(iommu)) {
2488                                 int features = pci_pasid_features(pdev);
2489                                 if (features >= 0)
2490                                         info->pasid_supported = features | 1;
2491                         }
2492
2493                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2494                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2495                                 info->pri_supported = 1;
2496                 }
2497         }
2498
2499         spin_lock_irqsave(&device_domain_lock, flags);
2500         if (dev)
2501                 found = find_domain(dev);
2502
2503         if (!found) {
2504                 struct device_domain_info *info2;
2505                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2506                 if (info2) {
2507                         found      = info2->domain;
2508                         info2->dev = dev;
2509                 }
2510         }
2511
2512         if (found) {
2513                 spin_unlock_irqrestore(&device_domain_lock, flags);
2514                 free_devinfo_mem(info);
2515                 /* Caller must free the original domain */
2516                 return found;
2517         }
2518
2519         spin_lock(&iommu->lock);
2520         ret = domain_attach_iommu(domain, iommu);
2521         spin_unlock(&iommu->lock);
2522
2523         if (ret) {
2524                 spin_unlock_irqrestore(&device_domain_lock, flags);
2525                 free_devinfo_mem(info);
2526                 return NULL;
2527         }
2528
2529         list_add(&info->link, &domain->devices);
2530         list_add(&info->global, &device_domain_list);
2531         if (dev)
2532                 dev->archdata.iommu = info;
2533         spin_unlock_irqrestore(&device_domain_lock, flags);
2534
2535         /* PASID table is mandatory for a PCI device in scalable mode. */
2536         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2537                 ret = intel_pasid_alloc_table(dev);
2538                 if (ret) {
2539                         dev_err(dev, "PASID table allocation failed\n");
2540                         dmar_remove_one_dev_info(dev);
2541                         return NULL;
2542                 }
2543
2544                 /* Setup the PASID entry for requests without PASID: */
2545                 spin_lock(&iommu->lock);
2546                 if (hw_pass_through && domain_type_is_si(domain))
2547                         ret = intel_pasid_setup_pass_through(iommu, domain,
2548                                         dev, PASID_RID2PASID);
2549                 else
2550                         ret = intel_pasid_setup_second_level(iommu, domain,
2551                                         dev, PASID_RID2PASID);
2552                 spin_unlock(&iommu->lock);
2553                 if (ret) {
2554                         dev_err(dev, "Setup RID2PASID failed\n");
2555                         dmar_remove_one_dev_info(dev);
2556                         return NULL;
2557                 }
2558         }
2559
2560         if (dev && domain_context_mapping(domain, dev)) {
2561                 dev_err(dev, "Domain context map failed\n");
2562                 dmar_remove_one_dev_info(dev);
2563                 return NULL;
2564         }
2565
2566         return domain;
2567 }
2568
2569 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2570 {
2571         *(u16 *)opaque = alias;
2572         return 0;
2573 }
2574
2575 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2576 {
2577         struct device_domain_info *info;
2578         struct dmar_domain *domain = NULL;
2579         struct intel_iommu *iommu;
2580         u16 dma_alias;
2581         unsigned long flags;
2582         u8 bus, devfn;
2583
2584         iommu = device_to_iommu(dev, &bus, &devfn);
2585         if (!iommu)
2586                 return NULL;
2587
2588         if (dev_is_pci(dev)) {
2589                 struct pci_dev *pdev = to_pci_dev(dev);
2590
2591                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2592
2593                 spin_lock_irqsave(&device_domain_lock, flags);
2594                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2595                                                       PCI_BUS_NUM(dma_alias),
2596                                                       dma_alias & 0xff);
2597                 if (info) {
2598                         iommu = info->iommu;
2599                         domain = info->domain;
2600                 }
2601                 spin_unlock_irqrestore(&device_domain_lock, flags);
2602
2603                 /* DMA alias already has a domain, use it */
2604                 if (info)
2605                         goto out;
2606         }
2607
2608         /* Allocate and initialize new domain for the device */
2609         domain = alloc_domain(0);
2610         if (!domain)
2611                 return NULL;
2612         if (domain_init(domain, iommu, gaw)) {
2613                 domain_exit(domain);
2614                 return NULL;
2615         }
2616
2617 out:
2618         return domain;
2619 }
2620
2621 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2622                                               struct dmar_domain *domain)
2623 {
2624         struct intel_iommu *iommu;
2625         struct dmar_domain *tmp;
2626         u16 req_id, dma_alias;
2627         u8 bus, devfn;
2628
2629         iommu = device_to_iommu(dev, &bus, &devfn);
2630         if (!iommu)
2631                 return NULL;
2632
2633         req_id = ((u16)bus << 8) | devfn;
2634
2635         if (dev_is_pci(dev)) {
2636                 struct pci_dev *pdev = to_pci_dev(dev);
2637
2638                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2639
2640                 /* register PCI DMA alias device */
2641                 if (req_id != dma_alias) {
2642                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2643                                         dma_alias & 0xff, NULL, domain);
2644
2645                         if (!tmp || tmp != domain)
2646                                 return tmp;
2647                 }
2648         }
2649
2650         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2651         if (!tmp || tmp != domain)
2652                 return tmp;
2653
2654         return domain;
2655 }
2656
2657 static int iommu_domain_identity_map(struct dmar_domain *domain,
2658                                      unsigned long long start,
2659                                      unsigned long long end)
2660 {
2661         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2662         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2663
2664         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2665                           dma_to_mm_pfn(last_vpfn))) {
2666                 pr_err("Reserving iova failed\n");
2667                 return -ENOMEM;
2668         }
2669
2670         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2671         /*
2672          * The RMRR range might overlap with the physical memory range,
2673          * so clear it first.
2674          */
2675         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2676
2677         return __domain_mapping(domain, first_vpfn, NULL,
2678                                 first_vpfn, last_vpfn - first_vpfn + 1,
2679                                 DMA_PTE_READ|DMA_PTE_WRITE);
2680 }
2681
2682 static int domain_prepare_identity_map(struct device *dev,
2683                                        struct dmar_domain *domain,
2684                                        unsigned long long start,
2685                                        unsigned long long end)
2686 {
2687         /* For _hardware_ passthrough, don't bother. But for software
2688            passthrough, we do it anyway -- it may indicate a memory
2689            range which is reserved in E820 and therefore didn't get set
2690            up to start with in si_domain */
2691         if (domain == si_domain && hw_pass_through) {
2692                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2693                          start, end);
2694                 return 0;
2695         }
2696
2697         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2698
2699         if (end < start) {
2700                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2701                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2702                         dmi_get_system_info(DMI_BIOS_VENDOR),
2703                         dmi_get_system_info(DMI_BIOS_VERSION),
2704                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2705                 return -EIO;
2706         }
2707
2708         if (end >> agaw_to_width(domain->agaw)) {
2709                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2710                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2711                      agaw_to_width(domain->agaw),
2712                      dmi_get_system_info(DMI_BIOS_VENDOR),
2713                      dmi_get_system_info(DMI_BIOS_VERSION),
2714                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2715                 return -EIO;
2716         }
2717
2718         return iommu_domain_identity_map(domain, start, end);
2719 }
2720
2721 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2722
2723 static int __init si_domain_init(int hw)
2724 {
2725         struct dmar_rmrr_unit *rmrr;
2726         struct device *dev;
2727         int i, nid, ret;
2728
2729         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2730         if (!si_domain)
2731                 return -EFAULT;
2732
2733         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2734                 domain_exit(si_domain);
2735                 return -EFAULT;
2736         }
2737
2738         if (hw)
2739                 return 0;
2740
2741         for_each_online_node(nid) {
2742                 unsigned long start_pfn, end_pfn;
2743                 int i;
2744
2745                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2746                         ret = iommu_domain_identity_map(si_domain,
2747                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2748                         if (ret)
2749                                 return ret;
2750                 }
2751         }
2752
2753         /*
2754          * Normally we use DMA domains for devices which have RMRRs. But we
2755          * lose this requirement for graphics and USB devices. Identity map
2756          * the RMRRs for graphics and USB devices so that they can use the
2757          * si_domain.
2758          */
2759         for_each_rmrr_units(rmrr) {
2760                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2761                                           i, dev) {
2762                         unsigned long long start = rmrr->base_address;
2763                         unsigned long long end = rmrr->end_address;
2764
2765                         if (device_is_rmrr_locked(dev))
2766                                 continue;
2767
2768                         if (WARN_ON(end < start ||
2769                                     end >> agaw_to_width(si_domain->agaw)))
2770                                 continue;
2771
2772                         ret = iommu_domain_identity_map(si_domain, start, end);
2773                         if (ret)
2774                                 return ret;
2775                 }
2776         }
2777
2778         return 0;
2779 }
2780
2781 static int identity_mapping(struct device *dev)
2782 {
2783         struct device_domain_info *info;
2784
2785         info = dev->archdata.iommu;
2786         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2787                 return (info->domain == si_domain);
2788
2789         return 0;
2790 }
2791
2792 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2793 {
2794         struct dmar_domain *ndomain;
2795         struct intel_iommu *iommu;
2796         u8 bus, devfn;
2797
2798         iommu = device_to_iommu(dev, &bus, &devfn);
2799         if (!iommu)
2800                 return -ENODEV;
2801
2802         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2803         if (ndomain != domain)
2804                 return -EBUSY;
2805
2806         return 0;
2807 }
2808
2809 static bool device_has_rmrr(struct device *dev)
2810 {
2811         struct dmar_rmrr_unit *rmrr;
2812         struct device *tmp;
2813         int i;
2814
2815         rcu_read_lock();
2816         for_each_rmrr_units(rmrr) {
2817                 /*
2818                  * Return TRUE if this RMRR contains the device that
2819                  * is passed in.
2820                  */
2821                 for_each_active_dev_scope(rmrr->devices,
2822                                           rmrr->devices_cnt, i, tmp)
2823                         if (tmp == dev ||
2824                             is_downstream_to_pci_bridge(dev, tmp)) {
2825                                 rcu_read_unlock();
2826                                 return true;
2827                         }
2828         }
2829         rcu_read_unlock();
2830         return false;
2831 }
2832
2833 /**
2834  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2835  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2836  * @dev: device handle
2837  *
2838  * We assume that PCI USB devices with RMRRs have them largely
2839  * for historical reasons and that the RMRR space is not actively used post
2840  * boot.  This exclusion may change if vendors begin to abuse it.
2841  *
2842  * The same exception is made for graphics devices, with the requirement that
2843  * any use of the RMRR regions will be torn down before assigning the device
2844  * to a guest.
2845  *
2846  * Return: true if the RMRR is relaxable, false otherwise
2847  */
2848 static bool device_rmrr_is_relaxable(struct device *dev)
2849 {
2850         struct pci_dev *pdev;
2851
2852         if (!dev_is_pci(dev))
2853                 return false;
2854
2855         pdev = to_pci_dev(dev);
2856         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2857                 return true;
2858         else
2859                 return false;
2860 }
2861
2862 /*
2863  * There are a couple of cases where we need to restrict the functionality of
2864  * devices associated with RMRRs.  The first is when evaluating a device for
2865  * identity mapping because problems exist when devices are moved in and out
2866  * of domains and their respective RMRR information is lost.  This means that
2867  * a device with associated RMRRs will never be in a "passthrough" domain.
2868  * The second is use of the device through the IOMMU API.  This interface
2869  * expects to have full control of the IOVA space for the device.  We cannot
2870  * satisfy both the requirement that RMRR access is maintained and have an
2871  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2872  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2873  * We therefore prevent devices associated with an RMRR from participating in
2874  * the IOMMU API, which eliminates them from device assignment.
2875  *
2876  * In both cases, devices which have relaxable RMRRs are not concerned by this
2877  * restriction. See device_rmrr_is_relaxable comment.
2878  */
2879 static bool device_is_rmrr_locked(struct device *dev)
2880 {
2881         if (!device_has_rmrr(dev))
2882                 return false;
2883
2884         if (device_rmrr_is_relaxable(dev))
2885                 return false;
2886
2887         return true;
2888 }
2889
2890 /*
2891  * Return the required default domain type for a specific device.
2892  *
2893  * @dev: the device in query
2895  *
2896  * Returns:
2897  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2898  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2899  *  - 0: both identity and dynamic domains work for this device
2900  */
2901 static int device_def_domain_type(struct device *dev)
2902 {
2903         if (dev_is_pci(dev)) {
2904                 struct pci_dev *pdev = to_pci_dev(dev);
2905
2906                 if (device_is_rmrr_locked(dev))
2907                         return IOMMU_DOMAIN_DMA;
2908
2909                 /*
2910                  * Prevent any device marked as untrusted from getting
2911                  * placed into the static identity mapping domain.
2912                  */
2913                 if (pdev->untrusted)
2914                         return IOMMU_DOMAIN_DMA;
2915
2916                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2917                         return IOMMU_DOMAIN_IDENTITY;
2918
2919                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2920                         return IOMMU_DOMAIN_IDENTITY;
2921
2922                 /*
2923                  * We want to start off with all devices in the 1:1 domain, and
2924                  * take them out later if we find they can't access all of memory.
2925                  *
2926                  * However, we can't do this for PCI devices behind bridges,
2927                  * because all PCI devices behind the same bridge will end up
2928                  * with the same source-id on their transactions.
2929                  *
2930                  * Practically speaking, we can't change things around for these
2931                  * devices at run-time, because we can't be sure there'll be no
2932                  * DMA transactions in flight for any of their siblings.
2933                  *
2934                  * So PCI devices (unless they're on the root bus) as well as
2935                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2936                  * the 1:1 domain, just in _case_ one of their siblings turns out
2937                  * not to be able to map all of memory.
2938                  */
2939                 if (!pci_is_pcie(pdev)) {
2940                         if (!pci_is_root_bus(pdev->bus))
2941                                 return IOMMU_DOMAIN_DMA;
2942                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2943                                 return IOMMU_DOMAIN_DMA;
2944                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2945                         return IOMMU_DOMAIN_DMA;
2946         } else {
2947                 if (device_has_rmrr(dev))
2948                         return IOMMU_DOMAIN_DMA;
2949         }
2950
2951         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2952                         IOMMU_DOMAIN_IDENTITY : 0;
2953 }
2954
2955 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2956 {
2957         /*
2958          * Start from a sane IOMMU hardware state.
2959          * If queued invalidation was already initialized by us
2960          * (for example, while enabling interrupt remapping) then
2961          * things are already rolling from a sane state.
2962          */
2963         if (!iommu->qi) {
2964                 /*
2965                  * Clear any previous faults.
2966                  */
2967                 dmar_fault(-1, iommu);
2968                 /*
2969                  * Disable queued invalidation if supported and already enabled
2970                  * before OS handover.
2971                  */
2972                 dmar_disable_qi(iommu);
2973         }
2974
2975         if (dmar_enable_qi(iommu)) {
2976                 /*
2977                  * Queued invalidation could not be enabled, use register-based invalidation
2978                  */
2979                 iommu->flush.flush_context = __iommu_flush_context;
2980                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2981                 pr_info("%s: Using Register based invalidation\n",
2982                         iommu->name);
2983         } else {
2984                 iommu->flush.flush_context = qi_flush_context;
2985                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2986                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2987         }
2988 }
2989
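/*
 * Copy the context table(s) referenced by one old root entry into freshly
 * allocated pages: reserve the domain IDs found in present entries, clear
 * PASID enable, mark every entry as copied, and store the new table(s) in
 * @tbl so copy_translation_tables() can hook them into the new root table.
 */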
2990 static int copy_context_table(struct intel_iommu *iommu,
2991                               struct root_entry *old_re,
2992                               struct context_entry **tbl,
2993                               int bus, bool ext)
2994 {
2995         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2996         struct context_entry *new_ce = NULL, ce;
2997         struct context_entry *old_ce = NULL;
2998         struct root_entry re;
2999         phys_addr_t old_ce_phys;
3000
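        /*
         * With the extended root-entry format each bus owns two context
         * tables (one for devfn 0x00-0x7f, one for 0x80-0xff), so it
         * takes two slots in @tbl; e.g. bus 3 uses slots 6 and 7.
         */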
3001         tbl_idx = ext ? bus * 2 : bus;
3002         memcpy(&re, old_re, sizeof(re));
3003
3004         for (devfn = 0; devfn < 256; devfn++) {
3005                 /* First calculate the correct index */
3006                 idx = (ext ? devfn * 2 : devfn) % 256;
3007
3008                 if (idx == 0) {
3009                         /* First save what we may have and clean up */
3010                         if (new_ce) {
3011                                 tbl[tbl_idx] = new_ce;
3012                                 __iommu_flush_cache(iommu, new_ce,
3013                                                     VTD_PAGE_SIZE);
3014                                 pos = 1;
3015                         }
3016
3017                         if (old_ce)
3018                                 memunmap(old_ce);
3019
3020                         ret = 0;
3021                         if (devfn < 0x80)
3022                                 old_ce_phys = root_entry_lctp(&re);
3023                         else
3024                                 old_ce_phys = root_entry_uctp(&re);
3025
3026                         if (!old_ce_phys) {
3027                                 if (ext && devfn == 0) {
3028                                         /* No LCTP, try UCTP */
3029                                         devfn = 0x7f;
3030                                         continue;
3031                                 } else {
3032                                         goto out;
3033                                 }
3034                         }
3035
3036                         ret = -ENOMEM;
3037                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3038                                         MEMREMAP_WB);
3039                         if (!old_ce)
3040                                 goto out;
3041
3042                         new_ce = alloc_pgtable_page(iommu->node);
3043                         if (!new_ce)
3044                                 goto out_unmap;
3045
3046                         ret = 0;
3047                 }
3048
3049                 /* Now copy the context entry */
3050                 memcpy(&ce, old_ce + idx, sizeof(ce));
3051
3052                 if (!__context_present(&ce))
3053                         continue;
3054
3055                 did = context_domain_id(&ce);
3056                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3057                         set_bit(did, iommu->domain_ids);
3058
3059                 /*
3060                  * We need a marker for copied context entries. This
3061                  * marker needs to work for the old format as well as
3062                  * for extended context entries.
3063                  *
3064                  * Bit 67 of the context entry is used. In the old
3065                  * format this bit is available to software, in the
3066                  * extended format it is the PGE bit, but PGE is ignored
3067                  * by HW if PASIDs are disabled (and thus still
3068                  * available).
3069                  *
3070                  * So disable PASIDs first and then mark the entry
3071                  * copied. This means that we don't copy PASID
3072                  * translations from the old kernel, but this is fine as
3073                  * faults there are not fatal.
3074                  */
3075                 context_clear_pasid_enable(&ce);
3076                 context_set_copied(&ce);
3077
3078                 new_ce[idx] = ce;
3079         }
3080
3081         tbl[tbl_idx + pos] = new_ce;
3082
3083         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3084
3085 out_unmap:
3086         memunmap(old_ce);
3087
3088 out:
3089         return ret;
3090 }
3091
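/*
 * Copy the DMA translation structures left behind by the previous kernel
 * (kdump case): read the old root table address from DMAR_RTADDR_REG,
 * copy the context tables for every bus and then install them in this
 * kernel's root_entry table under iommu->lock.
 */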
3092 static int copy_translation_tables(struct intel_iommu *iommu)
3093 {
3094         struct context_entry **ctxt_tbls;
3095         struct root_entry *old_rt;
3096         phys_addr_t old_rt_phys;
3097         int ctxt_table_entries;
3098         unsigned long flags;
3099         u64 rtaddr_reg;
3100         int bus, ret;
3101         bool new_ext, ext;
3102
3103         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3104         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3105         new_ext    = !!ecap_ecs(iommu->ecap);
3106
3107         /*
3108          * The RTT bit can only be changed when translation is disabled,
3109          * but disabling translation would open a window for data
3110          * corruption. So bail out and don't copy anything if we would
3111          * have to change the bit.
3112          */
3113         if (new_ext != ext)
3114                 return -EINVAL;
3115
3116         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3117         if (!old_rt_phys)
3118                 return -EINVAL;
3119
3120         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3121         if (!old_rt)
3122                 return -ENOMEM;
3123
3124         /* This is too big for the stack - allocate it from slab */
3125         ctxt_table_entries = ext ? 512 : 256;
3126         ret = -ENOMEM;
3127         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3128         if (!ctxt_tbls)
3129                 goto out_unmap;
3130
3131         for (bus = 0; bus < 256; bus++) {
3132                 ret = copy_context_table(iommu, &old_rt[bus],
3133                                          ctxt_tbls, bus, ext);
3134                 if (ret) {
3135                         pr_err("%s: Failed to copy context table for bus %d\n",
3136                                 iommu->name, bus);
3137                         continue;
3138                 }
3139         }
3140
3141         spin_lock_irqsave(&iommu->lock, flags);
3142
3143         /* Context tables are copied, now write them to the root_entry table */
3144         for (bus = 0; bus < 256; bus++) {
3145                 int idx = ext ? bus * 2 : bus;
3146                 u64 val;
3147
3148                 if (ctxt_tbls[idx]) {
3149                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3150                         iommu->root_entry[bus].lo = val;
3151                 }
3152
3153                 if (!ext || !ctxt_tbls[idx + 1])
3154                         continue;
3155
3156                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3157                 iommu->root_entry[bus].hi = val;
3158         }
3159
3160         spin_unlock_irqrestore(&iommu->lock, flags);
3161
3162         kfree(ctxt_tbls);
3163
3164         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3165
3166         ret = 0;
3167
3168 out_unmap:
3169         memunmap(old_rt);
3170
3171         return ret;
3172 }
3173
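/*
 * Global initialization of all DMAR units: allocate the g_iommus array,
 * set up queued invalidation, domain IDs and root entries per IOMMU,
 * copy over translation tables if a previous kernel left translation
 * enabled, program the root entries, and finally set up fault reporting
 * (and the SVM page request queue where supported).
 */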
3174 static int __init init_dmars(void)
3175 {
3176         struct dmar_drhd_unit *drhd;
3177         struct intel_iommu *iommu;
3178         int ret;
3179
3180         /*
3181          * for each drhd
3182          *    allocate root
3183          *    initialize and program root entry to not present
3184          * endfor
3185          */
3186         for_each_drhd_unit(drhd) {
3187                 /*
3188                  * Lock not needed, as this is only incremented in the
3189                  * single-threaded kernel __init code path; all other
3190                  * accesses are read-only.
3191                  */
3192                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3193                         g_num_of_iommus++;
3194                         continue;
3195                 }
3196                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3197         }
3198
3199         /* Preallocate enough resources for IOMMU hot-addition */
3200         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3201                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3202
3203         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3204                         GFP_KERNEL);
3205         if (!g_iommus) {
3206                 pr_err("Allocating global iommu array failed\n");
3207                 ret = -ENOMEM;
3208                 goto error;
3209         }
3210
3211         for_each_iommu(iommu, drhd) {
3212                 if (drhd->ignored) {
3213                         iommu_disable_translation(iommu);
3214                         continue;
3215                 }
3216
3217                 /*
3218                  * Find the max PASID size of all IOMMUs in the system.
3219                  * We need to ensure the system PASID table is no bigger
3220                  * than the smallest supported size.
3221                  */
3222                 if (pasid_supported(iommu)) {
3223                         u32 temp = 2 << ecap_pss(iommu->ecap);
3224
3225                         intel_pasid_max_id = min_t(u32, temp,
3226                                                    intel_pasid_max_id);
3227                 }
3228
3229                 g_iommus[iommu->seq_id] = iommu;
3230
3231                 intel_iommu_init_qi(iommu);
3232
3233                 ret = iommu_init_domains(iommu);
3234                 if (ret)
3235                         goto free_iommu;
3236
3237                 init_translation_status(iommu);
3238
3239                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240                         iommu_disable_translation(iommu);
3241                         clear_translation_pre_enabled(iommu);
3242                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3243                                 iommu->name);
3244                 }
3245
3246                 /*
3247                  * TBD:
3248                  * we could share the same root & context tables
3249                  * among all IOMMUs. Needs to be split out later.
3250                  */
3251                 ret = iommu_alloc_root_entry(iommu);
3252                 if (ret)
3253                         goto free_iommu;
3254
3255                 if (translation_pre_enabled(iommu)) {
3256                         pr_info("Translation already enabled - trying to copy translation structures\n");
3257
3258                         ret = copy_translation_tables(iommu);
3259                         if (ret) {
3260                                 /*
3261                                  * We found the IOMMU with translation
3262                                  * enabled - but failed to copy over the
3263                                  * old root-entry table. Try to proceed
3264                                  * by disabling translation now and
3265                                  * allocating a clean root-entry table.
3266                                  * This might cause DMAR faults, but
3267                                  * probably the dump will still succeed.
3268                                  */
3269                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3270                                        iommu->name);
3271                                 iommu_disable_translation(iommu);
3272                                 clear_translation_pre_enabled(iommu);
3273                         } else {
3274                                 pr_info("Copied translation tables from previous kernel for %s\n",
3275                                         iommu->name);
3276                         }
3277                 }
3278
3279                 if (!ecap_pass_through(iommu->ecap))
3280                         hw_pass_through = 0;
3281 #ifdef CONFIG_INTEL_IOMMU_SVM
3282                 if (pasid_supported(iommu))
3283                         intel_svm_init(iommu);
3284 #endif
3285         }
3286
3287         /*
3288          * Now that qi is enabled on all iommus, set the root entry and flush
3289          * caches. This is required on some Intel X58 chipsets, otherwise the
3290          * flush_context function will loop forever and the boot hangs.
3291          */
3292         for_each_active_iommu(iommu, drhd) {
3293                 iommu_flush_write_buffer(iommu);
3294                 iommu_set_root_entry(iommu);
3295                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3296                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3297         }
3298
3299         if (iommu_pass_through)
3300                 iommu_identity_mapping |= IDENTMAP_ALL;
3301
3302 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3303         dmar_map_gfx = 0;
3304 #endif
3305
3306         if (!dmar_map_gfx)
3307                 iommu_identity_mapping |= IDENTMAP_GFX;
3308
3309         check_tylersburg_isoch();
3310
3311         ret = si_domain_init(hw_pass_through);
3312         if (ret)
3313                 goto free_iommu;
3314
3315         /*
3316          * for each drhd
3317          *   enable fault log
3318          *   global invalidate context cache
3319          *   global invalidate iotlb
3320          *   enable translation
3321          */
3322         for_each_iommu(iommu, drhd) {
3323                 if (drhd->ignored) {
3324                         /*
3325                          * we always have to disable PMRs or DMA may fail on
3326                          * this device
3327                          */
3328                         if (force_on)
3329                                 iommu_disable_protect_mem_regions(iommu);
3330                         continue;
3331                 }
3332
3333                 iommu_flush_write_buffer(iommu);
3334
3335 #ifdef CONFIG_INTEL_IOMMU_SVM
3336                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3337                         /*
3338                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3339                          * held could cause a lock race condition.
3340                          */
3341                         up_write(&dmar_global_lock);
3342                         ret = intel_svm_enable_prq(iommu);
3343                         down_write(&dmar_global_lock);
3344                         if (ret)
3345                                 goto free_iommu;
3346                 }
3347 #endif
3348                 ret = dmar_set_interrupt(iommu);
3349                 if (ret)
3350                         goto free_iommu;
3351         }
3352
3353         return 0;
3354
3355 free_iommu:
3356         for_each_active_iommu(iommu, drhd) {
3357                 disable_dmar_iommu(iommu);
3358                 free_dmar_iommu(iommu);
3359         }
3360
3361         kfree(g_iommus);
3362
3363 error:
3364         return ret;
3365 }
3366
3367 /* This takes a number of _MM_ pages, not VTD pages */
3368 static unsigned long intel_alloc_iova(struct device *dev,
3369                                      struct dmar_domain *domain,
3370                                      unsigned long nrpages, uint64_t dma_mask)
3371 {
3372         unsigned long iova_pfn;
3373
3374         /* Restrict dma_mask to the width that the iommu can handle */
3375         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3376         /* Ensure we reserve the whole size-aligned region */
3377         nrpages = __roundup_pow_of_two(nrpages);
3378
3379         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3380                 /*
3381                  * First try to allocate an IO virtual address within
3382                  * DMA_BIT_MASK(32); if that fails, then try allocating
3383                  * from the higher range.
3384                  */
3385                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3386                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3387                 if (iova_pfn)
3388                         return iova_pfn;
3389         }
3390         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3391                                    IOVA_PFN(dma_mask), true);
3392         if (unlikely(!iova_pfn)) {
3393                 dev_err(dev, "Allocating %lu-page iova failed\n", nrpages);
3394                 return 0;
3395         }
3396
3397         return iova_pfn;
3398 }
3399
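/*
 * Allocate a private DMA domain for a device that is not attached to any
 * domain yet, and pre-map any RMRR regions that reference the device.
 */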
3400 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3401 {
3402         struct dmar_domain *domain, *tmp;
3403         struct dmar_rmrr_unit *rmrr;
3404         struct device *i_dev;
3405         int i, ret;
3406
3407         /* Device shouldn't be attached to any domain yet. */
3408         domain = find_domain(dev);
3409         if (domain)
3410                 return NULL;
3411
3412         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3413         if (!domain)
3414                 goto out;
3415
3416         /* We have a new domain - set up possible RMRRs for the device */
3417         rcu_read_lock();
3418         for_each_rmrr_units(rmrr) {
3419                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3420                                           i, i_dev) {
3421                         if (i_dev != dev)
3422                                 continue;
3423
3424                         ret = domain_prepare_identity_map(dev, domain,
3425                                                           rmrr->base_address,
3426                                                           rmrr->end_address);
3427                         if (ret)
3428                                 dev_err(dev, "Mapping reserved region failed\n");
3429                 }
3430         }
3431         rcu_read_unlock();
3432
3433         tmp = set_domain_for_dev(dev, domain);
3434         if (!tmp || domain != tmp) {
3435                 domain_exit(domain);
3436                 domain = tmp;
3437         }
3438
3439 out:
3440         if (!domain)
3441                 dev_err(dev, "Allocating domain failed\n");
3442         else
3443                 domain->domain.type = IOMMU_DOMAIN_DMA;
3444
3445         return domain;
3446 }
3447
3448 /* Check if the dev needs to go through the non-identity map and unmap process. */
3449 static bool iommu_need_mapping(struct device *dev)
3450 {
3451         int ret;
3452
3453         if (iommu_dummy(dev))
3454                 return false;
3455
3456         ret = identity_mapping(dev);
3457         if (ret) {
3458                 u64 dma_mask = *dev->dma_mask;
3459
3460                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3461                         dma_mask = dev->coherent_dma_mask;
3462
3463                 if (dma_mask >= dma_get_required_mask(dev))
3464                         return false;
3465
3466                 /*
3467                  * The device is removed from si_domain and falls back to
3468                  * non-identity mapping for 32-bit DMA.
3469                  */
3470                 dmar_remove_one_dev_info(dev);
3471                 ret = iommu_request_dma_domain_for_dev(dev);
3472                 if (ret) {
3473                         struct iommu_domain *domain;
3474                         struct dmar_domain *dmar_domain;
3475
3476                         domain = iommu_get_domain_for_dev(dev);
3477                         if (domain) {
3478                                 dmar_domain = to_dmar_domain(domain);
3479                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3480                         }
3481                         dmar_remove_one_dev_info(dev);
3482                         get_private_domain_for_dev(dev);
3483                 }
3484
3485                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3486         }
3487
3488         return true;
3489 }
3490
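/*
 * Map @size bytes starting at @paddr for @dev: allocate an IOVA range
 * below @dma_mask, set up the page-table entries with protection bits
 * derived from @dir, and return the resulting DMA address, or
 * DMA_MAPPING_ERROR on failure.
 */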
3491 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3492                                      size_t size, int dir, u64 dma_mask)
3493 {
3494         struct dmar_domain *domain;
3495         phys_addr_t start_paddr;
3496         unsigned long iova_pfn;
3497         int prot = 0;
3498         int ret;
3499         struct intel_iommu *iommu;
3500         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3501
3502         BUG_ON(dir == DMA_NONE);
3503
3504         domain = find_domain(dev);
3505         if (!domain)
3506                 return DMA_MAPPING_ERROR;
3507
3508         iommu = domain_get_iommu(domain);
3509         size = aligned_nrpages(paddr, size);
3510
3511         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3512         if (!iova_pfn)
3513                 goto error;
3514
3515         /*
3516          * Check if DMAR supports zero-length reads on write-only
3517          * mappings.
3518          */
3519         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3520                         !cap_zlr(iommu->cap))
3521                 prot |= DMA_PTE_READ;
3522         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3523                 prot |= DMA_PTE_WRITE;
3524         /*
3525          * paddr - (paddr + size) might cover a partial page, so we should
3526          * map the whole page.  Note: if two parts of one page are mapped
3527          * separately, we might have two guest addresses mapping to the same
3528          * host paddr, but this is not a big problem.
3529          */
3530         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3531                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3532         if (ret)
3533                 goto error;
3534
3535         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3536         start_paddr += paddr & ~PAGE_MASK;
3537         return start_paddr;
3538
3539 error:
3540         if (iova_pfn)
3541                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3542         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3543                 size, (unsigned long long)paddr, dir);
3544         return DMA_MAPPING_ERROR;
3545 }
3546
3547 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3548                                  unsigned long offset, size_t size,
3549                                  enum dma_data_direction dir,
3550                                  unsigned long attrs)
3551 {
3552         if (iommu_need_mapping(dev))
3553                 return __intel_map_single(dev, page_to_phys(page) + offset,
3554                                 size, dir, *dev->dma_mask);
3555         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3556 }
3557
3558 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3559                                      size_t size, enum dma_data_direction dir,
3560                                      unsigned long attrs)
3561 {
3562         if (iommu_need_mapping(dev))
3563                 return __intel_map_single(dev, phys_addr, size, dir,
3564                                 *dev->dma_mask);
3565         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3566 }
3567
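/*
 * Tear down the mapping at @dev_addr: unmap the page-table range, then
 * either flush the IOTLB and free the IOVA immediately (strict mode,
 * untrusted devices, or no flush queue available) or defer the release
 * to the IOVA flush queue.
 */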
3568 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3569 {
3570         struct dmar_domain *domain;
3571         unsigned long start_pfn, last_pfn;
3572         unsigned long nrpages;
3573         unsigned long iova_pfn;
3574         struct intel_iommu *iommu;
3575         struct page *freelist;
3576         struct pci_dev *pdev = NULL;
3577
3578         domain = find_domain(dev);
3579         BUG_ON(!domain);
3580
3581         iommu = domain_get_iommu(domain);
3582
3583         iova_pfn = IOVA_PFN(dev_addr);
3584
3585         nrpages = aligned_nrpages(dev_addr, size);
3586         start_pfn = mm_to_dma_pfn(iova_pfn);
3587         last_pfn = start_pfn + nrpages - 1;
3588
3589         if (dev_is_pci(dev))
3590                 pdev = to_pci_dev(dev);
3591
3592         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3593
3594         freelist = domain_unmap(domain, start_pfn, last_pfn);
3595
3596         if (intel_iommu_strict || (pdev && pdev->untrusted) ||
3597                         !has_iova_flush_queue(&domain->iovad)) {
3598                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3599                                       nrpages, !freelist, 0);
3600                 /* free iova */
3601                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3602                 dma_free_pagelist(freelist);
3603         } else {
3604                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3605                            (unsigned long)freelist);
3606                 /*
3607                  * Queue up the release of the unmap to save roughly 1/6th
3608                  * of the CPU time used by the IOTLB flush operation.
3609                  */
3610         }
3611 }
3612
3613 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3614                              size_t size, enum dma_data_direction dir,
3615                              unsigned long attrs)
3616 {
3617         if (iommu_need_mapping(dev))
3618                 intel_unmap(dev, dev_addr, size);
3619         else
3620                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3621 }
3622
3623 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3624                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3625 {
3626         if (iommu_need_mapping(dev))
3627                 intel_unmap(dev, dev_addr, size);
3628 }
3629
3630 static void *intel_alloc_coherent(struct device *dev, size_t size,
3631                                   dma_addr_t *dma_handle, gfp_t flags,
3632                                   unsigned long attrs)
3633 {
3634         struct page *page = NULL;
3635         int order;
3636
3637         if (!iommu_need_mapping(dev))
3638                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3639
3640         size = PAGE_ALIGN(size);
3641         order = get_order(size);
3642
3643         if (gfpflags_allow_blocking(flags)) {
3644                 unsigned int count = size >> PAGE_SHIFT;
3645
3646                 page = dma_alloc_from_contiguous(dev, count, order,
3647                                                  flags & __GFP_NOWARN);
3648         }
3649
3650         if (!page)
3651                 page = alloc_pages(flags, order);
3652         if (!page)
3653                 return NULL;
3654         memset(page_address(page), 0, size);
3655
3656         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3657                                          DMA_BIDIRECTIONAL,
3658                                          dev->coherent_dma_mask);
3659         if (*dma_handle != DMA_MAPPING_ERROR)
3660                 return page_address(page);
3661         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3662                 __free_pages(page, order);
3663
3664         return NULL;
3665 }
3666
3667 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3668                                 dma_addr_t dma_handle, unsigned long attrs)
3669 {
3670         int order;
3671         struct page *page = virt_to_page(vaddr);
3672
3673         if (!iommu_need_mapping(dev))
3674                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3675
3676         size = PAGE_ALIGN(size);
3677         order = get_order(size);
3678
3679         intel_unmap(dev, dma_handle, size);
3680         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3681                 __free_pages(page, order);
3682 }
3683
3684 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3685                            int nelems, enum dma_data_direction dir,
3686                            unsigned long attrs)
3687 {
3688         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3689         unsigned long nrpages = 0;
3690         struct scatterlist *sg;
3691         int i;
3692
3693         if (!iommu_need_mapping(dev))
3694                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3695
3696         for_each_sg(sglist, sg, nelems, i) {
3697                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3698         }
3699
3700         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3701 }
3702
3703 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3704                         enum dma_data_direction dir, unsigned long attrs)
3705 {
3706         int i;
3707         struct dmar_domain *domain;
3708         size_t size = 0;
3709         int prot = 0;
3710         unsigned long iova_pfn;
3711         int ret;
3712         struct scatterlist *sg;
3713         unsigned long start_vpfn;
3714         struct intel_iommu *iommu;
3715
3716         BUG_ON(dir == DMA_NONE);
3717         if (!iommu_need_mapping(dev))
3718                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3719
3720         domain = find_domain(dev);
3721         if (!domain)
3722                 return 0;
3723
3724         iommu = domain_get_iommu(domain);
3725
3726         for_each_sg(sglist, sg, nelems, i)
3727                 size += aligned_nrpages(sg->offset, sg->length);
3728
3729         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3730                                 *dev->dma_mask);
3731         if (!iova_pfn) {
3732                 sglist->dma_length = 0;
3733                 return 0;
3734         }
3735
3736         /*
3737          * Check if DMAR supports zero-length reads on write-only
3738          * mappings.
3739          */
3740         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3741                         !cap_zlr(iommu->cap))
3742                 prot |= DMA_PTE_READ;
3743         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3744                 prot |= DMA_PTE_WRITE;
3745
3746         start_vpfn = mm_to_dma_pfn(iova_pfn);
3747
3748         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3749         if (unlikely(ret)) {
3750                 dma_pte_free_pagetable(domain, start_vpfn,
3751                                        start_vpfn + size - 1,
3752                                        agaw_to_level(domain->agaw) + 1);
3753                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3754                 return 0;
3755         }
3756
3757         return nelems;
3758 }
3759
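/*
 * DMA API callbacks used when VT-d translation is active for a device;
 * the callbacks fall back to the dma-direct path when iommu_need_mapping()
 * reports that the device keeps its identity mapping.
 */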
3760 static const struct dma_map_ops intel_dma_ops = {
3761         .alloc = intel_alloc_coherent,
3762         .free = intel_free_coherent,
3763         .map_sg = intel_map_sg,
3764         .unmap_sg = intel_unmap_sg,
3765         .map_page = intel_map_page,
3766         .unmap_page = intel_unmap_page,
3767         .map_resource = intel_map_resource,
3768         .unmap_resource = intel_unmap_resource,
3769         .dma_supported = dma_direct_supported,
3770 };
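/*
 * A minimal usage sketch through the generic DMA API (assuming a driver
 * with a struct pci_dev *pdev, a kernel buffer @buf and length @len):
 *
 *	dma_addr_t dma;
 *
 *	dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(&pdev->dev, dma))
 *		return -ENOMEM;
 *	...
 *	dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
 *
 * With these ops installed, such calls end up in intel_map_page() and
 * intel_unmap_page() unless the device stays on the direct/identity path.
 */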
3771
3772 static inline int iommu_domain_cache_init(void)
3773 {
3774         int ret = 0;
3775
3776         iommu_domain_cache = kmem_cache_create("iommu_domain",
3777                                          sizeof(struct dmar_domain),
3778                                          0,
3779                                          SLAB_HWCACHE_ALIGN,
3781                                          NULL);
3782         if (!iommu_domain_cache) {
3783                 pr_err("Couldn't create iommu_domain cache\n");
3784                 ret = -ENOMEM;
3785         }
3786
3787         return ret;
3788 }
3789
3790 static inline int iommu_devinfo_cache_init(void)
3791 {
3792         int ret = 0;
3793
3794         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3795                                          sizeof(struct device_domain_info),
3796                                          0,
3797                                          SLAB_HWCACHE_ALIGN,
3798                                          NULL);
3799         if (!iommu_devinfo_cache) {
3800                 pr_err("Couldn't create devinfo cache\n");
3801                 ret = -ENOMEM;
3802         }
3803
3804         return ret;
3805 }
3806
3807 static int __init iommu_init_mempool(void)
3808 {
3809         int ret;
3810         ret = iova_cache_get();
3811         if (ret)
3812                 return ret;
3813
3814         ret = iommu_domain_cache_init();
3815         if (ret)
3816                 goto domain_error;
3817
3818         ret = iommu_devinfo_cache_init();
3819         if (!ret)
3820                 return ret;
3821
3822         kmem_cache_destroy(iommu_domain_cache);
3823 domain_error:
3824         iova_cache_put();
3825
3826         return -ENOMEM;
3827 }
3828
3829 static void __init iommu_exit_mempool(void)
3830 {
3831         kmem_cache_destroy(iommu_devinfo_cache);
3832         kmem_cache_destroy(iommu_domain_cache);
3833         iova_cache_put();
3834 }
3835
3836 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3837 {
3838         struct dmar_drhd_unit *drhd;
3839         u32 vtbar;
3840         int rc;
3841
3842         /* We know that this device on this chipset has its own IOMMU.
3843          * If we find it under a different IOMMU, then the BIOS is lying
3844          * to us. Hope that the IOMMU for this device is actually
3845          * disabled, and it needs no translation...
3846          */
3847         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3848         if (rc) {
3849                 /* "can't" happen */
3850                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3851                 return;
3852         }
3853         vtbar &= 0xffff0000;
3854
3855         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3856         drhd = dmar_find_matched_drhd_unit(pdev);
3857         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3858                             TAINT_FIRMWARE_WORKAROUND,
3859                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3860                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3861 }
3862 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3863
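/*
 * Mark DMAR units that declare no devices as ignored, and, when
 * dmar_map_gfx is disabled, also bypass units whose scope covers only
 * graphics devices by giving those devices the dummy domain info.
 */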
3864 static void __init init_no_remapping_devices(void)
3865 {
3866         struct dmar_drhd_unit *drhd;
3867         struct device *dev;
3868         int i;
3869
3870         for_each_drhd_unit(drhd) {
3871                 if (!drhd->include_all) {
3872                         for_each_active_dev_scope(drhd->devices,
3873                                                   drhd->devices_cnt, i, dev)
3874                                 break;
3875                         /* ignore DMAR unit if no devices exist */
3876                         if (i == drhd->devices_cnt)
3877                                 drhd->ignored = 1;
3878                 }
3879         }
3880
3881         for_each_active_drhd_unit(drhd) {
3882                 if (drhd->include_all)
3883                         continue;
3884
3885                 for_each_active_dev_scope(drhd->devices,
3886                                           drhd->devices_cnt, i, dev)
3887                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3888                                 break;
3889                 if (i < drhd->devices_cnt)
3890                         continue;
3891
3892                 /* This IOMMU has *only* gfx devices. Bypass it entirely
3893                    if graphics mapping is disabled. */
3894                 if (!dmar_map_gfx) {
3895                         drhd->ignored = 1;
3896                         for_each_active_dev_scope(drhd->devices,
3897                                                   drhd->devices_cnt, i, dev)
3898                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3899                 }
3900         }
3901 }
3902
3903 #ifdef CONFIG_SUSPEND
3904 static int init_iommu_hw(void)
3905 {
3906         struct dmar_drhd_unit *drhd;
3907         struct intel_iommu *iommu = NULL;
3908
3909         for_each_active_iommu(iommu, drhd)
3910                 if (iommu->qi)
3911                         dmar_reenable_qi(iommu);
3912
3913         for_each_iommu(iommu, drhd) {
3914                 if (drhd->ignored) {
3915                         /*
3916                          * we always have to disable PMRs or DMA may fail on
3917                          * this device
3918                          */
3919                         if (force_on)
3920                                 iommu_disable_protect_mem_regions(iommu);
3921                         continue;
3922                 }
3923
3924                 iommu_flush_write_buffer(iommu);
3925
3926                 iommu_set_root_entry(iommu);
3927
3928                 iommu->flush.flush_context(iommu, 0, 0, 0,
3929                                            DMA_CCMD_GLOBAL_INVL);
3930                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3931                 iommu_enable_translation(iommu);
3932                 iommu_disable_protect_mem_regions(iommu);
3933         }
3934
3935         return 0;
3936 }
3937
3938 static void iommu_flush_all(void)
3939 {
3940         struct dmar_drhd_unit *drhd;
3941         struct intel_iommu *iommu;
3942
3943         for_each_active_iommu(iommu, drhd) {
3944                 iommu->flush.flush_context(iommu, 0, 0, 0,
3945                                            DMA_CCMD_GLOBAL_INVL);
3946                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3947                                          DMA_TLB_GLOBAL_FLUSH);
3948         }
3949 }
3950
3951 static int iommu_suspend(void)
3952 {
3953         struct dmar_drhd_unit *drhd;
3954         struct intel_iommu *iommu = NULL;
3955         unsigned long flag;
3956
3957         for_each_active_iommu(iommu, drhd) {
3958                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3959                                                  GFP_ATOMIC);
3960                 if (!iommu->iommu_state)
3961                         goto nomem;
3962         }
3963
3964         iommu_flush_all();
3965
3966         for_each_active_iommu(iommu, drhd) {
3967                 iommu_disable_translation(iommu);
3968
3969                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3970
3971                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3972                         readl(iommu->reg + DMAR_FECTL_REG);
3973                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3974                         readl(iommu->reg + DMAR_FEDATA_REG);
3975                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3976                         readl(iommu->reg + DMAR_FEADDR_REG);
3977                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3978                         readl(iommu->reg + DMAR_FEUADDR_REG);
3979
3980                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3981         }
3982         return 0;
3983
3984 nomem:
3985         for_each_active_iommu(iommu, drhd)
3986                 kfree(iommu->iommu_state);
3987
3988         return -ENOMEM;
3989 }
3990
3991 static void iommu_resume(void)
3992 {
3993         struct dmar_drhd_unit *drhd;
3994         struct intel_iommu *iommu = NULL;
3995         unsigned long flag;
3996
3997         if (init_iommu_hw()) {
3998                 if (force_on)
3999                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4000                 else
4001                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4002                 return;
4003         }
4004
4005         for_each_active_iommu(iommu, drhd) {
4006
4007                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4008
4009                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4010                         iommu->reg + DMAR_FECTL_REG);
4011                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4012                         iommu->reg + DMAR_FEDATA_REG);
4013                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4014                         iommu->reg + DMAR_FEADDR_REG);
4015                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4016                         iommu->reg + DMAR_FEUADDR_REG);
4017
4018                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4019         }
4020
4021         for_each_active_iommu(iommu, drhd)
4022                 kfree(iommu->iommu_state);
4023 }
4024
4025 static struct syscore_ops iommu_syscore_ops = {
4026         .resume         = iommu_resume,
4027         .suspend        = iommu_suspend,
4028 };
4029
4030 static void __init init_iommu_pm_ops(void)
4031 {
4032         register_syscore_ops(&iommu_syscore_ops);
4033 }
4034
4035 #else
4036 static inline void init_iommu_pm_ops(void) {}
4037 #endif  /* CONFIG_SUSPEND */
4038
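/*
 * Parse one Reserved Memory Region Reporting (RMRR) structure from the
 * DMAR table, record its address range and device scope, and add it to
 * the dmar_rmrr_units list.
 */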
4039 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4040 {
4041         struct acpi_dmar_reserved_memory *rmrr;
4042         struct dmar_rmrr_unit *rmrru;
4043
4044         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4045         if (!rmrru)
4046                 goto out;
4047
4048         rmrru->hdr = header;
4049         rmrr = (struct acpi_dmar_reserved_memory *)header;
4050         rmrru->base_address = rmrr->base_address;
4051         rmrru->end_address = rmrr->end_address;
4052
4053         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4054                                 ((void *)rmrr) + rmrr->header.length,
4055                                 &rmrru->devices_cnt);
4056         if (rmrru->devices_cnt && rmrru->devices == NULL)
4057                 goto free_rmrru;
4058
4059         list_add(&rmrru->list, &dmar_rmrr_units);
4060
4061         return 0;
4062 free_rmrru:
4063         kfree(rmrru);
4064 out:
4065         return -ENOMEM;
4066 }
4067
4068 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4069 {
4070         struct dmar_atsr_unit *atsru;
4071         struct acpi_dmar_atsr *tmp;
4072
4073         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4074                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4075                 if (atsr->segment != tmp->segment)
4076                         continue;
4077                 if (atsr->header.length != tmp->header.length)
4078                         continue;
4079                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4080                         return atsru;
4081         }
4082
4083         return NULL;
4084 }
4085
4086 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4087 {
4088         struct acpi_dmar_atsr *atsr;
4089         struct dmar_atsr_unit *atsru;
4090
4091         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4092                 return 0;
4093
4094         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4095         atsru = dmar_find_atsr(atsr);
4096         if (atsru)
4097                 return 0;
4098
4099         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4100         if (!atsru)
4101                 return -ENOMEM;
4102
4103         /*
4104          * If memory is allocated from slab by the ACPI _DSM method, we need to
4105          * copy the memory content because the memory buffer will be freed
4106          * on return.
4107          */
4108         atsru->hdr = (void *)(atsru + 1);
4109         memcpy(atsru->hdr, hdr, hdr->length);
4110         atsru->include_all = atsr->flags & 0x1;
4111         if (!atsru->include_all) {
4112                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4113                                 (void *)atsr + atsr->header.length,
4114                                 &atsru->devices_cnt);
4115                 if (atsru->devices_cnt && atsru->devices == NULL) {
4116                         kfree(atsru);
4117                         return -ENOMEM;
4118                 }
4119         }
4120
4121         list_add_rcu(&atsru->list, &dmar_atsr_units);
4122
4123         return 0;
4124 }
4125
4126 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4127 {
4128         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4129         kfree(atsru);
4130 }
4131
4132 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4133 {
4134         struct acpi_dmar_atsr *atsr;
4135         struct dmar_atsr_unit *atsru;
4136
4137         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4138         atsru = dmar_find_atsr(atsr);
4139         if (atsru) {
4140                 list_del_rcu(&atsru->list);
4141                 synchronize_rcu();
4142                 intel_iommu_free_atsr(atsru);
4143         }
4144
4145         return 0;
4146 }
4147
4148 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4149 {
4150         int i;
4151         struct device *dev;
4152         struct acpi_dmar_atsr *atsr;
4153         struct dmar_atsr_unit *atsru;
4154
4155         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4156         atsru = dmar_find_atsr(atsr);
4157         if (!atsru)
4158                 return 0;
4159
4160         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4161                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4162                                           i, dev)
4163                         return -EBUSY;
4164         }
4165
4166         return 0;
4167 }
4168
4169 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4170 {
4171         int sp, ret;
4172         struct intel_iommu *iommu = dmaru->iommu;
4173
4174         if (g_iommus[iommu->seq_id])
4175                 return 0;
4176
4177         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4178                 pr_warn("%s: Doesn't support hardware pass through.\n",
4179                         iommu->name);
4180                 return -ENXIO;
4181         }
4182         if (!ecap_sc_support(iommu->ecap) &&
4183             domain_update_iommu_snooping(iommu)) {
4184                 pr_warn("%s: Doesn't support snooping.\n",
4185                         iommu->name);
4186                 return -ENXIO;
4187         }
4188         sp = domain_update_iommu_superpage(iommu) - 1;
4189         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4190                 pr_warn("%s: Doesn't support large page.\n",
4191                         iommu->name);
4192                 return -ENXIO;
4193         }
4194
4195         /*
4196          * Disable translation if already enabled prior to OS handover.
4197          */
4198         if (iommu->gcmd & DMA_GCMD_TE)
4199                 iommu_disable_translation(iommu);
4200
4201         g_iommus[iommu->seq_id] = iommu;
4202         ret = iommu_init_domains(iommu);
4203         if (ret == 0)
4204                 ret = iommu_alloc_root_entry(iommu);
4205         if (ret)
4206                 goto out;
4207
4208 #ifdef CONFIG_INTEL_IOMMU_SVM
4209         if (pasid_supported(iommu))
4210                 intel_svm_init(iommu);
4211 #endif
4212
4213         if (dmaru->ignored) {
4214                 /*
4215                  * we always have to disable PMRs or DMA may fail on this device
4216                  */
4217                 if (force_on)
4218                         iommu_disable_protect_mem_regions(iommu);
4219                 return 0;
4220         }
4221
4222         intel_iommu_init_qi(iommu);
4223         iommu_flush_write_buffer(iommu);
4224
4225 #ifdef CONFIG_INTEL_IOMMU_SVM
4226         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4227                 ret = intel_svm_enable_prq(iommu);
4228                 if (ret)
4229                         goto disable_iommu;
4230         }
4231 #endif
4232         ret = dmar_set_interrupt(iommu);
4233         if (ret)
4234                 goto disable_iommu;
4235
4236         iommu_set_root_entry(iommu);
4237         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4238         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4239         iommu_enable_translation(iommu);
4240
4241         iommu_disable_protect_mem_regions(iommu);
4242         return 0;
4243
4244 disable_iommu:
4245         disable_dmar_iommu(iommu);
4246 out:
4247         free_dmar_iommu(iommu);
4248         return ret;
4249 }
4250
4251 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4252 {
4253         int ret = 0;
4254         struct intel_iommu *iommu = dmaru->iommu;
4255
4256         if (!intel_iommu_enabled)
4257                 return 0;
4258         if (iommu == NULL)
4259                 return -EINVAL;
4260
4261         if (insert) {
4262                 ret = intel_iommu_add(dmaru);
4263         } else {
4264                 disable_dmar_iommu(iommu);
4265                 free_dmar_iommu(iommu);
4266         }
4267
4268         return ret;
4269 }
4270
4271 static void intel_iommu_free_dmars(void)
4272 {
4273         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4274         struct dmar_atsr_unit *atsru, *atsr_n;
4275
4276         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4277                 list_del(&rmrru->list);
4278                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4279                 kfree(rmrru);
4280         }
4281
4282         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4283                 list_del(&atsru->list);
4284                 intel_iommu_free_atsr(atsru);
4285         }
4286 }
4287
4288 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4289 {
4290         int i, ret = 1;
4291         struct pci_bus *bus;
4292         struct pci_dev *bridge = NULL;
4293         struct device *tmp;
4294         struct acpi_dmar_atsr *atsr;
4295         struct dmar_atsr_unit *atsru;
4296
4297         dev = pci_physfn(dev);
4298         for (bus = dev->bus; bus; bus = bus->parent) {
4299                 bridge = bus->self;
4300                 /* If it's an integrated device, allow ATS */
4301                 if (!bridge)
4302                         return 1;
4303                 /* Connected via non-PCIe: no ATS */
4304                 if (!pci_is_pcie(bridge) ||
4305                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4306                         return 0;
4307                 /* If we found the root port, look it up in the ATSR */
4308                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4309                         break;
4310         }
4311
4312         rcu_read_lock();
4313         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4314                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4315                 if (atsr->segment != pci_domain_nr(dev->bus))
4316                         continue;
4317
4318                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4319                         if (tmp == &bridge->dev)
4320                                 goto out;
4321
4322                 if (atsru->include_all)
4323                         goto out;
4324         }
4325         ret = 0;
4326 out:
4327         rcu_read_unlock();
4328
4329         return ret;
4330 }
4331
4332 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4333 {
4334         int ret;
4335         struct dmar_rmrr_unit *rmrru;
4336         struct dmar_atsr_unit *atsru;
4337         struct acpi_dmar_atsr *atsr;
4338         struct acpi_dmar_reserved_memory *rmrr;
4339
4340         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4341                 return 0;
4342
4343         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4344                 rmrr = container_of(rmrru->hdr,
4345                                     struct acpi_dmar_reserved_memory, header);
4346                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4347                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4348                                 ((void *)rmrr) + rmrr->header.length,
4349                                 rmrr->segment, rmrru->devices,
4350                                 rmrru->devices_cnt);
4351                         if (ret < 0)
4352                                 return ret;
4353                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4354                         dmar_remove_dev_scope(info, rmrr->segment,
4355                                 rmrru->devices, rmrru->devices_cnt);
4356                 }
4357         }
4358
4359         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4360                 if (atsru->include_all)
4361                         continue;
4362
4363                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4364                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4365                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4366                                         (void *)atsr + atsr->header.length,
4367                                         atsr->segment, atsru->devices,
4368                                         atsru->devices_cnt);
4369                         if (ret > 0)
4370                                 break;
4371                         else if (ret < 0)
4372                                 return ret;
4373                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4374                         if (dmar_remove_dev_scope(info, atsr->segment,
4375                                         atsru->devices, atsru->devices_cnt))
4376                                 break;
4377                 }
4378         }
4379
4380         return 0;
4381 }
4382
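/*
 * Memory hotplug notifier: extend the si_domain identity map when memory
 * goes online, and unmap, flush and release the corresponding IOVA range
 * when it goes offline.
 */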
4383 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4384                                        unsigned long val, void *v)
4385 {
4386         struct memory_notify *mhp = v;
4387         unsigned long long start, end;
4388         unsigned long start_vpfn, last_vpfn;
4389
4390         switch (val) {
4391         case MEM_GOING_ONLINE:
4392                 start = mhp->start_pfn << PAGE_SHIFT;
4393                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4394                 if (iommu_domain_identity_map(si_domain, start, end)) {
4395                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4396                                 start, end);
4397                         return NOTIFY_BAD;
4398                 }
4399                 break;
4400
4401         case MEM_OFFLINE:
4402         case MEM_CANCEL_ONLINE:
4403                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4404                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4405                 while (start_vpfn <= last_vpfn) {
4406                         struct iova *iova;
4407                         struct dmar_drhd_unit *drhd;
4408                         struct intel_iommu *iommu;
4409                         struct page *freelist;
4410
4411                         iova = find_iova(&si_domain->iovad, start_vpfn);
4412                         if (iova == NULL) {
4413                                 pr_debug("Failed get IOVA for PFN %lx\n",
4414                                          start_vpfn);
4415                                 break;
4416                         }
4417
4418                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4419                                                      start_vpfn, last_vpfn);
4420                         if (iova == NULL) {
4421                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4422                                         start_vpfn, last_vpfn);
4423                                 return NOTIFY_BAD;
4424                         }
4425
4426                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4427                                                iova->pfn_hi);
4428
4429                         rcu_read_lock();
4430                         for_each_active_iommu(iommu, drhd)
4431                                 iommu_flush_iotlb_psi(iommu, si_domain,
4432                                         iova->pfn_lo, iova_size(iova),
4433                                         !freelist, 0);
4434                         rcu_read_unlock();
4435                         dma_free_pagelist(freelist);
4436
4437                         start_vpfn = iova->pfn_hi + 1;
4438                         free_iova_mem(iova);
4439                 }
4440                 break;
4441         }
4442
4443         return NOTIFY_OK;
4444 }
4445
4446 static struct notifier_block intel_iommu_memory_nb = {
4447         .notifier_call = intel_iommu_memory_notifier,
4448         .priority = 0
4449 };
4450
4451 static void free_all_cpu_cached_iovas(unsigned int cpu)
4452 {
4453         int i;
4454
4455         for (i = 0; i < g_num_of_iommus; i++) {
4456                 struct intel_iommu *iommu = g_iommus[i];
4457                 struct dmar_domain *domain;
4458                 int did;
4459
4460                 if (!iommu)
4461                         continue;
4462
4463                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4464                         domain = get_iommu_domain(iommu, (u16)did);
4465
4466                         if (!domain)
4467                                 continue;
4468                         free_cpu_cached_iovas(cpu, &domain->iovad);
4469                 }
4470         }
4471 }
4472
4473 static int intel_iommu_cpu_dead(unsigned int cpu)
4474 {
4475         free_all_cpu_cached_iovas(cpu);
4476         return 0;
4477 }
4478
4479 static void intel_disable_iommus(void)
4480 {
4481         struct intel_iommu *iommu = NULL;
4482         struct dmar_drhd_unit *drhd;
4483
4484         for_each_iommu(iommu, drhd)
4485                 iommu_disable_translation(iommu);
4486 }
4487
4488 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4489 {
4490         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4491
4492         return container_of(iommu_dev, struct intel_iommu, iommu);
4493 }
4494
4495 static ssize_t intel_iommu_show_version(struct device *dev,
4496                                         struct device_attribute *attr,
4497                                         char *buf)
4498 {
4499         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4500         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4501         return sprintf(buf, "%d:%d\n",
4502                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4503 }
4504 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4505
4506 static ssize_t intel_iommu_show_address(struct device *dev,
4507                                         struct device_attribute *attr,
4508                                         char *buf)
4509 {
4510         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4511         return sprintf(buf, "%llx\n", iommu->reg_phys);
4512 }
4513 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4514
4515 static ssize_t intel_iommu_show_cap(struct device *dev,
4516                                     struct device_attribute *attr,
4517                                     char *buf)
4518 {
4519         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4520         return sprintf(buf, "%llx\n", iommu->cap);
4521 }
4522 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4523
4524 static ssize_t intel_iommu_show_ecap(struct device *dev,
4525                                     struct device_attribute *attr,
4526                                     char *buf)
4527 {
4528         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4529         return sprintf(buf, "%llx\n", iommu->ecap);
4530 }
4531 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4532
4533 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4534                                       struct device_attribute *attr,
4535                                       char *buf)
4536 {
4537         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4538         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4539 }
4540 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4541
4542 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4543                                            struct device_attribute *attr,
4544                                            char *buf)
4545 {
4546         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4547         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4548                                                   cap_ndoms(iommu->cap)));
4549 }
4550 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4551
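/*
 * Attributes exported for each DMAR unit.  With the "intel-iommu" group
 * name below they typically show up as, e.g.,
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used}.
 */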
4552 static struct attribute *intel_iommu_attrs[] = {
4553         &dev_attr_version.attr,
4554         &dev_attr_address.attr,
4555         &dev_attr_cap.attr,
4556         &dev_attr_ecap.attr,
4557         &dev_attr_domains_supported.attr,
4558         &dev_attr_domains_used.attr,
4559         NULL,
4560 };
4561
4562 static struct attribute_group intel_iommu_group = {
4563         .name = "intel-iommu",
4564         .attrs = intel_iommu_attrs,
4565 };
4566
4567 const struct attribute_group *intel_iommu_groups[] = {
4568         &intel_iommu_group,
4569         NULL,
4570 };
4571
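/*
 * Honour the DMAR platform opt-in: if any untrusted PCI device (e.g. one
 * behind an external-facing port) is present, force the IOMMU on even when
 * it was disabled on the command line.  If it had been disabled, fall back
 * to identity mapping for everything except untrusted devices.
 */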
4572 static int __init platform_optin_force_iommu(void)
4573 {
4574         struct pci_dev *pdev = NULL;
4575         bool has_untrusted_dev = false;
4576
4577         if (!dmar_platform_optin() || no_platform_optin)
4578                 return 0;
4579
4580         for_each_pci_dev(pdev) {
4581                 if (pdev->untrusted) {
4582                         has_untrusted_dev = true;
4583                         break;
4584                 }
4585         }
4586
4587         if (!has_untrusted_dev)
4588                 return 0;
4589
4590         if (no_iommu || dmar_disabled)
4591                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4592
4593         /*
4594          * If Intel-IOMMU is disabled by default, we will apply identity
4595          * map for all devices except those marked as being untrusted.
4596          */
4597         if (dmar_disabled)
4598                 iommu_identity_mapping |= IDENTMAP_ALL;
4599
4600         dmar_disabled = 0;
4601 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4602         swiotlb = 0;
4603 #endif
4604         no_iommu = 0;
4605
4606         return 1;
4607 }
4608
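/*
 * Walk the ACPI namespace devices in each DRHD's device scope and probe
 * every physical node that is not already a member of an IOMMU group.
 */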
4609 static int __init probe_acpi_namespace_devices(void)
4610 {
4611         struct dmar_drhd_unit *drhd;
4612         /* To avoid a -Wunused-but-set-variable warning. */
4613         struct intel_iommu *iommu __maybe_unused;
4614         struct device *dev;
4615         int i, ret = 0;
4616
4617         for_each_active_iommu(iommu, drhd) {
4618                 for_each_active_dev_scope(drhd->devices,
4619                                           drhd->devices_cnt, i, dev) {
4620                         struct acpi_device_physical_node *pn;
4621                         struct iommu_group *group;
4622                         struct acpi_device *adev;
4623
4624                         if (dev->bus != &acpi_bus_type)
4625                                 continue;
4626
4627                         adev = to_acpi_device(dev);
4628                         mutex_lock(&adev->physical_node_lock);
4629                         list_for_each_entry(pn,
4630                                             &adev->physical_node_list, node) {
4631                                 group = iommu_group_get(pn->dev);
4632                                 if (group) {
4633                                         iommu_group_put(group);
4634                                         continue;
4635                                 }
4636
4637                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4638                                 ret = iommu_probe_device(pn->dev);
4639                                 if (ret)
4640                                         break;
4641                         }
4642                         mutex_unlock(&adev->physical_node_lock);
4643
4644                         if (ret)
4645                                 return ret;
4646                 }
4647         }
4648
4649         return 0;
4650 }
4651
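/*
 * Main entry point: parse the DMAR table and device scopes, reserve
 * special IOVA ranges, initialize the DMAR units via init_dmars(),
 * register sysfs entries, the memory/CPU-hotplug notifiers and the IOMMU
 * ops, and finally enable DMA remapping on every non-ignored unit.
 */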
4652 int __init intel_iommu_init(void)
4653 {
4654         int ret = -ENODEV;
4655         struct dmar_drhd_unit *drhd;
4656         struct intel_iommu *iommu;
4657
4658         /*
4659          * Intel IOMMU is required for a TXT/tboot launch or platform
4660          * opt in, so enforce that.
4661          */
4662         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4663
4664         if (iommu_init_mempool()) {
4665                 if (force_on)
4666                         panic("tboot: Failed to initialize iommu memory\n");
4667                 return -ENOMEM;
4668         }
4669
4670         down_write(&dmar_global_lock);
4671         if (dmar_table_init()) {
4672                 if (force_on)
4673                         panic("tboot: Failed to initialize DMAR table\n");
4674                 goto out_free_dmar;
4675         }
4676
4677         if (dmar_dev_scope_init() < 0) {
4678                 if (force_on)
4679                         panic("tboot: Failed to initialize DMAR device scope\n");
4680                 goto out_free_dmar;
4681         }
4682
4683         up_write(&dmar_global_lock);
4684
4685         /*
4686          * The bus notifier takes the dmar_global_lock, so lockdep would
4687          * complain if we registered it while still holding the lock.
4688          */
4689         dmar_register_bus_notifier();
4690
4691         down_write(&dmar_global_lock);
4692
4693         if (no_iommu || dmar_disabled) {
4694                 /*
4695                  * We bail out here so that the IOMMU remapping and mempool
4696                  * are never set up, which means init_dmars() will not get a
4697                  * chance to disable the IOMMU's PMRs.  So disable them
4698                  * explicitly here.  The PMRs were set up by tboot prior to
4699                  * calling SENTER, but the kernel is expected to reset/tear
4700                  * them down.
4701                  */
4702                 if (intel_iommu_tboot_noforce) {
4703                         for_each_iommu(iommu, drhd)
4704                                 iommu_disable_protect_mem_regions(iommu);
4705                 }
4706
4707                 /*
4708                  * Make sure the IOMMUs are switched off, even when we
4709                  * boot into a kexec kernel and the previous kernel left
4710                  * them enabled
4711                  */
4712                 intel_disable_iommus();
4713                 goto out_free_dmar;
4714         }
4715
4716         if (list_empty(&dmar_rmrr_units))
4717                 pr_info("No RMRR found\n");
4718
4719         if (list_empty(&dmar_atsr_units))
4720                 pr_info("No ATSR found\n");
4721
4722         if (dmar_init_reserved_ranges()) {
4723                 if (force_on)
4724                         panic("tboot: Failed to reserve iommu ranges\n");
4725                 goto out_free_reserved_range;
4726         }
4727
4728         if (dmar_map_gfx)
4729                 intel_iommu_gfx_mapped = 1;
4730
4731         init_no_remapping_devices();
4732
4733         ret = init_dmars();
4734         if (ret) {
4735                 if (force_on)
4736                         panic("tboot: Failed to initialize DMARs\n");
4737                 pr_err("Initialization failed\n");
4738                 goto out_free_reserved_range;
4739         }
4740         up_write(&dmar_global_lock);
4741
4742 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4743         swiotlb = 0;
4744 #endif
4745         dma_ops = &intel_dma_ops;
4746
4747         init_iommu_pm_ops();
4748
4749         for_each_active_iommu(iommu, drhd) {
4750                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4751                                        intel_iommu_groups,
4752                                        "%s", iommu->name);
4753                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4754                 iommu_device_register(&iommu->iommu);
4755         }
4756
4757         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4758         if (si_domain && !hw_pass_through)
4759                 register_memory_notifier(&intel_iommu_memory_nb);
4760         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4761                           intel_iommu_cpu_dead);
4762
4763         down_read(&dmar_global_lock);
4764         if (probe_acpi_namespace_devices())
4765                 pr_warn("ACPI name space devices didn't probe correctly\n");
4766         up_read(&dmar_global_lock);
4767
4768         /* Finally, we enable the DMA remapping hardware. */
4769         for_each_iommu(iommu, drhd) {
4770                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4771                         iommu_enable_translation(iommu);
4772
4773                 iommu_disable_protect_mem_regions(iommu);
4774         }
4775         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4776
4777         intel_iommu_enabled = 1;
4778         intel_iommu_debugfs_init();
4779
4780         return 0;
4781
4782 out_free_reserved_range:
4783         put_iova_domain(&reserved_iova_list);
4784 out_free_dmar:
4785         intel_iommu_free_dmars();
4786         up_write(&dmar_global_lock);
4787         iommu_exit_mempool();
4788         return ret;
4789 }
4790
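/* pci_for_each_dma_alias() callback: clear the context entry of one alias. */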
4791 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4792 {
4793         struct intel_iommu *iommu = opaque;
4794
4795         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4796         return 0;
4797 }
4798
4799 /*
4800  * NB - intel-iommu lacks any sort of reference counting for the users of
4801  * dependent devices.  If multiple endpoints have intersecting dependent
4802  * devices, unbinding the driver from any one of them will possibly leave
4803  * the others unable to operate.
4804  */
4805 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4806 {
4807         if (!iommu || !dev || !dev_is_pci(dev))
4808                 return;
4809
4810         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4811 }
4812
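/*
 * Tear down everything attached to one device_domain_info: PASID and
 * context entries, device IOTLB, the link into the domain, and the
 * private domain itself if no other device is using it.  Caller must
 * hold device_domain_lock.
 */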
4813 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4814 {
4815         struct dmar_domain *domain;
4816         struct intel_iommu *iommu;
4817         unsigned long flags;
4818
4819         assert_spin_locked(&device_domain_lock);
4820
4821         if (WARN_ON(!info))
4822                 return;
4823
4824         iommu = info->iommu;
4825         domain = info->domain;
4826
4827         if (info->dev) {
4828                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4829                         intel_pasid_tear_down_entry(iommu, info->dev,
4830                                         PASID_RID2PASID);
4831
4832                 iommu_disable_dev_iotlb(info);
4833                 domain_context_clear(iommu, info->dev);
4834                 intel_pasid_free_table(info->dev);
4835         }
4836
4837         unlink_domain_info(info);
4838
4839         spin_lock_irqsave(&iommu->lock, flags);
4840         domain_detach_iommu(domain, iommu);
4841         spin_unlock_irqrestore(&iommu->lock, flags);
4842
4843         /* free the private domain */
4844         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4845             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
4846             list_empty(&domain->devices))
4847                 domain_exit(info->domain);
4848
4849         free_devinfo_mem(info);
4850 }
4851
4852 static void dmar_remove_one_dev_info(struct device *dev)
4853 {
4854         struct device_domain_info *info;
4855         unsigned long flags;
4856
4857         spin_lock_irqsave(&device_domain_lock, flags);
4858         info = dev->archdata.iommu;
4859         if (info)
4860                 __dmar_remove_one_dev_info(info);
4861         spin_unlock_irqrestore(&device_domain_lock, flags);
4862 }
4863
4864 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4865 {
4866         int adjust_width;
4867
4868         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4869         domain_reserve_special_ranges(domain);
4870
4871         /* calculate AGAW */
4872         domain->gaw = guest_width;
4873         adjust_width = guestwidth_to_adjustwidth(guest_width);
4874         domain->agaw = width_to_agaw(adjust_width);
4875
4876         domain->iommu_coherency = 0;
4877         domain->iommu_snooping = 0;
4878         domain->iommu_superpage = 0;
4879         domain->max_addr = 0;
4880
4881         /* always allocate the top pgd */
4882         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4883         if (!domain->pgd)
4884                 return -ENOMEM;
4885         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4886         return 0;
4887 }
4888
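/*
 * IOMMU core domain allocation.  DMA and unmanaged domains get a fresh
 * dmar_domain; DMA domains additionally use a deferred IOVA flush queue
 * (falling back to strict invalidation if that can't be set up), and
 * identity requests share the single si_domain.
 */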
4889 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4890 {
4891         struct dmar_domain *dmar_domain;
4892         struct iommu_domain *domain;
4893
4894         switch (type) {
4895         case IOMMU_DOMAIN_DMA:
4896         /* fallthrough */
4897         case IOMMU_DOMAIN_UNMANAGED:
4898                 dmar_domain = alloc_domain(0);
4899                 if (!dmar_domain) {
4900                         pr_err("Can't allocate dmar_domain\n");
4901                         return NULL;
4902                 }
4903                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4904                         pr_err("Domain initialization failed\n");
4905                         domain_exit(dmar_domain);
4906                         return NULL;
4907                 }
4908
4909                 if (type == IOMMU_DOMAIN_DMA &&
4910                     init_iova_flush_queue(&dmar_domain->iovad,
4911                                           iommu_flush_iova, iova_entry_free)) {
4912                         pr_warn("iova flush queue initialization failed\n");
4913                         intel_iommu_strict = 1;
4914                 }
4915
4916                 domain_update_iommu_cap(dmar_domain);
4917
4918                 domain = &dmar_domain->domain;
4919                 domain->geometry.aperture_start = 0;
4920                 domain->geometry.aperture_end   =
4921                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4922                 domain->geometry.force_aperture = true;
4923
4924                 return domain;
4925         case IOMMU_DOMAIN_IDENTITY:
4926                 return &si_domain->domain;
4927         default:
4928                 return NULL;
4929         }
4930
4931         return NULL;
4932 }
4933
4934 static void intel_iommu_domain_free(struct iommu_domain *domain)
4935 {
4936         if (domain != &si_domain->domain)
4937                 domain_exit(to_dmar_domain(domain));
4938 }
4939
4940 /*
4941  * Check whether a @domain could be attached to the @dev through the
4942  * aux-domain attach/detach APIs.
4943  */
4944 static inline bool
4945 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4946 {
4947         struct device_domain_info *info = dev->archdata.iommu;
4948
4949         return info && info->auxd_enabled &&
4950                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4951 }
4952
4953 static void auxiliary_link_device(struct dmar_domain *domain,
4954                                   struct device *dev)
4955 {
4956         struct device_domain_info *info = dev->archdata.iommu;
4957
4958         assert_spin_locked(&device_domain_lock);
4959         if (WARN_ON(!info))
4960                 return;
4961
4962         domain->auxd_refcnt++;
4963         list_add(&domain->auxd, &info->auxiliary_domains);
4964 }
4965
4966 static void auxiliary_unlink_device(struct dmar_domain *domain,
4967                                     struct device *dev)
4968 {
4969         struct device_domain_info *info = dev->archdata.iommu;
4970
4971         assert_spin_locked(&device_domain_lock);
4972         if (WARN_ON(!info))
4973                 return;
4974
4975         list_del(&domain->auxd);
4976         domain->auxd_refcnt--;
4977
4978         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4979                 intel_pasid_free_id(domain->default_pasid);
4980 }
4981
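/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it doesn't have one yet, attach the domain to the
 * IOMMU, program a second-level PASID entry and link the device.
 */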
4982 static int aux_domain_add_dev(struct dmar_domain *domain,
4983                               struct device *dev)
4984 {
4985         int ret;
4986         u8 bus, devfn;
4987         unsigned long flags;
4988         struct intel_iommu *iommu;
4989
4990         iommu = device_to_iommu(dev, &bus, &devfn);
4991         if (!iommu)
4992                 return -ENODEV;
4993
4994         if (domain->default_pasid <= 0) {
4995                 int pasid;
4996
4997                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4998                                              pci_max_pasids(to_pci_dev(dev)),
4999                                              GFP_KERNEL);
5000                 if (pasid <= 0) {
5001                         pr_err("Can't allocate default pasid\n");
5002                         return -ENODEV;
5003                 }
5004                 domain->default_pasid = pasid;
5005         }
5006
5007         spin_lock_irqsave(&device_domain_lock, flags);
5008         /*
5009          * iommu->lock must be held to attach domain to iommu and setup the
5010          * pasid entry for second level translation.
5011          */
5012         spin_lock(&iommu->lock);
5013         ret = domain_attach_iommu(domain, iommu);
5014         if (ret)
5015                 goto attach_failed;
5016
5017         /* Set up the PASID entry for mediated devices. */
5018         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5019                                              domain->default_pasid);
5020         if (ret)
5021                 goto table_failed;
5022         spin_unlock(&iommu->lock);
5023
5024         auxiliary_link_device(domain, dev);
5025
5026         spin_unlock_irqrestore(&device_domain_lock, flags);
5027
5028         return 0;
5029
5030 table_failed:
5031         domain_detach_iommu(domain, iommu);
5032 attach_failed:
5033         spin_unlock(&iommu->lock);
5034         spin_unlock_irqrestore(&device_domain_lock, flags);
5035         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5036                 intel_pasid_free_id(domain->default_pasid);
5037
5038         return ret;
5039 }
5040
5041 static void aux_domain_remove_dev(struct dmar_domain *domain,
5042                                   struct device *dev)
5043 {
5044         struct device_domain_info *info;
5045         struct intel_iommu *iommu;
5046         unsigned long flags;
5047
5048         if (!is_aux_domain(dev, &domain->domain))
5049                 return;
5050
5051         spin_lock_irqsave(&device_domain_lock, flags);
5052         info = dev->archdata.iommu;
5053         iommu = info->iommu;
5054
5055         auxiliary_unlink_device(domain, dev);
5056
5057         spin_lock(&iommu->lock);
5058         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5059         domain_detach_iommu(domain, iommu);
5060         spin_unlock(&iommu->lock);
5061
5062         spin_unlock_irqrestore(&device_domain_lock, flags);
5063 }
5064
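/*
 * Make sure @domain fits the IOMMU that @dev sits behind: reject domains
 * whose max mapped address exceeds the IOMMU's address width, and strip
 * surplus page-table levels if the domain was built wider than needed.
 */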
5065 static int prepare_domain_attach_device(struct iommu_domain *domain,
5066                                         struct device *dev)
5067 {
5068         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5069         struct intel_iommu *iommu;
5070         int addr_width;
5071         u8 bus, devfn;
5072
5073         iommu = device_to_iommu(dev, &bus, &devfn);
5074         if (!iommu)
5075                 return -ENODEV;
5076
5077         /* check if this iommu agaw is sufficient for max mapped address */
5078         addr_width = agaw_to_width(iommu->agaw);
5079         if (addr_width > cap_mgaw(iommu->cap))
5080                 addr_width = cap_mgaw(iommu->cap);
5081
5082         if (dmar_domain->max_addr > (1LL << addr_width)) {
5083                 dev_err(dev, "%s: iommu width (%d) is not "
5084                         "sufficient for the mapped address (%llx)\n",
5085                         __func__, addr_width, dmar_domain->max_addr);
5086                 return -EFAULT;
5087         }
5088         dmar_domain->gaw = addr_width;
5089
5090         /*
5091          * Knock out extra levels of page tables if necessary
5092          */
5093         while (iommu->agaw < dmar_domain->agaw) {
5094                 struct dma_pte *pte;
5095
5096                 pte = dmar_domain->pgd;
5097                 if (dma_pte_present(pte)) {
5098                         dmar_domain->pgd = (struct dma_pte *)
5099                                 phys_to_virt(dma_pte_addr(pte));
5100                         free_pgtable_page(pte);
5101                 }
5102                 dmar_domain->agaw--;
5103         }
5104
5105         return 0;
5106 }
5107
5108 static int intel_iommu_attach_device(struct iommu_domain *domain,
5109                                      struct device *dev)
5110 {
5111         int ret;
5112
5113         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5114             device_is_rmrr_locked(dev)) {
5115                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5116                 return -EPERM;
5117         }
5118
5119         if (is_aux_domain(dev, domain))
5120                 return -EPERM;
5121
5122         /* normally dev is not mapped */
5123         if (unlikely(domain_context_mapped(dev))) {
5124                 struct dmar_domain *old_domain;
5125
5126                 old_domain = find_domain(dev);
5127                 if (old_domain)
5128                         dmar_remove_one_dev_info(dev);
5129         }
5130
5131         ret = prepare_domain_attach_device(domain, dev);
5132         if (ret)
5133                 return ret;
5134
5135         return domain_add_dev_info(to_dmar_domain(domain), dev);
5136 }
5137
5138 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5139                                          struct device *dev)
5140 {
5141         int ret;
5142
5143         if (!is_aux_domain(dev, domain))
5144                 return -EPERM;
5145
5146         ret = prepare_domain_attach_device(domain, dev);
5147         if (ret)
5148                 return ret;
5149
5150         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5151 }
5152
5153 static void intel_iommu_detach_device(struct iommu_domain *domain,
5154                                       struct device *dev)
5155 {
5156         dmar_remove_one_dev_info(dev);
5157 }
5158
5159 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5160                                           struct device *dev)
5161 {
5162         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5163 }
5164
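/*
 * Map [iova, iova + size) to hpa.  IOMMU_READ/WRITE/CACHE are translated
 * into DMA PTE bits, the domain aperture is enforced, and the range is
 * installed via domain_pfn_mapping().
 */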
5165 static int intel_iommu_map(struct iommu_domain *domain,
5166                            unsigned long iova, phys_addr_t hpa,
5167                            size_t size, int iommu_prot)
5168 {
5169         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5170         u64 max_addr;
5171         int prot = 0;
5172         int ret;
5173
5174         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5175                 return -EINVAL;
5176
5177         if (iommu_prot & IOMMU_READ)
5178                 prot |= DMA_PTE_READ;
5179         if (iommu_prot & IOMMU_WRITE)
5180                 prot |= DMA_PTE_WRITE;
5181         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5182                 prot |= DMA_PTE_SNP;
5183
5184         max_addr = iova + size;
5185         if (dmar_domain->max_addr < max_addr) {
5186                 u64 end;
5187
5188                 /* check if minimum agaw is sufficient for mapped address */
5189                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5190                 if (end < max_addr) {
5191                         pr_err("%s: iommu width (%d) is not "
5192                                "sufficient for the mapped address (%llx)\n",
5193                                __func__, dmar_domain->gaw, max_addr);
5194                         return -EFAULT;
5195                 }
5196                 dmar_domain->max_addr = max_addr;
5197         }
5198         /* Round up size to next multiple of PAGE_SIZE, if it and
5199            the low bits of hpa would take us onto the next page */
5200         size = aligned_nrpages(hpa, size);
5201         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5202                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5203         return ret;
5204 }
5205
5206 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5207                                 unsigned long iova, size_t size)
5208 {
5209         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5210         struct page *freelist = NULL;
5211         unsigned long start_pfn, last_pfn;
5212         unsigned int npages;
5213         int iommu_id, level = 0;
5214
5215         /* Cope with horrid API which requires us to unmap more than the
5216            size argument if it happens to be a large-page mapping. */
5217         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5218         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5219                 return 0;
5220
5221         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5222                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5223
5224         start_pfn = iova >> VTD_PAGE_SHIFT;
5225         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5226
5227         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5228
5229         npages = last_pfn - start_pfn + 1;
5230
5231         for_each_domain_iommu(iommu_id, dmar_domain)
5232                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5233                                       start_pfn, npages, !freelist, 0);
5234
5235         dma_free_pagelist(freelist);
5236
5237         if (dmar_domain->max_addr == iova + size)
5238                 dmar_domain->max_addr = iova;
5239
5240         return size;
5241 }
5242
5243 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5244                                             dma_addr_t iova)
5245 {
5246         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5247         struct dma_pte *pte;
5248         int level = 0;
5249         u64 phys = 0;
5250
5251         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5252                 return 0;
5253
5254         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5255         if (pte)
5256                 phys = dma_pte_addr(pte);
5257
5258         return phys;
5259 }
5260
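/*
 * Scalable mode and PASID are only usable if every active IOMMU in the
 * system advertises the corresponding capability.
 */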
5261 static inline bool scalable_mode_support(void)
5262 {
5263         struct dmar_drhd_unit *drhd;
5264         struct intel_iommu *iommu;
5265         bool ret = true;
5266
5267         rcu_read_lock();
5268         for_each_active_iommu(iommu, drhd) {
5269                 if (!sm_supported(iommu)) {
5270                         ret = false;
5271                         break;
5272                 }
5273         }
5274         rcu_read_unlock();
5275
5276         return ret;
5277 }
5278
5279 static inline bool iommu_pasid_support(void)
5280 {
5281         struct dmar_drhd_unit *drhd;
5282         struct intel_iommu *iommu;
5283         bool ret = true;
5284
5285         rcu_read_lock();
5286         for_each_active_iommu(iommu, drhd) {
5287                 if (!pasid_supported(iommu)) {
5288                         ret = false;
5289                         break;
5290                 }
5291         }
5292         rcu_read_unlock();
5293
5294         return ret;
5295 }
5296
5297 static bool intel_iommu_capable(enum iommu_cap cap)
5298 {
5299         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5300                 return domain_update_iommu_snooping(NULL) == 1;
5301         if (cap == IOMMU_CAP_INTR_REMAP)
5302                 return irq_remapping_enabled == 1;
5303
5304         return false;
5305 }
5306
5307 static int intel_iommu_add_device(struct device *dev)
5308 {
5309         struct dmar_domain *dmar_domain;
5310         struct iommu_domain *domain;
5311         struct intel_iommu *iommu;
5312         struct iommu_group *group;
5313         u8 bus, devfn;
5314         int ret;
5315
5316         iommu = device_to_iommu(dev, &bus, &devfn);
5317         if (!iommu)
5318                 return -ENODEV;
5319
5320         iommu_device_link(&iommu->iommu, dev);
5321
5322         if (translation_pre_enabled(iommu))
5323                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5324
5325         group = iommu_group_get_for_dev(dev);
5326
5327         if (IS_ERR(group))
5328                 return PTR_ERR(group);
5329
5330         iommu_group_put(group);
5331
5332         domain = iommu_get_domain_for_dev(dev);
5333         dmar_domain = to_dmar_domain(domain);
5334         if (domain->type == IOMMU_DOMAIN_DMA) {
5335                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5336                         ret = iommu_request_dm_for_dev(dev);
5337                         if (ret) {
5338                                 dmar_remove_one_dev_info(dev);
5339                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5340                                 domain_add_dev_info(si_domain, dev);
5341                                 dev_info(dev,
5342                                          "Device uses a private identity domain.\n");
5343                         }
5344                 }
5345         } else {
5346                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5347                         ret = iommu_request_dma_domain_for_dev(dev);
5348                         if (ret) {
5349                                 dmar_remove_one_dev_info(dev);
5350                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5351                                 if (!get_private_domain_for_dev(dev)) {
5352                                         dev_warn(dev,
5353                                                  "Failed to get a private domain.\n");
5354                                         return -ENOMEM;
5355                                 }
5356
5357                                 dev_info(dev,
5358                                          "Device uses a private dma domain.\n");
5359                         }
5360                 }
5361         }
5362
5363         return 0;
5364 }
5365
5366 static void intel_iommu_remove_device(struct device *dev)
5367 {
5368         struct intel_iommu *iommu;
5369         u8 bus, devfn;
5370
5371         iommu = device_to_iommu(dev, &bus, &devfn);
5372         if (!iommu)
5373                 return;
5374
5375         dmar_remove_one_dev_info(dev);
5376
5377         iommu_group_remove_device(dev);
5378
5379         iommu_device_unlink(&iommu->iommu, dev);
5380 }
5381
5382 static void intel_iommu_get_resv_regions(struct device *device,
5383                                          struct list_head *head)
5384 {
5385         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5386         struct iommu_resv_region *reg;
5387         struct dmar_rmrr_unit *rmrr;
5388         struct device *i_dev;
5389         int i;
5390
5391         down_read(&dmar_global_lock);
5392         for_each_rmrr_units(rmrr) {
5393                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5394                                           i, i_dev) {
5395                         struct iommu_resv_region *resv;
5396                         enum iommu_resv_type type;
5397                         size_t length;
5398
5399                         if (i_dev != device &&
5400                             !is_downstream_to_pci_bridge(device, i_dev))
5401                                 continue;
5402
5403                         length = rmrr->end_address - rmrr->base_address + 1;
5404
5405                         type = device_rmrr_is_relaxable(device) ?
5406                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5407
5408                         resv = iommu_alloc_resv_region(rmrr->base_address,
5409                                                        length, prot, type);
5410                         if (!resv)
5411                                 break;
5412
5413                         list_add_tail(&resv->list, head);
5414                 }
5415         }
5416         up_read(&dmar_global_lock);
5417
5418 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5419         if (dev_is_pci(device)) {
5420                 struct pci_dev *pdev = to_pci_dev(device);
5421
5422                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5423                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5424                                                       IOMMU_RESV_DIRECT);
5425                         if (reg)
5426                                 list_add_tail(&reg->list, head);
5427                 }
5428         }
5429 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5430
5431         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5432                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5433                                       0, IOMMU_RESV_MSI);
5434         if (!reg)
5435                 return;
5436         list_add_tail(&reg->list, head);
5437 }
5438
5439 static void intel_iommu_put_resv_regions(struct device *dev,
5440                                          struct list_head *head)
5441 {
5442         struct iommu_resv_region *entry, *next;
5443
5444         list_for_each_entry_safe(entry, next, head, list)
5445                 kfree(entry);
5446 }
5447
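/*
 * Enable PASID support for @dev: set CONTEXT_PASIDE in its context entry
 * (flushing the context cache if it changed) and, if not already done,
 * enable PASID/device-IOTLB support on the device itself.
 */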
5448 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5449 {
5450         struct device_domain_info *info;
5451         struct context_entry *context;
5452         struct dmar_domain *domain;
5453         unsigned long flags;
5454         u64 ctx_lo;
5455         int ret;
5456
5457         domain = find_domain(dev);
5458         if (!domain)
5459                 return -EINVAL;
5460
5461         spin_lock_irqsave(&device_domain_lock, flags);
5462         spin_lock(&iommu->lock);
5463
5464         ret = -EINVAL;
5465         info = dev->archdata.iommu;
5466         if (!info || !info->pasid_supported)
5467                 goto out;
5468
5469         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5470         if (WARN_ON(!context))
5471                 goto out;
5472
5473         ctx_lo = context[0].lo;
5474
5475         if (!(ctx_lo & CONTEXT_PASIDE)) {
5476                 ctx_lo |= CONTEXT_PASIDE;
5477                 context[0].lo = ctx_lo;
5478                 wmb();
5479                 iommu->flush.flush_context(iommu,
5480                                            domain->iommu_did[iommu->seq_id],
5481                                            PCI_DEVID(info->bus, info->devfn),
5482                                            DMA_CCMD_MASK_NOBIT,
5483                                            DMA_CCMD_DEVICE_INVL);
5484         }
5485
5486         /* Enable PASID support in the device, if it wasn't already */
5487         if (!info->pasid_enabled)
5488                 iommu_enable_dev_iotlb(info);
5489
5490         ret = 0;
5491
5492  out:
5493         spin_unlock(&iommu->lock);
5494         spin_unlock_irqrestore(&device_domain_lock, flags);
5495
5496         return ret;
5497 }
5498
5499 static void intel_iommu_apply_resv_region(struct device *dev,
5500                                           struct iommu_domain *domain,
5501                                           struct iommu_resv_region *region)
5502 {
5503         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5504         unsigned long start, end;
5505
5506         start = IOVA_PFN(region->start);
5507         end   = IOVA_PFN(region->start + region->length - 1);
5508
5509         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5510 }
5511
5512 #ifdef CONFIG_INTEL_IOMMU_SVM
5513 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5514 {
5515         struct intel_iommu *iommu;
5516         u8 bus, devfn;
5517
5518         if (iommu_dummy(dev)) {
5519                 dev_warn(dev,
5520                          "No IOMMU translation for device; cannot enable SVM\n");
5521                 return NULL;
5522         }
5523
5524         iommu = device_to_iommu(dev, &bus, &devfn);
5525         if (!iommu) {
5526                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5527                 return NULL;
5528         }
5529
5530         return iommu;
5531 }
5532 #endif /* CONFIG_INTEL_IOMMU_SVM */
5533
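/*
 * Aux-domain (IOMMU_DEV_FEAT_AUX) enable/disable.  Requires a scalable
 * mode IOMMU with PASID support; the flag is tracked per device in
 * info->auxd_enabled under device_domain_lock.
 */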
5534 static int intel_iommu_enable_auxd(struct device *dev)
5535 {
5536         struct device_domain_info *info;
5537         struct intel_iommu *iommu;
5538         unsigned long flags;
5539         u8 bus, devfn;
5540         int ret;
5541
5542         iommu = device_to_iommu(dev, &bus, &devfn);
5543         if (!iommu || dmar_disabled)
5544                 return -EINVAL;
5545
5546         if (!sm_supported(iommu) || !pasid_supported(iommu))
5547                 return -EINVAL;
5548
5549         ret = intel_iommu_enable_pasid(iommu, dev);
5550         if (ret)
5551                 return -ENODEV;
5552
5553         spin_lock_irqsave(&device_domain_lock, flags);
5554         info = dev->archdata.iommu;
5555         info->auxd_enabled = 1;
5556         spin_unlock_irqrestore(&device_domain_lock, flags);
5557
5558         return 0;
5559 }
5560
5561 static int intel_iommu_disable_auxd(struct device *dev)
5562 {
5563         struct device_domain_info *info;
5564         unsigned long flags;
5565
5566         spin_lock_irqsave(&device_domain_lock, flags);
5567         info = dev->archdata.iommu;
5568         if (!WARN_ON(!info))
5569                 info->auxd_enabled = 0;
5570         spin_unlock_irqrestore(&device_domain_lock, flags);
5571
5572         return 0;
5573 }
5574
5575 /*
5576  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5577  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec,
5578  * which lets system software and tools detect endpoint devices that support
5579  * Intel Scalable I/O Virtualization without a host driver dependency.
5580  *
5581  * Returns the address of the matching extended capability structure within
5582  * the device's PCI configuration space or 0 if the device does not support
5583  * it.
5584  */
5585 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5586 {
5587         int pos;
5588         u16 vendor, id;
5589
5590         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5591         while (pos) {
5592                 pci_read_config_word(pdev, pos + 4, &vendor);
5593                 pci_read_config_word(pdev, pos + 8, &id);
5594                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5595                         return pos;
5596
5597                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5598         }
5599
5600         return 0;
5601 }
5602
5603 static bool
5604 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5605 {
5606         if (feat == IOMMU_DEV_FEAT_AUX) {
5607                 int ret;
5608
5609                 if (!dev_is_pci(dev) || dmar_disabled ||
5610                     !scalable_mode_support() || !iommu_pasid_support())
5611                         return false;
5612
5613                 ret = pci_pasid_features(to_pci_dev(dev));
5614                 if (ret < 0)
5615                         return false;
5616
5617                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5618         }
5619
5620         return false;
5621 }
5622
5623 static int
5624 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5625 {
5626         if (feat == IOMMU_DEV_FEAT_AUX)
5627                 return intel_iommu_enable_auxd(dev);
5628
5629         return -ENODEV;
5630 }
5631
5632 static int
5633 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5634 {
5635         if (feat == IOMMU_DEV_FEAT_AUX)
5636                 return intel_iommu_disable_auxd(dev);
5637
5638         return -ENODEV;
5639 }
5640
5641 static bool
5642 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5643 {
5644         struct device_domain_info *info = dev->archdata.iommu;
5645
5646         if (feat == IOMMU_DEV_FEAT_AUX)
5647                 return scalable_mode_support() && info && info->auxd_enabled;
5648
5649         return false;
5650 }
5651
5652 static int
5653 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5654 {
5655         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5656
5657         return dmar_domain->default_pasid > 0 ?
5658                         dmar_domain->default_pasid : -EINVAL;
5659 }
5660
5661 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5662                                            struct device *dev)
5663 {
5664         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5665 }
5666
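/*
 * The iommu_ops registered for the PCI bus (and ACPI namespace devices)
 * by intel_iommu_init().
 */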
5667 const struct iommu_ops intel_iommu_ops = {
5668         .capable                = intel_iommu_capable,
5669         .domain_alloc           = intel_iommu_domain_alloc,
5670         .domain_free            = intel_iommu_domain_free,
5671         .attach_dev             = intel_iommu_attach_device,
5672         .detach_dev             = intel_iommu_detach_device,
5673         .aux_attach_dev         = intel_iommu_aux_attach_device,
5674         .aux_detach_dev         = intel_iommu_aux_detach_device,
5675         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5676         .map                    = intel_iommu_map,
5677         .unmap                  = intel_iommu_unmap,
5678         .iova_to_phys           = intel_iommu_iova_to_phys,
5679         .add_device             = intel_iommu_add_device,
5680         .remove_device          = intel_iommu_remove_device,
5681         .get_resv_regions       = intel_iommu_get_resv_regions,
5682         .put_resv_regions       = intel_iommu_put_resv_regions,
5683         .apply_resv_region      = intel_iommu_apply_resv_region,
5684         .device_group           = pci_device_group,
5685         .dev_has_feat           = intel_iommu_dev_has_feat,
5686         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5687         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5688         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5689         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5690         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5691 };
5692
5693 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5694 {
5695         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5696         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5697         dmar_map_gfx = 0;
5698 }
5699
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5703 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5704 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5705 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5706 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5707
5708 static void quirk_iommu_rwbf(struct pci_dev *dev)
5709 {
5710         /*
5711          * Mobile 4 Series Chipset neglects to set RWBF capability,
5712          * but needs it. Same seems to hold for the desktop versions.
5713          */
5714         pci_info(dev, "Forcing write-buffer flush capability\n");
5715         rwbf_quirk = 1;
5716 }
5717
5718 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5719 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5720 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5721 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5722 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5723 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5724 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5725
5726 #define GGC 0x52
5727 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5728 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5729 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5730 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5731 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5732 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5733 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5734 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5735
5736 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5737 {
5738         unsigned short ggc;
5739
5740         if (pci_read_config_word(dev, GGC, &ggc))
5741                 return;
5742
5743         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5744                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5745                 dmar_map_gfx = 0;
5746         } else if (dmar_map_gfx) {
5747                 /* we have to ensure the gfx device is idle before we flush */
5748                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5749                 intel_iommu_strict = 1;
5750         }
5751 }
5752 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5753 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5754 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5755 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5756
5757 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5758    ISOCH DMAR unit for the Azalia sound device, but not give it any
5759    TLB entries, which causes it to deadlock. Check for that.  We do
5760    this in a function called from init_dmars(), instead of in a PCI
5761    quirk, because we don't want to print the obnoxious "BIOS broken"
5762    message if VT-d is actually disabled.
5763 */
5764 static void __init check_tylersburg_isoch(void)
5765 {
5766         struct pci_dev *pdev;
5767         uint32_t vtisochctrl;
5768
5769         /* If there's no Azalia in the system anyway, forget it. */
5770         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5771         if (!pdev)
5772                 return;
5773         pci_dev_put(pdev);
5774
5775         /* System Management Registers. Might be hidden, in which case
5776            we can't do the sanity check. But that's OK, because the
5777            known-broken BIOSes _don't_ actually hide it, so far. */
5778         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5779         if (!pdev)
5780                 return;
5781
5782         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5783                 pci_dev_put(pdev);
5784                 return;
5785         }
5786
5787         pci_dev_put(pdev);
5788
5789         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5790         if (vtisochctrl & 1)
5791                 return;
5792
5793         /* Drop all bits other than the number of TLB entries */
5794         vtisochctrl &= 0x1c;
5795
5796         /* If we have the recommended number of TLB entries (16), fine. */
5797         if (vtisochctrl == 0x10)
5798                 return;
5799
5800         /* Zero TLB entries? You get to ride the short bus to school. */
5801         if (!vtisochctrl) {
5802                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5803                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5804                      dmi_get_system_info(DMI_BIOS_VENDOR),
5805                      dmi_get_system_info(DMI_BIOS_VERSION),
5806                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5807                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5808                 return;
5809         }
5810
5811         pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5812                vtisochctrl);
5813 }