iommu/vt-d: Remove static identity map code
[linux.git] drivers/iommu/intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 #define dev_fmt(fmt)    pr_fmt(fmt)
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
55
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
58
59 #define ROOT_SIZE               VTD_PAGE_SIZE
60 #define CONTEXT_SIZE            VTD_PAGE_SIZE
61
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66
67 #define IOAPIC_RANGE_START      (0xfee00000)
68 #define IOAPIC_RANGE_END        (0xfeefffff)
69 #define IOVA_START_ADDR         (0x1000)
70
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75
76 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
82                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
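/* Worked example: gaw == 48 gives __DOMAIN_MAX_PFN == 2^36 - 1 and DOMAIN_MAX_ADDR == 2^48 - 4KiB. */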
84
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN          (1)
87
88 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
89
90 /* page table handling */
91 #define LEVEL_STRIDE            (9)
92 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
93
94 /*
95  * This bitmap is used to advertise the page sizes our hardware supports
96  * to the IOMMU core, which will then use this information to split
97  * physically contiguous memory regions it is mapping into page sizes
98  * that we support.
99  *
100  * Traditionally the IOMMU core just handed us the mappings directly,
101  * after making sure the size is an order of a 4KiB page and that the
102  * mapping has natural alignment.
103  *
104  * To retain this behavior, we currently advertise that we support
105  * all page sizes that are an order of 4KiB.
106  *
107  * If at some point we'd like to utilize the IOMMU core's new behavior,
108  * we could change this to advertise the real page sizes we support.
109  */
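/*
 * Note on the value below: ~0xFFFUL leaves every bit from bit 12 upward
 * set, so every power-of-two size of 4KiB or larger is advertised.
 */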
110 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
111
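/*
 * AGAW ("adjusted guest address width") encodes the depth of the DMA page
 * table: each level resolves LEVEL_STRIDE (9) bits on top of the 12-bit
 * page offset, so
 *      agaw 1 -> 3 levels / 39-bit width
 *      agaw 2 -> 4 levels / 48-bit width
 *      agaw 3 -> 5 levels / 57-bit width
 * which is where "agaw + 2" and "30 + agaw * LEVEL_STRIDE" below come from.
 */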
112 static inline int agaw_to_level(int agaw)
113 {
114         return agaw + 2;
115 }
116
117 static inline int agaw_to_width(int agaw)
118 {
119         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 }
121
122 static inline int width_to_agaw(int width)
123 {
124         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 }
126
127 static inline unsigned int level_to_offset_bits(int level)
128 {
129         return (level - 1) * LEVEL_STRIDE;
130 }
131
132 static inline int pfn_level_offset(unsigned long pfn, int level)
133 {
134         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 }
136
137 static inline unsigned long level_mask(int level)
138 {
139         return -1UL << level_to_offset_bits(level);
140 }
141
142 static inline unsigned long level_size(int level)
143 {
144         return 1UL << level_to_offset_bits(level);
145 }
146
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 {
149         return (pfn + level_size(level) - 1) & level_mask(level);
150 }
151
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 {
154         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
155 }
156
157 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
158    are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 {
161         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 }
163
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 {
166         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 }
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 {
170         return mm_to_dma_pfn(page_to_pfn(pg));
171 }
172 static inline unsigned long virt_to_dma_pfn(void *p)
173 {
174         return page_to_dma_pfn(virt_to_page(p));
175 }
176
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
179
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
182
183 /*
184  * set to 1 to panic the kernel if VT-d can't be successfully enabled
185  * (used when kernel is launched w/ TXT)
186  */
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
190
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193 /*
194  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195  * if marked present.
196  */
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
198 {
199         if (!(re->lo & 1))
200                 return 0;
201
202         return re->lo & VTD_PAGE_MASK;
203 }
204
205 /*
206  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207  * if marked present.
208  */
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
210 {
211         if (!(re->hi & 1))
212                 return 0;
213
214         return re->hi & VTD_PAGE_MASK;
215 }
216
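/*
 * The helpers below manipulate the context entry fields in place:
 * lo bit 0 is the present bit, bit 1 disables fault processing, bits 3:2
 * select the translation type, bit 11 is the PASID-enable bit and the
 * page-aligned bits hold the page-table root address; hi bits 2:0 hold the
 * address width (AGAW), bit 3 is a software-defined "copied" marker and
 * bits 23:8 hold the domain id.
 */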
217 static inline void context_clear_pasid_enable(struct context_entry *context)
218 {
219         context->lo &= ~(1ULL << 11);
220 }
221
222 static inline bool context_pasid_enabled(struct context_entry *context)
223 {
224         return !!(context->lo & (1ULL << 11));
225 }
226
227 static inline void context_set_copied(struct context_entry *context)
228 {
229         context->hi |= (1ull << 3);
230 }
231
232 static inline bool context_copied(struct context_entry *context)
233 {
234         return !!(context->hi & (1ULL << 3));
235 }
236
237 static inline bool __context_present(struct context_entry *context)
238 {
239         return (context->lo & 1);
240 }
241
242 bool context_present(struct context_entry *context)
243 {
244         return context_pasid_enabled(context) ?
245              __context_present(context) :
246              __context_present(context) && !context_copied(context);
247 }
248
249 static inline void context_set_present(struct context_entry *context)
250 {
251         context->lo |= 1;
252 }
253
254 static inline void context_set_fault_enable(struct context_entry *context)
255 {
256         context->lo &= (((u64)-1) << 2) | 1;
257 }
258
259 static inline void context_set_translation_type(struct context_entry *context,
260                                                 unsigned long value)
261 {
262         context->lo &= (((u64)-1) << 4) | 3;
263         context->lo |= (value & 3) << 2;
264 }
265
266 static inline void context_set_address_root(struct context_entry *context,
267                                             unsigned long value)
268 {
269         context->lo &= ~VTD_PAGE_MASK;
270         context->lo |= value & VTD_PAGE_MASK;
271 }
272
273 static inline void context_set_address_width(struct context_entry *context,
274                                              unsigned long value)
275 {
276         context->hi |= value & 7;
277 }
278
279 static inline void context_set_domain_id(struct context_entry *context,
280                                          unsigned long value)
281 {
282         context->hi |= (value & ((1 << 16) - 1)) << 8;
283 }
284
285 static inline int context_domain_id(struct context_entry *c)
286 {
287         return((c->hi >> 8) & 0xffff);
288 }
289
290 static inline void context_clear_entry(struct context_entry *context)
291 {
292         context->lo = 0;
293         context->hi = 0;
294 }
295
296 /*
297  * This domain is a static identity mapping domain.
298  *      1. This domain creates a static 1:1 mapping to all usable memory.
299  *      2. It maps to each iommu if successful.
300  *      3. Each iommu maps to this domain if successful.
301  */
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
304
305 /* si_domain contains multiple devices */
306 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
307
308 /*
309  * This is a DMA domain allocated through the iommu domain allocation
310  * interface. But one or more devices belonging to this domain have
311  * been chosen to use a private domain. We should avoid using the
312  * map/unmap/iova_to_phys APIs on it.
313  */
314 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
315
316 #define for_each_domain_iommu(idx, domain)                      \
317         for (idx = 0; idx < g_num_of_iommus; idx++)             \
318                 if (domain->iommu_refcnt[idx])
319
320 struct dmar_rmrr_unit {
321         struct list_head list;          /* list of rmrr units   */
322         struct acpi_dmar_header *hdr;   /* ACPI header          */
323         u64     base_address;           /* reserved base address*/
324         u64     end_address;            /* reserved end address */
325         struct dmar_dev_scope *devices; /* target devices */
326         int     devices_cnt;            /* target device count */
327         struct iommu_resv_region *resv; /* reserved region handle */
328 };
329
330 struct dmar_atsr_unit {
331         struct list_head list;          /* list of ATSR units */
332         struct acpi_dmar_header *hdr;   /* ACPI header */
333         struct dmar_dev_scope *devices; /* target devices */
334         int devices_cnt;                /* target device count */
335         u8 include_all:1;               /* include all ports */
336 };
337
338 static LIST_HEAD(dmar_atsr_units);
339 static LIST_HEAD(dmar_rmrr_units);
340
341 #define for_each_rmrr_units(rmrr) \
342         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
343
344 /* number of intel_iommus in the system; used to size and index g_iommus[] */
345 static int g_num_of_iommus;
346
347 static void domain_exit(struct dmar_domain *domain);
348 static void domain_remove_dev_info(struct dmar_domain *domain);
349 static void dmar_remove_one_dev_info(struct device *dev);
350 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
351 static void domain_context_clear(struct intel_iommu *iommu,
352                                  struct device *dev);
353 static int domain_detach_iommu(struct dmar_domain *domain,
354                                struct intel_iommu *iommu);
355 static bool device_is_rmrr_locked(struct device *dev);
356 static int intel_iommu_attach_device(struct iommu_domain *domain,
357                                      struct device *dev);
358
359 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
360 int dmar_disabled = 0;
361 #else
362 int dmar_disabled = 1;
363 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
364
365 int intel_iommu_sm;
366 int intel_iommu_enabled = 0;
367 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
368
369 static int dmar_map_gfx = 1;
370 static int dmar_forcedac;
371 static int intel_iommu_strict;
372 static int intel_iommu_superpage = 1;
373 static int iommu_identity_mapping;
374
375 #define IDENTMAP_ALL            1
376 #define IDENTMAP_GFX            2
377 #define IDENTMAP_AZALIA         4
378
379 int intel_iommu_gfx_mapped;
380 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
381
382 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
383 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
384 static DEFINE_SPINLOCK(device_domain_lock);
385 static LIST_HEAD(device_domain_list);
386
387 /*
388  * Iterate over elements in device_domain_list and call the specified
389  * callback @fn against each element.
390  */
391 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
392                                      void *data), void *data)
393 {
394         int ret = 0;
395         unsigned long flags;
396         struct device_domain_info *info;
397
398         spin_lock_irqsave(&device_domain_lock, flags);
399         list_for_each_entry(info, &device_domain_list, global) {
400                 ret = fn(info, data);
401                 if (ret) {
402                         spin_unlock_irqrestore(&device_domain_lock, flags);
403                         return ret;
404                 }
405         }
406         spin_unlock_irqrestore(&device_domain_lock, flags);
407
408         return 0;
409 }
410
411 const struct iommu_ops intel_iommu_ops;
412
413 static bool translation_pre_enabled(struct intel_iommu *iommu)
414 {
415         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
416 }
417
418 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
419 {
420         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
421 }
422
423 static void init_translation_status(struct intel_iommu *iommu)
424 {
425         u32 gsts;
426
427         gsts = readl(iommu->reg + DMAR_GSTS_REG);
428         if (gsts & DMA_GSTS_TES)
429                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
430 }
431
432 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
433 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
434 {
435         return container_of(dom, struct dmar_domain, domain);
436 }
437
438 static int __init intel_iommu_setup(char *str)
439 {
440         if (!str)
441                 return -EINVAL;
442         while (*str) {
443                 if (!strncmp(str, "on", 2)) {
444                         dmar_disabled = 0;
445                         pr_info("IOMMU enabled\n");
446                 } else if (!strncmp(str, "off", 3)) {
447                         dmar_disabled = 1;
448                         no_platform_optin = 1;
449                         pr_info("IOMMU disabled\n");
450                 } else if (!strncmp(str, "igfx_off", 8)) {
451                         dmar_map_gfx = 0;
452                         pr_info("Disable GFX device mapping\n");
453                 } else if (!strncmp(str, "forcedac", 8)) {
454                         pr_info("Forcing DAC for PCI devices\n");
455                         dmar_forcedac = 1;
456                 } else if (!strncmp(str, "strict", 6)) {
457                         pr_info("Disable batched IOTLB flush\n");
458                         intel_iommu_strict = 1;
459                 } else if (!strncmp(str, "sp_off", 6)) {
460                         pr_info("Disable supported super page\n");
461                         intel_iommu_superpage = 0;
462                 } else if (!strncmp(str, "sm_on", 5)) {
463                         pr_info("Intel-IOMMU: scalable mode supported\n");
464                         intel_iommu_sm = 1;
465                 } else if (!strncmp(str, "tboot_noforce", 13)) {
466                         printk(KERN_INFO
467                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
468                         intel_iommu_tboot_noforce = 1;
469                 }
470
471                 str += strcspn(str, ",");
472                 while (*str == ',')
473                         str++;
474         }
475         return 0;
476 }
477 __setup("intel_iommu=", intel_iommu_setup);
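/*
 * Example usage on the kernel command line (options are comma separated):
 *
 *      intel_iommu=on,strict,sp_off
 */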
478
479 static struct kmem_cache *iommu_domain_cache;
480 static struct kmem_cache *iommu_devinfo_cache;
481
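/*
 * Per-iommu domain lookup is a two-level table indexed by domain id:
 * iommu->domains[did >> 8] points to a 256-entry page (allocated on
 * demand in set_iommu_domain()) and [did & 0xff] selects the domain.
 */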
482 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
483 {
484         struct dmar_domain **domains;
485         int idx = did >> 8;
486
487         domains = iommu->domains[idx];
488         if (!domains)
489                 return NULL;
490
491         return domains[did & 0xff];
492 }
493
494 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
495                              struct dmar_domain *domain)
496 {
497         struct dmar_domain **domains;
498         int idx = did >> 8;
499
500         if (!iommu->domains[idx]) {
501                 size_t size = 256 * sizeof(struct dmar_domain *);
502                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503         }
504
505         domains = iommu->domains[idx];
506         if (WARN_ON(!domains))
507                 return;
508         else
509                 domains[did & 0xff] = domain;
510 }
511
512 void *alloc_pgtable_page(int node)
513 {
514         struct page *page;
515         void *vaddr = NULL;
516
517         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
518         if (page)
519                 vaddr = page_address(page);
520         return vaddr;
521 }
522
523 void free_pgtable_page(void *vaddr)
524 {
525         free_page((unsigned long)vaddr);
526 }
527
528 static inline void *alloc_domain_mem(void)
529 {
530         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 }
532
533 static void free_domain_mem(void *vaddr)
534 {
535         kmem_cache_free(iommu_domain_cache, vaddr);
536 }
537
538 static inline void * alloc_devinfo_mem(void)
539 {
540         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 }
542
543 static inline void free_devinfo_mem(void *vaddr)
544 {
545         kmem_cache_free(iommu_devinfo_cache, vaddr);
546 }
547
548 static inline int domain_type_is_si(struct dmar_domain *domain)
549 {
550         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
551 }
552
553 static inline int domain_pfn_supported(struct dmar_domain *domain,
554                                        unsigned long pfn)
555 {
556         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
557
558         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
559 }
560
561 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
562 {
563         unsigned long sagaw;
564         int agaw = -1;
565
566         sagaw = cap_sagaw(iommu->cap);
567         for (agaw = width_to_agaw(max_gaw);
568              agaw >= 0; agaw--) {
569                 if (test_bit(agaw, &sagaw))
570                         break;
571         }
572
573         return agaw;
574 }
575
576 /*
577  * Calculate max SAGAW for each iommu.
578  */
579 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
580 {
581         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
582 }
583
584 /*
585  * Calculate the agaw for each iommu.
586  * "SAGAW" may differ across iommus, so use a default agaw and fall back
587  * to a smaller supported agaw for iommus that don't support the default.
588  */
589 int iommu_calculate_agaw(struct intel_iommu *iommu)
590 {
591         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
592 }
593
594 /* This function only returns a single iommu in a domain */
595 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
596 {
597         int iommu_id;
598
599         /* si_domain and vm domain should not get here. */
600         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
601                 return NULL;
602
603         for_each_domain_iommu(iommu_id, domain)
604                 break;
605
606         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
607                 return NULL;
608
609         return g_iommus[iommu_id];
610 }
611
612 static void domain_update_iommu_coherency(struct dmar_domain *domain)
613 {
614         struct dmar_drhd_unit *drhd;
615         struct intel_iommu *iommu;
616         bool found = false;
617         int i;
618
619         domain->iommu_coherency = 1;
620
621         for_each_domain_iommu(i, domain) {
622                 found = true;
623                 if (!ecap_coherent(g_iommus[i]->ecap)) {
624                         domain->iommu_coherency = 0;
625                         break;
626                 }
627         }
628         if (found)
629                 return;
630
631         /* No hardware attached; use lowest common denominator */
632         rcu_read_lock();
633         for_each_active_iommu(iommu, drhd) {
634                 if (!ecap_coherent(iommu->ecap)) {
635                         domain->iommu_coherency = 0;
636                         break;
637                 }
638         }
639         rcu_read_unlock();
640 }
641
642 static int domain_update_iommu_snooping(struct intel_iommu *skip)
643 {
644         struct dmar_drhd_unit *drhd;
645         struct intel_iommu *iommu;
646         int ret = 1;
647
648         rcu_read_lock();
649         for_each_active_iommu(iommu, drhd) {
650                 if (iommu != skip) {
651                         if (!ecap_sc_support(iommu->ecap)) {
652                                 ret = 0;
653                                 break;
654                         }
655                 }
656         }
657         rcu_read_unlock();
658
659         return ret;
660 }
661
662 static int domain_update_iommu_superpage(struct intel_iommu *skip)
663 {
664         struct dmar_drhd_unit *drhd;
665         struct intel_iommu *iommu;
666         int mask = 0xf;
667
668         if (!intel_iommu_superpage) {
669                 return 0;
670         }
671
672         /* set iommu_superpage to the smallest common denominator */
673         rcu_read_lock();
674         for_each_active_iommu(iommu, drhd) {
675                 if (iommu != skip) {
676                         mask &= cap_super_page_val(iommu->cap);
677                         if (!mask)
678                                 break;
679                 }
680         }
681         rcu_read_unlock();
682
683         return fls(mask);
684 }
685
686 /* Some capabilities may be different across iommus */
687 static void domain_update_iommu_cap(struct dmar_domain *domain)
688 {
689         domain_update_iommu_coherency(domain);
690         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
691         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
692 }
693
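/*
 * In scalable mode each root entry is split: the low half points to the
 * context table for devfn 0x00-0x7f and the high half to the one for
 * devfn 0x80-0xff, and every scalable-mode context entry spans two
 * legacy-sized slots, hence the "devfn *= 2" below.
 */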
694 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
695                                          u8 devfn, int alloc)
696 {
697         struct root_entry *root = &iommu->root_entry[bus];
698         struct context_entry *context;
699         u64 *entry;
700
701         entry = &root->lo;
702         if (sm_supported(iommu)) {
703                 if (devfn >= 0x80) {
704                         devfn -= 0x80;
705                         entry = &root->hi;
706                 }
707                 devfn *= 2;
708         }
709         if (*entry & 1)
710                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
711         else {
712                 unsigned long phy_addr;
713                 if (!alloc)
714                         return NULL;
715
716                 context = alloc_pgtable_page(iommu->node);
717                 if (!context)
718                         return NULL;
719
720                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
721                 phy_addr = virt_to_phys((void *)context);
722                 *entry = phy_addr | 1;
723                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
724         }
725         return &context[devfn];
726 }
727
728 static int iommu_dummy(struct device *dev)
729 {
730         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
731 }
732
733 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
734 {
735         struct dmar_drhd_unit *drhd = NULL;
736         struct intel_iommu *iommu;
737         struct device *tmp;
738         struct pci_dev *ptmp, *pdev = NULL;
739         u16 segment = 0;
740         int i;
741
742         if (iommu_dummy(dev))
743                 return NULL;
744
745         if (dev_is_pci(dev)) {
746                 struct pci_dev *pf_pdev;
747
748                 pdev = to_pci_dev(dev);
749
750 #ifdef CONFIG_X86
751                 /* VMD child devices currently cannot be handled individually */
752                 if (is_vmd(pdev->bus))
753                         return NULL;
754 #endif
755
756                 /* VFs aren't listed in scope tables; we need to look up
757                  * the PF instead to find the IOMMU. */
758                 pf_pdev = pci_physfn(pdev);
759                 dev = &pf_pdev->dev;
760                 segment = pci_domain_nr(pdev->bus);
761         } else if (has_acpi_companion(dev))
762                 dev = &ACPI_COMPANION(dev)->dev;
763
764         rcu_read_lock();
765         for_each_active_iommu(iommu, drhd) {
766                 if (pdev && segment != drhd->segment)
767                         continue;
768
769                 for_each_active_dev_scope(drhd->devices,
770                                           drhd->devices_cnt, i, tmp) {
771                         if (tmp == dev) {
772                                 /* For a VF use its original BDF# not that of the PF
773                                  * which we used for the IOMMU lookup. Strictly speaking
774                                  * we could do this for all PCI devices; we only need to
775                                  * get the BDF# from the scope table for ACPI matches. */
776                                 if (pdev && pdev->is_virtfn)
777                                         goto got_pdev;
778
779                                 *bus = drhd->devices[i].bus;
780                                 *devfn = drhd->devices[i].devfn;
781                                 goto out;
782                         }
783
784                         if (!pdev || !dev_is_pci(tmp))
785                                 continue;
786
787                         ptmp = to_pci_dev(tmp);
788                         if (ptmp->subordinate &&
789                             ptmp->subordinate->number <= pdev->bus->number &&
790                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
791                                 goto got_pdev;
792                 }
793
794                 if (pdev && drhd->include_all) {
795                 got_pdev:
796                         *bus = pdev->bus->number;
797                         *devfn = pdev->devfn;
798                         goto out;
799                 }
800         }
801         iommu = NULL;
802  out:
803         rcu_read_unlock();
804
805         return iommu;
806 }
807
808 static void domain_flush_cache(struct dmar_domain *domain,
809                                void *addr, int size)
810 {
811         if (!domain->iommu_coherency)
812                 clflush_cache_range(addr, size);
813 }
814
815 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
816 {
817         struct context_entry *context;
818         int ret = 0;
819         unsigned long flags;
820
821         spin_lock_irqsave(&iommu->lock, flags);
822         context = iommu_context_addr(iommu, bus, devfn, 0);
823         if (context)
824                 ret = context_present(context);
825         spin_unlock_irqrestore(&iommu->lock, flags);
826         return ret;
827 }
828
829 static void free_context_table(struct intel_iommu *iommu)
830 {
831         int i;
832         unsigned long flags;
833         struct context_entry *context;
834
835         spin_lock_irqsave(&iommu->lock, flags);
836         if (!iommu->root_entry) {
837                 goto out;
838         }
839         for (i = 0; i < ROOT_ENTRY_NR; i++) {
840                 context = iommu_context_addr(iommu, i, 0, 0);
841                 if (context)
842                         free_pgtable_page(context);
843
844                 if (!sm_supported(iommu))
845                         continue;
846
847                 context = iommu_context_addr(iommu, i, 0x80, 0);
848                 if (context)
849                         free_pgtable_page(context);
850
851         }
852         free_pgtable_page(iommu->root_entry);
853         iommu->root_entry = NULL;
854 out:
855         spin_unlock_irqrestore(&iommu->lock, flags);
856 }
857
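/*
 * Walk (and, when needed, build) the page table down to *target_level for
 * @pfn.  A *target_level of 0 means "stop at whatever leaf is there", in
 * which case the level actually reached is returned through *target_level.
 */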
858 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
859                                       unsigned long pfn, int *target_level)
860 {
861         struct dma_pte *parent, *pte;
862         int level = agaw_to_level(domain->agaw);
863         int offset;
864
865         BUG_ON(!domain->pgd);
866
867         if (!domain_pfn_supported(domain, pfn))
868                 /* Address beyond IOMMU's addressing capabilities. */
869                 return NULL;
870
871         parent = domain->pgd;
872
873         while (1) {
874                 void *tmp_page;
875
876                 offset = pfn_level_offset(pfn, level);
877                 pte = &parent[offset];
878                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
879                         break;
880                 if (level == *target_level)
881                         break;
882
883                 if (!dma_pte_present(pte)) {
884                         uint64_t pteval;
885
886                         tmp_page = alloc_pgtable_page(domain->nid);
887
888                         if (!tmp_page)
889                                 return NULL;
890
891                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
892                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
893                         if (cmpxchg64(&pte->val, 0ULL, pteval))
894                                 /* Someone else set it while we were thinking; use theirs. */
895                                 free_pgtable_page(tmp_page);
896                         else
897                                 domain_flush_cache(domain, pte, sizeof(*pte));
898                 }
899                 if (level == 1)
900                         break;
901
902                 parent = phys_to_virt(dma_pte_addr(pte));
903                 level--;
904         }
905
906         if (!*target_level)
907                 *target_level = level;
908
909         return pte;
910 }
911
912
913 /* return the address's pte at a specific level */
914 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
915                                          unsigned long pfn,
916                                          int level, int *large_page)
917 {
918         struct dma_pte *parent, *pte;
919         int total = agaw_to_level(domain->agaw);
920         int offset;
921
922         parent = domain->pgd;
923         while (level <= total) {
924                 offset = pfn_level_offset(pfn, total);
925                 pte = &parent[offset];
926                 if (level == total)
927                         return pte;
928
929                 if (!dma_pte_present(pte)) {
930                         *large_page = total;
931                         break;
932                 }
933
934                 if (dma_pte_superpage(pte)) {
935                         *large_page = total;
936                         return pte;
937                 }
938
939                 parent = phys_to_virt(dma_pte_addr(pte));
940                 total--;
941         }
942         return NULL;
943 }
944
945 /* clear last level pte; a TLB flush should follow */
946 static void dma_pte_clear_range(struct dmar_domain *domain,
947                                 unsigned long start_pfn,
948                                 unsigned long last_pfn)
949 {
950         unsigned int large_page;
951         struct dma_pte *first_pte, *pte;
952
953         BUG_ON(!domain_pfn_supported(domain, start_pfn));
954         BUG_ON(!domain_pfn_supported(domain, last_pfn));
955         BUG_ON(start_pfn > last_pfn);
956
957         /* we don't need lock here; nobody else touches the iova range */
958         do {
959                 large_page = 1;
960                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
961                 if (!pte) {
962                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
963                         continue;
964                 }
965                 do {
966                         dma_clear_pte(pte);
967                         start_pfn += lvl_to_nr_pages(large_page);
968                         pte++;
969                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
970
971                 domain_flush_cache(domain, first_pte,
972                                    (void *)pte - (void *)first_pte);
973
974         } while (start_pfn && start_pfn <= last_pfn);
975 }
976
977 static void dma_pte_free_level(struct dmar_domain *domain, int level,
978                                int retain_level, struct dma_pte *pte,
979                                unsigned long pfn, unsigned long start_pfn,
980                                unsigned long last_pfn)
981 {
982         pfn = max(start_pfn, pfn);
983         pte = &pte[pfn_level_offset(pfn, level)];
984
985         do {
986                 unsigned long level_pfn;
987                 struct dma_pte *level_pte;
988
989                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
990                         goto next;
991
992                 level_pfn = pfn & level_mask(level);
993                 level_pte = phys_to_virt(dma_pte_addr(pte));
994
995                 if (level > 2) {
996                         dma_pte_free_level(domain, level - 1, retain_level,
997                                            level_pte, level_pfn, start_pfn,
998                                            last_pfn);
999                 }
1000
1001                 /*
1002                  * Free the page table if we're below the level we want to
1003                  * retain and the range covers the entire table.
1004                  */
1005                 if (level < retain_level && !(start_pfn > level_pfn ||
1006                       last_pfn < level_pfn + level_size(level) - 1)) {
1007                         dma_clear_pte(pte);
1008                         domain_flush_cache(domain, pte, sizeof(*pte));
1009                         free_pgtable_page(level_pte);
1010                 }
1011 next:
1012                 pfn += level_size(level);
1013         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1014 }
1015
1016 /*
1017  * clear last level (leaf) ptes and free page table pages below the
1018  * level we wish to keep intact.
1019  */
1020 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1021                                    unsigned long start_pfn,
1022                                    unsigned long last_pfn,
1023                                    int retain_level)
1024 {
1025         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1026         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1027         BUG_ON(start_pfn > last_pfn);
1028
1029         dma_pte_clear_range(domain, start_pfn, last_pfn);
1030
1031         /* We don't need lock here; nobody else touches the iova range */
1032         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1033                            domain->pgd, 0, start_pfn, last_pfn);
1034
1035         /* free pgd */
1036         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1037                 free_pgtable_page(domain->pgd);
1038                 domain->pgd = NULL;
1039         }
1040 }
1041
1042 /* When a page at a given level is being unlinked from its parent, we don't
1043    need to *modify* it at all. All we need to do is make a list of all the
1044    pages which can be freed just as soon as we've flushed the IOTLB and we
1045    know the hardware page-walk will no longer touch them.
1046    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1047    be freed. */
1048 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1049                                             int level, struct dma_pte *pte,
1050                                             struct page *freelist)
1051 {
1052         struct page *pg;
1053
1054         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1055         pg->freelist = freelist;
1056         freelist = pg;
1057
1058         if (level == 1)
1059                 return freelist;
1060
1061         pte = page_address(pg);
1062         do {
1063                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1064                         freelist = dma_pte_list_pagetables(domain, level - 1,
1065                                                            pte, freelist);
1066                 pte++;
1067         } while (!first_pte_in_page(pte));
1068
1069         return freelist;
1070 }
1071
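/*
 * Clear the ptes covering [start_pfn, last_pfn] at this level.  Page tables
 * that become entirely unused are not freed here; they are chained onto
 * @freelist so they can be freed once the IOTLB has been flushed.
 */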
1072 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1073                                         struct dma_pte *pte, unsigned long pfn,
1074                                         unsigned long start_pfn,
1075                                         unsigned long last_pfn,
1076                                         struct page *freelist)
1077 {
1078         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1079
1080         pfn = max(start_pfn, pfn);
1081         pte = &pte[pfn_level_offset(pfn, level)];
1082
1083         do {
1084                 unsigned long level_pfn;
1085
1086                 if (!dma_pte_present(pte))
1087                         goto next;
1088
1089                 level_pfn = pfn & level_mask(level);
1090
1091                 /* If range covers entire pagetable, free it */
1092                 if (start_pfn <= level_pfn &&
1093                     last_pfn >= level_pfn + level_size(level) - 1) {
1094                            /* These subordinate page tables are going away entirely. Don't
1095                            bother to clear them; we're just going to *free* them. */
1096                         if (level > 1 && !dma_pte_superpage(pte))
1097                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1098
1099                         dma_clear_pte(pte);
1100                         if (!first_pte)
1101                                 first_pte = pte;
1102                         last_pte = pte;
1103                 } else if (level > 1) {
1104                         /* Recurse down into a level that isn't *entirely* obsolete */
1105                         freelist = dma_pte_clear_level(domain, level - 1,
1106                                                        phys_to_virt(dma_pte_addr(pte)),
1107                                                        level_pfn, start_pfn, last_pfn,
1108                                                        freelist);
1109                 }
1110 next:
1111                 pfn += level_size(level);
1112         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1113
1114         if (first_pte)
1115                 domain_flush_cache(domain, first_pte,
1116                                    (void *)++last_pte - (void *)first_pte);
1117
1118         return freelist;
1119 }
1120
1121 /* We can't just free the pages because the IOMMU may still be walking
1122    the page tables, and may have cached the intermediate levels. The
1123    pages can only be freed after the IOTLB flush has been done. */
1124 static struct page *domain_unmap(struct dmar_domain *domain,
1125                                  unsigned long start_pfn,
1126                                  unsigned long last_pfn)
1127 {
1128         struct page *freelist;
1129
1130         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1131         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1132         BUG_ON(start_pfn > last_pfn);
1133
1134         /* we don't need lock here; nobody else touches the iova range */
1135         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1136                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1137
1138         /* free pgd */
1139         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1140                 struct page *pgd_page = virt_to_page(domain->pgd);
1141                 pgd_page->freelist = freelist;
1142                 freelist = pgd_page;
1143
1144                 domain->pgd = NULL;
1145         }
1146
1147         return freelist;
1148 }
1149
1150 static void dma_free_pagelist(struct page *freelist)
1151 {
1152         struct page *pg;
1153
1154         while ((pg = freelist)) {
1155                 freelist = pg->freelist;
1156                 free_pgtable_page(page_address(pg));
1157         }
1158 }
1159
1160 static void iova_entry_free(unsigned long data)
1161 {
1162         struct page *freelist = (struct page *)data;
1163
1164         dma_free_pagelist(freelist);
1165 }
1166
1167 /* iommu handling */
1168 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1169 {
1170         struct root_entry *root;
1171         unsigned long flags;
1172
1173         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1174         if (!root) {
1175                 pr_err("Allocating root entry for %s failed\n",
1176                         iommu->name);
1177                 return -ENOMEM;
1178         }
1179
1180         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1181
1182         spin_lock_irqsave(&iommu->lock, flags);
1183         iommu->root_entry = root;
1184         spin_unlock_irqrestore(&iommu->lock, flags);
1185
1186         return 0;
1187 }
1188
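/*
 * Program the root table address into DMAR_RTADDR_REG (tagging it as a
 * scalable-mode table when supported) and issue a Set Root Table Pointer
 * command, waiting for hardware to report completion via DMA_GSTS_RTPS.
 */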
1189 static void iommu_set_root_entry(struct intel_iommu *iommu)
1190 {
1191         u64 addr;
1192         u32 sts;
1193         unsigned long flag;
1194
1195         addr = virt_to_phys(iommu->root_entry);
1196         if (sm_supported(iommu))
1197                 addr |= DMA_RTADDR_SMT;
1198
1199         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1200         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1201
1202         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1203
1204         /* Make sure hardware complete it */
1205         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1206                       readl, (sts & DMA_GSTS_RTPS), sts);
1207
1208         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1209 }
1210
1211 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1212 {
1213         u32 val;
1214         unsigned long flag;
1215
1216         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1217                 return;
1218
1219         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1220         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1221
1222         /* Make sure hardware complete it */
1223         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1224                       readl, (!(val & DMA_GSTS_WBFS)), val);
1225
1226         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1227 }
1228
1229 /* return value determines if we need a write buffer flush */
1230 static void __iommu_flush_context(struct intel_iommu *iommu,
1231                                   u16 did, u16 source_id, u8 function_mask,
1232                                   u64 type)
1233 {
1234         u64 val = 0;
1235         unsigned long flag;
1236
1237         switch (type) {
1238         case DMA_CCMD_GLOBAL_INVL:
1239                 val = DMA_CCMD_GLOBAL_INVL;
1240                 break;
1241         case DMA_CCMD_DOMAIN_INVL:
1242                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1243                 break;
1244         case DMA_CCMD_DEVICE_INVL:
1245                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1246                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1247                 break;
1248         default:
1249                 BUG();
1250         }
1251         val |= DMA_CCMD_ICC;
1252
1253         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1254         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1255
1256         /* Make sure hardware complete it */
1257         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1258                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1259
1260         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1261 }
1262
1263 /* return value determines if we need a write buffer flush */
1264 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1265                                 u64 addr, unsigned int size_order, u64 type)
1266 {
1267         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1268         u64 val = 0, val_iva = 0;
1269         unsigned long flag;
1270
1271         switch (type) {
1272         case DMA_TLB_GLOBAL_FLUSH:
1273                 /* global flush doesn't need set IVA_REG */
1274                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1275                 break;
1276         case DMA_TLB_DSI_FLUSH:
1277                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1278                 break;
1279         case DMA_TLB_PSI_FLUSH:
1280                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1281                 /* IH bit is passed in as part of address */
1282                 val_iva = size_order | addr;
1283                 break;
1284         default:
1285                 BUG();
1286         }
1287         /* Note: set drain read/write */
1288 #if 0
1289         /*
1290          * This is probably meant to be extra safe. Looks like we can
1291          * ignore it without any impact.
1292          */
1293         if (cap_read_drain(iommu->cap))
1294                 val |= DMA_TLB_READ_DRAIN;
1295 #endif
1296         if (cap_write_drain(iommu->cap))
1297                 val |= DMA_TLB_WRITE_DRAIN;
1298
1299         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1300         /* Note: Only uses first TLB reg currently */
1301         if (val_iva)
1302                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1303         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1304
1305         /* Make sure hardware complete it */
1306         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1307                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1308
1309         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1310
1311         /* check IOTLB invalidation granularity */
1312         if (DMA_TLB_IAIG(val) == 0)
1313                 pr_err("Flush IOTLB failed\n");
1314         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1315                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1316                         (unsigned long long)DMA_TLB_IIRG(type),
1317                         (unsigned long long)DMA_TLB_IAIG(val));
1318 }
1319
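/*
 * Return the device_domain_info of the (bus, devfn) device under @iommu if
 * that device advertises ATS (a device IOTLB) and has a struct device bound,
 * otherwise NULL.  NULL is also returned when the iommu has no invalidation
 * queue.  Must be called with device_domain_lock held.
 */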
1320 static struct device_domain_info *
1321 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1322                          u8 bus, u8 devfn)
1323 {
1324         struct device_domain_info *info;
1325
1326         assert_spin_locked(&device_domain_lock);
1327
1328         if (!iommu->qi)
1329                 return NULL;
1330
1331         list_for_each_entry(info, &domain->devices, link)
1332                 if (info->iommu == iommu && info->bus == bus &&
1333                     info->devfn == devfn) {
1334                         if (info->ats_supported && info->dev)
1335                                 return info;
1336                         break;
1337                 }
1338
1339         return NULL;
1340 }
1341
1342 static void domain_update_iotlb(struct dmar_domain *domain)
1343 {
1344         struct device_domain_info *info;
1345         bool has_iotlb_device = false;
1346
1347         assert_spin_locked(&device_domain_lock);
1348
1349         list_for_each_entry(info, &domain->devices, link) {
1350                 struct pci_dev *pdev;
1351
1352                 if (!info->dev || !dev_is_pci(info->dev))
1353                         continue;
1354
1355                 pdev = to_pci_dev(info->dev);
1356                 if (pdev->ats_enabled) {
1357                         has_iotlb_device = true;
1358                         break;
1359                 }
1360         }
1361
1362         domain->has_iotlb_device = has_iotlb_device;
1363 }
1364
1365 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1366 {
1367         struct pci_dev *pdev;
1368
1369         assert_spin_locked(&device_domain_lock);
1370
1371         if (!info || !dev_is_pci(info->dev))
1372                 return;
1373
1374         pdev = to_pci_dev(info->dev);
1375         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1376          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1377          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1378          * reserved, which should be set to 0.
1379          */
1380         if (!ecap_dit(info->iommu->ecap))
1381                 info->pfsid = 0;
1382         else {
1383                 struct pci_dev *pf_pdev;
1384
1385                 /* pdev will be returned if device is not a vf */
1386                 pf_pdev = pci_physfn(pdev);
1387                 info->pfsid = pci_dev_id(pf_pdev);
1388         }
1389
1390 #ifdef CONFIG_INTEL_IOMMU_SVM
1391         /* The PCIe spec, in its wisdom, declares that the behaviour of
1392            the device if you enable PASID support after ATS support is
1393            undefined. So always enable PASID support on devices which
1394            have it, even if we can't yet know if we're ever going to
1395            use it. */
1396         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1397                 info->pasid_enabled = 1;
1398
1399         if (info->pri_supported &&
1400             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1401             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1402                 info->pri_enabled = 1;
1403 #endif
1404         if (!pdev->untrusted && info->ats_supported &&
1405             pci_ats_page_aligned(pdev) &&
1406             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1407                 info->ats_enabled = 1;
1408                 domain_update_iotlb(info->domain);
1409                 info->ats_qdep = pci_ats_queue_depth(pdev);
1410         }
1411 }
1412
1413 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1414 {
1415         struct pci_dev *pdev;
1416
1417         assert_spin_locked(&device_domain_lock);
1418
1419         if (!dev_is_pci(info->dev))
1420                 return;
1421
1422         pdev = to_pci_dev(info->dev);
1423
1424         if (info->ats_enabled) {
1425                 pci_disable_ats(pdev);
1426                 info->ats_enabled = 0;
1427                 domain_update_iotlb(info->domain);
1428         }
1429 #ifdef CONFIG_INTEL_IOMMU_SVM
1430         if (info->pri_enabled) {
1431                 pci_disable_pri(pdev);
1432                 info->pri_enabled = 0;
1433         }
1434         if (info->pasid_enabled) {
1435                 pci_disable_pasid(pdev);
1436                 info->pasid_enabled = 0;
1437         }
1438 #endif
1439 }
1440
1441 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1442                                   u64 addr, unsigned mask)
1443 {
1444         u16 sid, qdep;
1445         unsigned long flags;
1446         struct device_domain_info *info;
1447
1448         if (!domain->has_iotlb_device)
1449                 return;
1450
1451         spin_lock_irqsave(&device_domain_lock, flags);
1452         list_for_each_entry(info, &domain->devices, link) {
1453                 if (!info->ats_enabled)
1454                         continue;
1455
1456                 sid = info->bus << 8 | info->devfn;
1457                 qdep = info->ats_qdep;
1458                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1459                                 qdep, addr, mask);
1460         }
1461         spin_unlock_irqrestore(&device_domain_lock, flags);
1462 }
1463
1464 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1465                                   struct dmar_domain *domain,
1466                                   unsigned long pfn, unsigned int pages,
1467                                   int ih, int map)
1468 {
1469         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1470         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1471         u16 did = domain->iommu_did[iommu->seq_id];
1472
1473         BUG_ON(pages == 0);
1474
1475         if (ih)
1476                 ih = 1 << 6;
1477         /*
1478          * Fallback to domain selective flush if no PSI support or the size is
1479          * too big.
1480          * PSI requires page size to be 2 ^ x, and the base address is naturally
1481          * aligned to the size
1482          */
1483         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1484                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1485                                                 DMA_TLB_DSI_FLUSH);
1486         else
1487                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1488                                                 DMA_TLB_PSI_FLUSH);
1489
1490         /*
1491          * In caching mode, changes of pages from non-present to present require
1492          * flush. However, device IOTLB doesn't need to be flushed in this case.
1493          */
1494         if (!cap_caching_mode(iommu->cap) || !map)
1495                 iommu_flush_dev_iotlb(domain, addr, mask);
1496 }
1497
1498 /* Notification for newly created mappings */
1499 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1500                                         struct dmar_domain *domain,
1501                                         unsigned long pfn, unsigned int pages)
1502 {
1503         /* It's a non-present to present mapping. Only flush if caching mode */
1504         if (cap_caching_mode(iommu->cap))
1505                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1506         else
1507                 iommu_flush_write_buffer(iommu);
1508 }
1509
1510 static void iommu_flush_iova(struct iova_domain *iovad)
1511 {
1512         struct dmar_domain *domain;
1513         int idx;
1514
1515         domain = container_of(iovad, struct dmar_domain, iovad);
1516
1517         for_each_domain_iommu(idx, domain) {
1518                 struct intel_iommu *iommu = g_iommus[idx];
1519                 u16 did = domain->iommu_did[iommu->seq_id];
1520
1521                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1522
1523                 if (!cap_caching_mode(iommu->cap))
1524                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1525                                               0, MAX_AGAW_PFN_WIDTH);
1526         }
1527 }
1528
1529 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1530 {
1531         u32 pmen;
1532         unsigned long flags;
1533
1534         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1535                 return;
1536
1537         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1538         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1539         pmen &= ~DMA_PMEN_EPM;
1540         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1541
1542         /* wait for the protected region status bit to clear */
1543         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1544                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1545
1546         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1547 }
1548
1549 static void iommu_enable_translation(struct intel_iommu *iommu)
1550 {
1551         u32 sts;
1552         unsigned long flags;
1553
1554         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1555         iommu->gcmd |= DMA_GCMD_TE;
1556         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1557
1558         /* Make sure hardware completes it */
1559         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1560                       readl, (sts & DMA_GSTS_TES), sts);
1561
1562         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1563 }
1564
1565 static void iommu_disable_translation(struct intel_iommu *iommu)
1566 {
1567         u32 sts;
1568         unsigned long flag;
1569
1570         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1571         iommu->gcmd &= ~DMA_GCMD_TE;
1572         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1573
1574         /* Make sure hardware completes it */
1575         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1576                       readl, (!(sts & DMA_GSTS_TES)), sts);
1577
1578         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1579 }
1580
1581
1582 static int iommu_init_domains(struct intel_iommu *iommu)
1583 {
1584         u32 ndomains, nlongs;
1585         size_t size;
1586
1587         ndomains = cap_ndoms(iommu->cap);
1588         pr_debug("%s: Number of Domains supported <%d>\n",
1589                  iommu->name, ndomains);
1590         nlongs = BITS_TO_LONGS(ndomains);
1591
1592         spin_lock_init(&iommu->lock);
1593
1594         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1595         if (!iommu->domain_ids) {
1596                 pr_err("%s: Allocating domain id array failed\n",
1597                        iommu->name);
1598                 return -ENOMEM;
1599         }
1600
1601         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1602         iommu->domains = kzalloc(size, GFP_KERNEL);
1603
1604         if (iommu->domains) {
1605                 size = 256 * sizeof(struct dmar_domain *);
1606                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1607         }
1608
1609         if (!iommu->domains || !iommu->domains[0]) {
1610                 pr_err("%s: Allocating domain array failed\n",
1611                        iommu->name);
1612                 kfree(iommu->domain_ids);
1613                 kfree(iommu->domains);
1614                 iommu->domain_ids = NULL;
1615                 iommu->domains    = NULL;
1616                 return -ENOMEM;
1617         }
1618
1619
1620
1621         /*
1622          * If Caching mode is set, then invalid translations are tagged
1623          * with domain-id 0, hence we need to pre-allocate it. We also
1624          * use domain-id 0 as a marker for non-allocated domain-id, so
1625          * make sure it is not used for a real domain.
1626          */
1627         set_bit(0, iommu->domain_ids);
1628
1629         /*
1630          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each pasid
1631          * entry for first-level or pass-through translation modes should
1632          * be programmed with a domain id different from those used for
1633          * second-level or nested translation. We reserve a domain id for
1634          * this purpose.
1635          */
1636         if (sm_supported(iommu))
1637                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1638
1639         return 0;
1640 }
1641
1642 static void disable_dmar_iommu(struct intel_iommu *iommu)
1643 {
1644         struct device_domain_info *info, *tmp;
1645         unsigned long flags;
1646
1647         if (!iommu->domains || !iommu->domain_ids)
1648                 return;
1649
1650         spin_lock_irqsave(&device_domain_lock, flags);
1651         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1652                 struct dmar_domain *domain;
1653
1654                 if (info->iommu != iommu)
1655                         continue;
1656
1657                 if (!info->dev || !info->domain)
1658                         continue;
1659
1660                 domain = info->domain;
1661
1662                 __dmar_remove_one_dev_info(info);
1663         }
1664         spin_unlock_irqrestore(&device_domain_lock, flags);
1665
1666         if (iommu->gcmd & DMA_GCMD_TE)
1667                 iommu_disable_translation(iommu);
1668 }
1669
1670 static void free_dmar_iommu(struct intel_iommu *iommu)
1671 {
1672         if ((iommu->domains) && (iommu->domain_ids)) {
1673                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1674                 int i;
1675
1676                 for (i = 0; i < elems; i++)
1677                         kfree(iommu->domains[i]);
1678                 kfree(iommu->domains);
1679                 kfree(iommu->domain_ids);
1680                 iommu->domains = NULL;
1681                 iommu->domain_ids = NULL;
1682         }
1683
1684         g_iommus[iommu->seq_id] = NULL;
1685
1686         /* free context mapping */
1687         free_context_table(iommu);
1688
1689 #ifdef CONFIG_INTEL_IOMMU_SVM
1690         if (pasid_supported(iommu)) {
1691                 if (ecap_prs(iommu->ecap))
1692                         intel_svm_finish_prq(iommu);
1693         }
1694 #endif
1695 }
1696
1697 static struct dmar_domain *alloc_domain(int flags)
1698 {
1699         struct dmar_domain *domain;
1700
1701         domain = alloc_domain_mem();
1702         if (!domain)
1703                 return NULL;
1704
1705         memset(domain, 0, sizeof(*domain));
1706         domain->nid = NUMA_NO_NODE;
1707         domain->flags = flags;
1708         domain->has_iotlb_device = false;
1709         INIT_LIST_HEAD(&domain->devices);
1710
1711         return domain;
1712 }
1713
1714 /* Must be called with device_domain_lock and iommu->lock held */
1715 static int domain_attach_iommu(struct dmar_domain *domain,
1716                                struct intel_iommu *iommu)
1717 {
1718         unsigned long ndomains;
1719         int num;
1720
1721         assert_spin_locked(&device_domain_lock);
1722         assert_spin_locked(&iommu->lock);
1723
1724         domain->iommu_refcnt[iommu->seq_id] += 1;
1725         domain->iommu_count += 1;
1726         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1727                 ndomains = cap_ndoms(iommu->cap);
1728                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1729
1730                 if (num >= ndomains) {
1731                         pr_err("%s: No free domain ids\n", iommu->name);
1732                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1733                         domain->iommu_count -= 1;
1734                         return -ENOSPC;
1735                 }
1736
1737                 set_bit(num, iommu->domain_ids);
1738                 set_iommu_domain(iommu, num, domain);
1739
1740                 domain->iommu_did[iommu->seq_id] = num;
1741                 domain->nid                      = iommu->node;
1742
1743                 domain_update_iommu_cap(domain);
1744         }
1745
1746         return 0;
1747 }
1748
1749 static int domain_detach_iommu(struct dmar_domain *domain,
1750                                struct intel_iommu *iommu)
1751 {
1752         int num, count;
1753
1754         assert_spin_locked(&device_domain_lock);
1755         assert_spin_locked(&iommu->lock);
1756
1757         domain->iommu_refcnt[iommu->seq_id] -= 1;
1758         count = --domain->iommu_count;
1759         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1760                 num = domain->iommu_did[iommu->seq_id];
1761                 clear_bit(num, iommu->domain_ids);
1762                 set_iommu_domain(iommu, num, NULL);
1763
1764                 domain_update_iommu_cap(domain);
1765                 domain->iommu_did[iommu->seq_id] = 0;
1766         }
1767
1768         return count;
1769 }
1770
1771 static struct iova_domain reserved_iova_list;
1772 static struct lock_class_key reserved_rbtree_key;
1773
1774 static int dmar_init_reserved_ranges(void)
1775 {
1776         struct pci_dev *pdev = NULL;
1777         struct iova *iova;
1778         int i;
1779
1780         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1781
1782         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1783                 &reserved_rbtree_key);
1784
1785         /* IOAPIC ranges shouldn't be accessed by DMA */
1786         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1787                 IOVA_PFN(IOAPIC_RANGE_END));
1788         if (!iova) {
1789                 pr_err("Reserve IOAPIC range failed\n");
1790                 return -ENODEV;
1791         }
1792
1793         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1794         for_each_pci_dev(pdev) {
1795                 struct resource *r;
1796
1797                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1798                         r = &pdev->resource[i];
1799                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1800                                 continue;
1801                         iova = reserve_iova(&reserved_iova_list,
1802                                             IOVA_PFN(r->start),
1803                                             IOVA_PFN(r->end));
1804                         if (!iova) {
1805                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1806                                 return -ENODEV;
1807                         }
1808                 }
1809         }
1810         return 0;
1811 }
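
/*
 * Illustrative example (not part of the driver): by reserving every PCI
 * MMIO window above, the IOVA allocator can never hand out an address that
 * coincides with, say, a peer device's BAR at 0xe0000000 (a hypothetical
 * address); a DMA to such an address could be claimed by the peer device
 * upstream of the IOMMU instead of being translated as intended.
 */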
1812
1813 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1814 {
1815         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1816 }
1817
1818 static inline int guestwidth_to_adjustwidth(int gaw)
1819 {
1820         int agaw;
1821         int r = (gaw - 12) % 9;
1822
1823         if (r == 0)
1824                 agaw = gaw;
1825         else
1826                 agaw = gaw + 9 - r;
1827         if (agaw > 64)
1828                 agaw = 64;
1829         return agaw;
1830 }
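
/*
 * Illustrative example (not part of the driver): the adjusted guest
 * address width is rounded up so that (agaw - 12) is a multiple of the
 * 9-bit page-table stride, then clamped to 64 bits:
 *
 *     guestwidth_to_adjustwidth(39) == 39   (27 is already a multiple of 9)
 *     guestwidth_to_adjustwidth(40) == 48   (rounded up to the next level)
 *     guestwidth_to_adjustwidth(64) == 64   (66 is clamped to 64)
 */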
1831
1832 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1833                        int guest_width)
1834 {
1835         int adjust_width, agaw;
1836         unsigned long sagaw;
1837         int err;
1838
1839         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1840
1841         err = init_iova_flush_queue(&domain->iovad,
1842                                     iommu_flush_iova, iova_entry_free);
1843         if (err)
1844                 return err;
1845
1846         domain_reserve_special_ranges(domain);
1847
1848         /* calculate AGAW */
1849         if (guest_width > cap_mgaw(iommu->cap))
1850                 guest_width = cap_mgaw(iommu->cap);
1851         domain->gaw = guest_width;
1852         adjust_width = guestwidth_to_adjustwidth(guest_width);
1853         agaw = width_to_agaw(adjust_width);
1854         sagaw = cap_sagaw(iommu->cap);
1855         if (!test_bit(agaw, &sagaw)) {
1856                 /* hardware doesn't support it, choose a bigger one */
1857                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1858                 agaw = find_next_bit(&sagaw, 5, agaw);
1859                 if (agaw >= 5)
1860                         return -ENODEV;
1861         }
1862         domain->agaw = agaw;
1863
1864         if (ecap_coherent(iommu->ecap))
1865                 domain->iommu_coherency = 1;
1866         else
1867                 domain->iommu_coherency = 0;
1868
1869         if (ecap_sc_support(iommu->ecap))
1870                 domain->iommu_snooping = 1;
1871         else
1872                 domain->iommu_snooping = 0;
1873
1874         if (intel_iommu_superpage)
1875                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1876         else
1877                 domain->iommu_superpage = 0;
1878
1879         domain->nid = iommu->node;
1880
1881         /* always allocate the top pgd */
1882         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1883         if (!domain->pgd)
1884                 return -ENOMEM;
1885         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1886         return 0;
1887 }
1888
1889 static void domain_exit(struct dmar_domain *domain)
1890 {
1891         struct page *freelist;
1892
1893         /* Remove associated devices and clear attached or cached domains */
1894         domain_remove_dev_info(domain);
1895
1896         /* destroy iovas */
1897         put_iova_domain(&domain->iovad);
1898
1899         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1900
1901         dma_free_pagelist(freelist);
1902
1903         free_domain_mem(domain);
1904 }
1905
1906 /*
1907  * Get the PASID directory size for a scalable mode context entry.
1908  * A value of X in the PDTS field of a scalable mode context entry
1909  * indicates a PASID directory with 2^(X + 7) entries.
1910  */
1911 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1912 {
1913         int pds, max_pde;
1914
1915         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1916         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1917         if (pds < 7)
1918                 return 0;
1919
1920         return pds - 7;
1921 }
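
/*
 * Illustrative example (not part of the driver), assuming PASID_PDE_SHIFT
 * is 6 (64 PASIDs per directory entry): a table supporting 2^20 PASIDs
 * gives max_pde = 1 << 14, so pds = 14 and the function returns 7. Encoded
 * by context_pdts() below, PDTS = 7 describes a PASID directory with
 * 2^(7 + 7) = 16384 entries, i.e. 16384 * 64 = 2^20 PASIDs in total.
 */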
1922
1923 /*
1924  * Set the RID_PASID field of a scalable mode context entry. The
1925  * IOMMU hardware will use the PASID value set in this field for
1926  * DMA translation of requests without PASID.
1927  */
1928 static inline void
1929 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1930 {
1931         context->hi |= pasid & ((1 << 20) - 1);
1932         context->hi |= (1 << 20);
1933 }
1934
1935 /*
1936  * Set the DTE (Device-TLB Enable) field of a scalable mode context
1937  * entry.
1938  */
1939 static inline void context_set_sm_dte(struct context_entry *context)
1940 {
1941         context->lo |= (1 << 2);
1942 }
1943
1944 /*
1945  * Set the PRE (Page Request Enable) field of a scalable mode context
1946  * entry.
1947  */
1948 static inline void context_set_sm_pre(struct context_entry *context)
1949 {
1950         context->lo |= (1 << 4);
1951 }
1952
1953 /* Convert value to context PASID directory size field coding. */
1954 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1955
1956 static int domain_context_mapping_one(struct dmar_domain *domain,
1957                                       struct intel_iommu *iommu,
1958                                       struct pasid_table *table,
1959                                       u8 bus, u8 devfn)
1960 {
1961         u16 did = domain->iommu_did[iommu->seq_id];
1962         int translation = CONTEXT_TT_MULTI_LEVEL;
1963         struct device_domain_info *info = NULL;
1964         struct context_entry *context;
1965         unsigned long flags;
1966         int ret;
1967
1968         WARN_ON(did == 0);
1969
1970         if (hw_pass_through && domain_type_is_si(domain))
1971                 translation = CONTEXT_TT_PASS_THROUGH;
1972
1973         pr_debug("Set context mapping for %02x:%02x.%d\n",
1974                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1975
1976         BUG_ON(!domain->pgd);
1977
1978         spin_lock_irqsave(&device_domain_lock, flags);
1979         spin_lock(&iommu->lock);
1980
1981         ret = -ENOMEM;
1982         context = iommu_context_addr(iommu, bus, devfn, 1);
1983         if (!context)
1984                 goto out_unlock;
1985
1986         ret = 0;
1987         if (context_present(context))
1988                 goto out_unlock;
1989
1990         /*
1991          * For kdump cases, old valid entries may be cached due to
1992          * in-flight DMA and the copied pgtable, but there is no unmapping
1993          * behaviour for them, thus we need an explicit cache flush for
1994          * the newly-mapped device. For kdump, at this point, the device
1995          * is supposed to have finished reset at its driver probe stage, so
1996          * no in-flight DMA will exist, and we don't need to worry about it
1997          * hereafter.
1998          */
1999         if (context_copied(context)) {
2000                 u16 did_old = context_domain_id(context);
2001
2002                 if (did_old < cap_ndoms(iommu->cap)) {
2003                         iommu->flush.flush_context(iommu, did_old,
2004                                                    (((u16)bus) << 8) | devfn,
2005                                                    DMA_CCMD_MASK_NOBIT,
2006                                                    DMA_CCMD_DEVICE_INVL);
2007                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2008                                                  DMA_TLB_DSI_FLUSH);
2009                 }
2010         }
2011
2012         context_clear_entry(context);
2013
2014         if (sm_supported(iommu)) {
2015                 unsigned long pds;
2016
2017                 WARN_ON(!table);
2018
2019                 /* Setup the PASID DIR pointer: */
2020                 pds = context_get_sm_pds(table);
2021                 context->lo = (u64)virt_to_phys(table->table) |
2022                                 context_pdts(pds);
2023
2024                 /* Setup the RID_PASID field: */
2025                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2026
2027                 /*
2028                  * Setup the Device-TLB enable bit and Page request
2029                  * Enable bit:
2030                  */
2031                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2032                 if (info && info->ats_supported)
2033                         context_set_sm_dte(context);
2034                 if (info && info->pri_supported)
2035                         context_set_sm_pre(context);
2036         } else {
2037                 struct dma_pte *pgd = domain->pgd;
2038                 int agaw;
2039
2040                 context_set_domain_id(context, did);
2041
2042                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2043                         /*
2044                          * Skip top levels of page tables for an iommu which has
2045                          * a smaller agaw than the default. Unnecessary for PT mode.
2046                          */
2047                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2048                                 ret = -ENOMEM;
2049                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2050                                 if (!dma_pte_present(pgd))
2051                                         goto out_unlock;
2052                         }
2053
2054                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2055                         if (info && info->ats_supported)
2056                                 translation = CONTEXT_TT_DEV_IOTLB;
2057                         else
2058                                 translation = CONTEXT_TT_MULTI_LEVEL;
2059
2060                         context_set_address_root(context, virt_to_phys(pgd));
2061                         context_set_address_width(context, agaw);
2062                 } else {
2063                         /*
2064                          * In pass-through mode, AW must be programmed to
2065                          * indicate the largest AGAW value supported by
2066                          * hardware, and ASR is ignored by hardware.
2067                          */
2068                         context_set_address_width(context, iommu->msagaw);
2069                 }
2070
2071                 context_set_translation_type(context, translation);
2072         }
2073
2074         context_set_fault_enable(context);
2075         context_set_present(context);
2076         domain_flush_cache(domain, context, sizeof(*context));
2077
2078         /*
2079          * It's a non-present to present mapping. If hardware doesn't cache
2080          * non-present entries we only need to flush the write-buffer. If it
2081          * _does_ cache non-present entries, then it does so in the special
2082          * domain #0, which we have to flush:
2083          */
2084         if (cap_caching_mode(iommu->cap)) {
2085                 iommu->flush.flush_context(iommu, 0,
2086                                            (((u16)bus) << 8) | devfn,
2087                                            DMA_CCMD_MASK_NOBIT,
2088                                            DMA_CCMD_DEVICE_INVL);
2089                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2090         } else {
2091                 iommu_flush_write_buffer(iommu);
2092         }
2093         iommu_enable_dev_iotlb(info);
2094
2095         ret = 0;
2096
2097 out_unlock:
2098         spin_unlock(&iommu->lock);
2099         spin_unlock_irqrestore(&device_domain_lock, flags);
2100
2101         return ret;
2102 }
2103
2104 struct domain_context_mapping_data {
2105         struct dmar_domain *domain;
2106         struct intel_iommu *iommu;
2107         struct pasid_table *table;
2108 };
2109
2110 static int domain_context_mapping_cb(struct pci_dev *pdev,
2111                                      u16 alias, void *opaque)
2112 {
2113         struct domain_context_mapping_data *data = opaque;
2114
2115         return domain_context_mapping_one(data->domain, data->iommu,
2116                                           data->table, PCI_BUS_NUM(alias),
2117                                           alias & 0xff);
2118 }
2119
2120 static int
2121 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2122 {
2123         struct domain_context_mapping_data data;
2124         struct pasid_table *table;
2125         struct intel_iommu *iommu;
2126         u8 bus, devfn;
2127
2128         iommu = device_to_iommu(dev, &bus, &devfn);
2129         if (!iommu)
2130                 return -ENODEV;
2131
2132         table = intel_pasid_get_table(dev);
2133
2134         if (!dev_is_pci(dev))
2135                 return domain_context_mapping_one(domain, iommu, table,
2136                                                   bus, devfn);
2137
2138         data.domain = domain;
2139         data.iommu = iommu;
2140         data.table = table;
2141
2142         return pci_for_each_dma_alias(to_pci_dev(dev),
2143                                       &domain_context_mapping_cb, &data);
2144 }
2145
2146 static int domain_context_mapped_cb(struct pci_dev *pdev,
2147                                     u16 alias, void *opaque)
2148 {
2149         struct intel_iommu *iommu = opaque;
2150
2151         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2152 }
2153
2154 static int domain_context_mapped(struct device *dev)
2155 {
2156         struct intel_iommu *iommu;
2157         u8 bus, devfn;
2158
2159         iommu = device_to_iommu(dev, &bus, &devfn);
2160         if (!iommu)
2161                 return -ENODEV;
2162
2163         if (!dev_is_pci(dev))
2164                 return device_context_mapped(iommu, bus, devfn);
2165
2166         return !pci_for_each_dma_alias(to_pci_dev(dev),
2167                                        domain_context_mapped_cb, iommu);
2168 }
2169
2170 /* Return the number of VT-d pages, aligned to the MM page size */
2171 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2172                                             size_t size)
2173 {
2174         host_addr &= ~PAGE_MASK;
2175         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2176 }
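
/*
 * Illustrative example (not part of the driver), assuming 4KiB MM pages:
 * a buffer that starts at offset 0x800 within its page and is 0x1000 bytes
 * long yields PAGE_ALIGN(0x800 + 0x1000) = 0x2000, i.e. two 4KiB VT-d
 * pages. On architectures with larger MM pages (e.g. 64KiB) the result is
 * rounded up to a whole MM page worth of VT-d pages.
 */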
2177
2178 /* Return largest possible superpage level for a given mapping */
2179 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2180                                           unsigned long iov_pfn,
2181                                           unsigned long phy_pfn,
2182                                           unsigned long pages)
2183 {
2184         int support, level = 1;
2185         unsigned long pfnmerge;
2186
2187         support = domain->iommu_superpage;
2188
2189         /* To use a large page, the virtual *and* physical addresses
2190            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2191            of them will mean we have to use smaller pages. So just
2192            merge them and check both at once. */
2193         pfnmerge = iov_pfn | phy_pfn;
2194
2195         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2196                 pages >>= VTD_STRIDE_SHIFT;
2197                 if (!pages)
2198                         break;
2199                 pfnmerge >>= VTD_STRIDE_SHIFT;
2200                 level++;
2201                 support--;
2202         }
2203         return level;
2204 }
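
/*
 * Illustrative example (not part of the driver): with
 * domain->iommu_superpage == 1 (2MiB superpages supported), iov_pfn =
 * 0x200, phy_pfn = 0x400 and pages = 1024, both pfns have their low 9 bits
 * clear, so the loop runs once (pages >>= 9 leaves 2) and the function
 * returns level 2, allowing 2MiB superpages. If either pfn were not
 * 512-page aligned, it would return level 1 (4KiB pages only).
 */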
2205
2206 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2207                             struct scatterlist *sg, unsigned long phys_pfn,
2208                             unsigned long nr_pages, int prot)
2209 {
2210         struct dma_pte *first_pte = NULL, *pte = NULL;
2211         phys_addr_t uninitialized_var(pteval);
2212         unsigned long sg_res = 0;
2213         unsigned int largepage_lvl = 0;
2214         unsigned long lvl_pages = 0;
2215
2216         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2217
2218         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2219                 return -EINVAL;
2220
2221         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2222
2223         if (!sg) {
2224                 sg_res = nr_pages;
2225                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2226         }
2227
2228         while (nr_pages > 0) {
2229                 uint64_t tmp;
2230
2231                 if (!sg_res) {
2232                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2233
2234                         sg_res = aligned_nrpages(sg->offset, sg->length);
2235                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2236                         sg->dma_length = sg->length;
2237                         pteval = (sg_phys(sg) - pgoff) | prot;
2238                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2239                 }
2240
2241                 if (!pte) {
2242                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2243
2244                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2245                         if (!pte)
2246                                 return -ENOMEM;
2247                         /* It is a large page */
2248                         if (largepage_lvl > 1) {
2249                                 unsigned long nr_superpages, end_pfn;
2250
2251                                 pteval |= DMA_PTE_LARGE_PAGE;
2252                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2253
2254                                 nr_superpages = sg_res / lvl_pages;
2255                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2256
2257                                 /*
2258                                  * Ensure that old small page tables are
2259                                  * removed to make room for superpage(s).
2260                                  * We're adding new large pages, so make sure
2261                                  * we don't remove their parent tables.
2262                                  */
2263                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2264                                                        largepage_lvl + 1);
2265                         } else {
2266                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2267                         }
2268
2269                 }
2270                 /* We don't need a lock here; nobody else
2271                  * touches the iova range.
2272                  */
2273                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2274                 if (tmp) {
2275                         static int dumps = 5;
2276                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2277                                 iov_pfn, tmp, (unsigned long long)pteval);
2278                         if (dumps) {
2279                                 dumps--;
2280                                 debug_dma_dump_mappings(NULL);
2281                         }
2282                         WARN_ON(1);
2283                 }
2284
2285                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2286
2287                 BUG_ON(nr_pages < lvl_pages);
2288                 BUG_ON(sg_res < lvl_pages);
2289
2290                 nr_pages -= lvl_pages;
2291                 iov_pfn += lvl_pages;
2292                 phys_pfn += lvl_pages;
2293                 pteval += lvl_pages * VTD_PAGE_SIZE;
2294                 sg_res -= lvl_pages;
2295
2296                 /* If the next PTE would be the first in a new page, then we
2297                    need to flush the cache on the entries we've just written.
2298                    And then we'll need to recalculate 'pte', so clear it and
2299                    let it get set again in the if (!pte) block above.
2300
2301                    If we're done (!nr_pages) we need to flush the cache too.
2302
2303                    Also if we've been setting superpages, we may need to
2304                    recalculate 'pte' and switch back to smaller pages for the
2305                    end of the mapping, if the trailing size is not enough to
2306                    use another superpage (i.e. sg_res < lvl_pages). */
2307                 pte++;
2308                 if (!nr_pages || first_pte_in_page(pte) ||
2309                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2310                         domain_flush_cache(domain, first_pte,
2311                                            (void *)pte - (void *)first_pte);
2312                         pte = NULL;
2313                 }
2314
2315                 if (!sg_res && nr_pages)
2316                         sg = sg_next(sg);
2317         }
2318         return 0;
2319 }
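
/*
 * Illustrative example (not part of the driver): mapping 513 contiguous
 * pages at a 2MiB-aligned iov_pfn on hardware with 2MiB superpage support
 * first writes one large PTE covering 512 pages; because the remaining
 * count (1) is then smaller than lvl_pages (512), 'pte' is reset and the
 * trailing page is mapped with an ordinary 4KiB PTE.
 */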
2320
2321 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2322                           struct scatterlist *sg, unsigned long phys_pfn,
2323                           unsigned long nr_pages, int prot)
2324 {
2325         int iommu_id, ret;
2326         struct intel_iommu *iommu;
2327
2328         /* Do the real mapping first */
2329         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2330         if (ret)
2331                 return ret;
2332
2333         for_each_domain_iommu(iommu_id, domain) {
2334                 iommu = g_iommus[iommu_id];
2335                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2336         }
2337
2338         return 0;
2339 }
2340
2341 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2342                                     struct scatterlist *sg, unsigned long nr_pages,
2343                                     int prot)
2344 {
2345         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2346 }
2347
2348 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2349                                      unsigned long phys_pfn, unsigned long nr_pages,
2350                                      int prot)
2351 {
2352         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2353 }
2354
2355 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2356 {
2357         unsigned long flags;
2358         struct context_entry *context;
2359         u16 did_old;
2360
2361         if (!iommu)
2362                 return;
2363
2364         spin_lock_irqsave(&iommu->lock, flags);
2365         context = iommu_context_addr(iommu, bus, devfn, 0);
2366         if (!context) {
2367                 spin_unlock_irqrestore(&iommu->lock, flags);
2368                 return;
2369         }
2370         did_old = context_domain_id(context);
2371         context_clear_entry(context);
2372         __iommu_flush_cache(iommu, context, sizeof(*context));
2373         spin_unlock_irqrestore(&iommu->lock, flags);
2374         iommu->flush.flush_context(iommu,
2375                                    did_old,
2376                                    (((u16)bus) << 8) | devfn,
2377                                    DMA_CCMD_MASK_NOBIT,
2378                                    DMA_CCMD_DEVICE_INVL);
2379         iommu->flush.flush_iotlb(iommu,
2380                                  did_old,
2381                                  0,
2382                                  0,
2383                                  DMA_TLB_DSI_FLUSH);
2384 }
2385
2386 static inline void unlink_domain_info(struct device_domain_info *info)
2387 {
2388         assert_spin_locked(&device_domain_lock);
2389         list_del(&info->link);
2390         list_del(&info->global);
2391         if (info->dev)
2392                 info->dev->archdata.iommu = NULL;
2393 }
2394
2395 static void domain_remove_dev_info(struct dmar_domain *domain)
2396 {
2397         struct device_domain_info *info, *tmp;
2398         unsigned long flags;
2399
2400         spin_lock_irqsave(&device_domain_lock, flags);
2401         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2402                 __dmar_remove_one_dev_info(info);
2403         spin_unlock_irqrestore(&device_domain_lock, flags);
2404 }
2405
2406 /*
2407  * find_domain
2408  * Note: we use struct device->archdata.iommu to store the info
2409  */
2410 static struct dmar_domain *find_domain(struct device *dev)
2411 {
2412         struct device_domain_info *info;
2413
2414         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2415                 struct iommu_domain *domain;
2416
2417                 dev->archdata.iommu = NULL;
2418                 domain = iommu_get_domain_for_dev(dev);
2419                 if (domain)
2420                         intel_iommu_attach_device(domain, dev);
2421         }
2422
2423         /* No lock here, assumes no domain exit in normal case */
2424         info = dev->archdata.iommu;
2425
2426         if (likely(info))
2427                 return info->domain;
2428         return NULL;
2429 }
2430
2431 static inline struct device_domain_info *
2432 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2433 {
2434         struct device_domain_info *info;
2435
2436         list_for_each_entry(info, &device_domain_list, global)
2437                 if (info->iommu->segment == segment && info->bus == bus &&
2438                     info->devfn == devfn)
2439                         return info;
2440
2441         return NULL;
2442 }
2443
2444 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2445                                                     int bus, int devfn,
2446                                                     struct device *dev,
2447                                                     struct dmar_domain *domain)
2448 {
2449         struct dmar_domain *found = NULL;
2450         struct device_domain_info *info;
2451         unsigned long flags;
2452         int ret;
2453
2454         info = alloc_devinfo_mem();
2455         if (!info)
2456                 return NULL;
2457
2458         info->bus = bus;
2459         info->devfn = devfn;
2460         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2461         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2462         info->ats_qdep = 0;
2463         info->dev = dev;
2464         info->domain = domain;
2465         info->iommu = iommu;
2466         info->pasid_table = NULL;
2467         info->auxd_enabled = 0;
2468         INIT_LIST_HEAD(&info->auxiliary_domains);
2469
2470         if (dev && dev_is_pci(dev)) {
2471                 struct pci_dev *pdev = to_pci_dev(info->dev);
2472
2473                 if (!pdev->untrusted &&
2474                     !pci_ats_disabled() &&
2475                     ecap_dev_iotlb_support(iommu->ecap) &&
2476                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2477                     dmar_find_matched_atsr_unit(pdev))
2478                         info->ats_supported = 1;
2479
2480                 if (sm_supported(iommu)) {
2481                         if (pasid_supported(iommu)) {
2482                                 int features = pci_pasid_features(pdev);
2483                                 if (features >= 0)
2484                                         info->pasid_supported = features | 1;
2485                         }
2486
2487                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2488                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2489                                 info->pri_supported = 1;
2490                 }
2491         }
2492
2493         spin_lock_irqsave(&device_domain_lock, flags);
2494         if (dev)
2495                 found = find_domain(dev);
2496
2497         if (!found) {
2498                 struct device_domain_info *info2;
2499                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2500                 if (info2) {
2501                         found      = info2->domain;
2502                         info2->dev = dev;
2503                 }
2504         }
2505
2506         if (found) {
2507                 spin_unlock_irqrestore(&device_domain_lock, flags);
2508                 free_devinfo_mem(info);
2509                 /* Caller must free the original domain */
2510                 return found;
2511         }
2512
2513         spin_lock(&iommu->lock);
2514         ret = domain_attach_iommu(domain, iommu);
2515         spin_unlock(&iommu->lock);
2516
2517         if (ret) {
2518                 spin_unlock_irqrestore(&device_domain_lock, flags);
2519                 free_devinfo_mem(info);
2520                 return NULL;
2521         }
2522
2523         list_add(&info->link, &domain->devices);
2524         list_add(&info->global, &device_domain_list);
2525         if (dev)
2526                 dev->archdata.iommu = info;
2527         spin_unlock_irqrestore(&device_domain_lock, flags);
2528
2529         /* PASID table is mandatory for a PCI device in scalable mode. */
2530         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2531                 ret = intel_pasid_alloc_table(dev);
2532                 if (ret) {
2533                         dev_err(dev, "PASID table allocation failed\n");
2534                         dmar_remove_one_dev_info(dev);
2535                         return NULL;
2536                 }
2537
2538                 /* Setup the PASID entry for requests without PASID: */
2539                 spin_lock(&iommu->lock);
2540                 if (hw_pass_through && domain_type_is_si(domain))
2541                         ret = intel_pasid_setup_pass_through(iommu, domain,
2542                                         dev, PASID_RID2PASID);
2543                 else
2544                         ret = intel_pasid_setup_second_level(iommu, domain,
2545                                         dev, PASID_RID2PASID);
2546                 spin_unlock(&iommu->lock);
2547                 if (ret) {
2548                         dev_err(dev, "Setup RID2PASID failed\n");
2549                         dmar_remove_one_dev_info(dev);
2550                         return NULL;
2551                 }
2552         }
2553
2554         if (dev && domain_context_mapping(domain, dev)) {
2555                 dev_err(dev, "Domain context map failed\n");
2556                 dmar_remove_one_dev_info(dev);
2557                 return NULL;
2558         }
2559
2560         return domain;
2561 }
2562
2563 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2564 {
2565         *(u16 *)opaque = alias;
2566         return 0;
2567 }
2568
2569 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2570 {
2571         struct device_domain_info *info;
2572         struct dmar_domain *domain = NULL;
2573         struct intel_iommu *iommu;
2574         u16 dma_alias;
2575         unsigned long flags;
2576         u8 bus, devfn;
2577
2578         iommu = device_to_iommu(dev, &bus, &devfn);
2579         if (!iommu)
2580                 return NULL;
2581
2582         if (dev_is_pci(dev)) {
2583                 struct pci_dev *pdev = to_pci_dev(dev);
2584
2585                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2586
2587                 spin_lock_irqsave(&device_domain_lock, flags);
2588                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2589                                                       PCI_BUS_NUM(dma_alias),
2590                                                       dma_alias & 0xff);
2591                 if (info) {
2592                         iommu = info->iommu;
2593                         domain = info->domain;
2594                 }
2595                 spin_unlock_irqrestore(&device_domain_lock, flags);
2596
2597                 /* DMA alias already has a domain, use it */
2598                 if (info)
2599                         goto out;
2600         }
2601
2602         /* Allocate and initialize new domain for the device */
2603         domain = alloc_domain(0);
2604         if (!domain)
2605                 return NULL;
2606         if (domain_init(domain, iommu, gaw)) {
2607                 domain_exit(domain);
2608                 return NULL;
2609         }
2610
2611 out:
2612         return domain;
2613 }
2614
2615 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2616                                               struct dmar_domain *domain)
2617 {
2618         struct intel_iommu *iommu;
2619         struct dmar_domain *tmp;
2620         u16 req_id, dma_alias;
2621         u8 bus, devfn;
2622
2623         iommu = device_to_iommu(dev, &bus, &devfn);
2624         if (!iommu)
2625                 return NULL;
2626
2627         req_id = ((u16)bus << 8) | devfn;
2628
2629         if (dev_is_pci(dev)) {
2630                 struct pci_dev *pdev = to_pci_dev(dev);
2631
2632                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2633
2634                 /* register PCI DMA alias device */
2635                 if (req_id != dma_alias) {
2636                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2637                                         dma_alias & 0xff, NULL, domain);
2638
2639                         if (!tmp || tmp != domain)
2640                                 return tmp;
2641                 }
2642         }
2643
2644         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2645         if (!tmp || tmp != domain)
2646                 return tmp;
2647
2648         return domain;
2649 }
2650
2651 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2652 {
2653         struct dmar_domain *domain, *tmp;
2654
2655         domain = find_domain(dev);
2656         if (domain)
2657                 goto out;
2658
2659         domain = find_or_alloc_domain(dev, gaw);
2660         if (!domain)
2661                 goto out;
2662
2663         tmp = set_domain_for_dev(dev, domain);
2664         if (!tmp || domain != tmp) {
2665                 domain_exit(domain);
2666                 domain = tmp;
2667         }
2668
2669 out:
2670
2671         return domain;
2672 }
2673
2674 static int iommu_domain_identity_map(struct dmar_domain *domain,
2675                                      unsigned long long start,
2676                                      unsigned long long end)
2677 {
2678         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2679         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2680
2681         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2682                           dma_to_mm_pfn(last_vpfn))) {
2683                 pr_err("Reserving iova failed\n");
2684                 return -ENOMEM;
2685         }
2686
2687         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2688         /*
2689          * The RMRR range might overlap with the physical memory range;
2690          * clear it first.
2691          */
2692         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2693
2694         return __domain_mapping(domain, first_vpfn, NULL,
2695                                 first_vpfn, last_vpfn - first_vpfn + 1,
2696                                 DMA_PTE_READ|DMA_PTE_WRITE);
2697 }
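
/*
 * Illustrative example (not part of the driver), assuming 4KiB MM pages:
 * for a hypothetical RMRR spanning 0xd0000000 - 0xd00fffff, first_vpfn is
 * 0xd0000 and last_vpfn is 0xd00ff, so 0x100 pages (1MiB) are reserved in
 * the domain's iova allocator and then mapped 1:1 (iova == physical).
 */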
2698
2699 static int domain_prepare_identity_map(struct device *dev,
2700                                        struct dmar_domain *domain,
2701                                        unsigned long long start,
2702                                        unsigned long long end)
2703 {
2704         /* For _hardware_ passthrough, don't bother. But for software
2705            passthrough, we do it anyway -- it may indicate a memory
2706            range which is reserved in E820, and so didn't get set
2707            up in si_domain to start with */
2708         if (domain == si_domain && hw_pass_through) {
2709                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2710                          start, end);
2711                 return 0;
2712         }
2713
2714         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2715
2716         if (end < start) {
2717                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2718                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2719                         dmi_get_system_info(DMI_BIOS_VENDOR),
2720                         dmi_get_system_info(DMI_BIOS_VERSION),
2721                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2722                 return -EIO;
2723         }
2724
2725         if (end >> agaw_to_width(domain->agaw)) {
2726                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2727                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2728                      agaw_to_width(domain->agaw),
2729                      dmi_get_system_info(DMI_BIOS_VENDOR),
2730                      dmi_get_system_info(DMI_BIOS_VERSION),
2731                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2732                 return -EIO;
2733         }
2734
2735         return iommu_domain_identity_map(domain, start, end);
2736 }
2737
2738 static int iommu_prepare_identity_map(struct device *dev,
2739                                       unsigned long long start,
2740                                       unsigned long long end)
2741 {
2742         struct dmar_domain *domain;
2743         int ret;
2744
2745         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2746         if (!domain)
2747                 return -ENOMEM;
2748
2749         ret = domain_prepare_identity_map(dev, domain, start, end);
2750         if (ret)
2751                 domain_exit(domain);
2752
2753         return ret;
2754 }
2755
2756 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2757                                          struct device *dev)
2758 {
2759         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2760                 return 0;
2761         return iommu_prepare_identity_map(dev, rmrr->base_address,
2762                                           rmrr->end_address);
2763 }
2764
2765 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2766
2767 static int __init si_domain_init(int hw)
2768 {
2769         struct dmar_rmrr_unit *rmrr;
2770         struct device *dev;
2771         int i, nid, ret;
2772
2773         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2774         if (!si_domain)
2775                 return -EFAULT;
2776
2777         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2778                 domain_exit(si_domain);
2779                 return -EFAULT;
2780         }
2781
2782         if (hw)
2783                 return 0;
2784
2785         for_each_online_node(nid) {
2786                 unsigned long start_pfn, end_pfn;
2787                 int i;
2788
2789                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2790                         ret = iommu_domain_identity_map(si_domain,
2791                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2792                         if (ret)
2793                                 return ret;
2794                 }
2795         }
2796
2797         /*
2798          * Normally we use DMA domains for devices which have RMRRs. But we
2799          * relax this requirement for graphics and USB devices. Identity map
2800          * the RMRRs for graphics and USB devices so that they can use the
2801          * si_domain.
2802          */
2803         for_each_rmrr_units(rmrr) {
2804                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2805                                           i, dev) {
2806                         unsigned long long start = rmrr->base_address;
2807                         unsigned long long end = rmrr->end_address;
2808
2809                         if (device_is_rmrr_locked(dev))
2810                                 continue;
2811
2812                         if (WARN_ON(end < start ||
2813                                     end >> agaw_to_width(si_domain->agaw)))
2814                                 continue;
2815
2816                         ret = iommu_domain_identity_map(si_domain, start, end);
2817                         if (ret)
2818                                 return ret;
2819                 }
2820         }
2821
2822         return 0;
2823 }
2824
2825 static int identity_mapping(struct device *dev)
2826 {
2827         struct device_domain_info *info;
2828
2829         info = dev->archdata.iommu;
2830         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2831                 return (info->domain == si_domain);
2832
2833         return 0;
2834 }
2835
2836 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2837 {
2838         struct dmar_domain *ndomain;
2839         struct intel_iommu *iommu;
2840         u8 bus, devfn;
2841
2842         iommu = device_to_iommu(dev, &bus, &devfn);
2843         if (!iommu)
2844                 return -ENODEV;
2845
2846         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2847         if (ndomain != domain)
2848                 return -EBUSY;
2849
2850         return 0;
2851 }
2852
2853 static bool device_has_rmrr(struct device *dev)
2854 {
2855         struct dmar_rmrr_unit *rmrr;
2856         struct device *tmp;
2857         int i;
2858
2859         rcu_read_lock();
2860         for_each_rmrr_units(rmrr) {
2861                 /*
2862                  * Return TRUE if this RMRR contains the device that
2863                  * is passed in.
2864                  */
2865                 for_each_active_dev_scope(rmrr->devices,
2866                                           rmrr->devices_cnt, i, tmp)
2867                         if (tmp == dev) {
2868                                 rcu_read_unlock();
2869                                 return true;
2870                         }
2871         }
2872         rcu_read_unlock();
2873         return false;
2874 }
2875
2876 /*
2877  * There are a couple of cases where we need to restrict the functionality of
2878  * devices associated with RMRRs.  The first is when evaluating a device for
2879  * identity mapping because problems exist when devices are moved in and out
2880  * of domains and their respective RMRR information is lost.  This means that
2881  * a device with associated RMRRs will never be in a "passthrough" domain.
2882  * The second is use of the device through the IOMMU API.  This interface
2883  * expects to have full control of the IOVA space for the device.  We cannot
2884  * satisfy both the requirement that RMRR access is maintained and have an
2885  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2886  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2887  * We therefore prevent devices associated with an RMRR from participating in
2888  * the IOMMU API, which eliminates them from device assignment.
2889  *
2890  * In both cases we assume that PCI USB devices with RMRRs have them largely
2891  * for historical reasons and that the RMRR space is not actively used post
2892  * boot.  This exclusion may change if vendors begin to abuse it.
2893  *
2894  * The same exception is made for graphics devices, with the requirement that
2895  * any use of the RMRR regions will be torn down before assigning the device
2896  * to a guest.
2897  */
2898 static bool device_is_rmrr_locked(struct device *dev)
2899 {
2900         if (!device_has_rmrr(dev))
2901                 return false;
2902
2903         if (dev_is_pci(dev)) {
2904                 struct pci_dev *pdev = to_pci_dev(dev);
2905
2906                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2907                         return false;
2908         }
2909
2910         return true;
2911 }
2912
2913 /*
2914  * Return the required default domain type for a specific device.
2915  *
2916  * @dev: the device in query
2918  *
2919  * Returns:
2920  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2921  *  - IOMMU_DOMAIN_IDENTITY: device requires an identical mapping domain
2922  *  - 0: both identity and dynamic domains work for this device
2923  */
2924 static int device_def_domain_type(struct device *dev)
2925 {
2926         if (dev_is_pci(dev)) {
2927                 struct pci_dev *pdev = to_pci_dev(dev);
2928
2929                 if (device_is_rmrr_locked(dev))
2930                         return IOMMU_DOMAIN_DMA;
2931
2932                 /*
2933                  * Prevent any device marked as untrusted from getting
2934                  * placed into the static identity mapping domain.
2935                  */
2936                 if (pdev->untrusted)
2937                         return IOMMU_DOMAIN_DMA;
2938
2939                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2940                         return IOMMU_DOMAIN_IDENTITY;
2941
2942                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2943                         return IOMMU_DOMAIN_IDENTITY;
2944
2945                 /*
2946                  * We want to start off with all devices in the 1:1 domain, and
2947                  * take them out later if we find they can't access all of memory.
2948                  *
2949                  * However, we can't do this for PCI devices behind bridges,
2950                  * because all PCI devices behind the same bridge will end up
2951                  * with the same source-id on their transactions.
2952                  *
2953                  * Practically speaking, we can't change things around for these
2954                  * devices at run-time, because we can't be sure there'll be no
2955                  * DMA transactions in flight for any of their siblings.
2956                  *
2957                  * So PCI devices (unless they're on the root bus) as well as
2958                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2959                  * the 1:1 domain, just in _case_ one of their siblings turns out
2960                  * not to be able to map all of memory.
2961                  */
2962                 if (!pci_is_pcie(pdev)) {
2963                         if (!pci_is_root_bus(pdev->bus))
2964                                 return IOMMU_DOMAIN_DMA;
2965                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2966                                 return IOMMU_DOMAIN_DMA;
2967                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2968                         return IOMMU_DOMAIN_DMA;
2969         } else {
2970                 if (device_has_rmrr(dev))
2971                         return IOMMU_DOMAIN_DMA;
2972         }
2973
2974         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2975                         IOMMU_DOMAIN_IDENTITY : 0;
2976 }
2977
2978 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2979 {
2980         /*
2981          * Start from a sane iommu hardware state.
2982          * If queued invalidation was already initialized by us
2983          * (for example, while enabling interrupt remapping) then
2984          * things are already rolling from a sane state.
2985          */
2986         if (!iommu->qi) {
2987                 /*
2988                  * Clear any previous faults.
2989                  */
2990                 dmar_fault(-1, iommu);
2991                 /*
2992                  * Disable queued invalidation if supported and already enabled
2993                  * before OS handover.
2994                  */
2995                 dmar_disable_qi(iommu);
2996         }
2997
2998         if (dmar_enable_qi(iommu)) {
2999                 /*
3000                  * Queued invalidation not enabled; use register-based invalidation.
3001                  */
3002                 iommu->flush.flush_context = __iommu_flush_context;
3003                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3004                 pr_info("%s: Using Register based invalidation\n",
3005                         iommu->name);
3006         } else {
3007                 iommu->flush.flush_context = qi_flush_context;
3008                 iommu->flush.flush_iotlb = qi_flush_iotlb;
3009                 pr_info("%s: Using Queued invalidation\n", iommu->name);
3010         }
3011 }
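
/*
 * intel_iommu_init_qi() picks an invalidation backend once and publishes it
 * through two function pointers, so callers never test for queued
 * invalidation again.  A minimal standalone sketch of that fallback pattern
 * follows; every identifier here is a hypothetical stand-in, not the
 * driver's API.
 */
#include <stdio.h>

struct sketch_flush_ops {
	void (*flush_context)(void);
	void (*flush_iotlb)(void);
};

static void sketch_register_context_flush(void) { puts("register-based context flush"); }
static void sketch_register_iotlb_flush(void)   { puts("register-based iotlb flush"); }
static void sketch_queued_context_flush(void)   { puts("queued context flush"); }
static void sketch_queued_iotlb_flush(void)     { puts("queued iotlb flush"); }

/* Returns nonzero on failure, mirroring how dmar_enable_qi() is tested above. */
static int sketch_try_enable_qi(void)
{
	return -1;	/* pretend the hardware lacks queued invalidation */
}

static void sketch_init_flush_ops(struct sketch_flush_ops *ops)
{
	if (sketch_try_enable_qi()) {
		ops->flush_context = sketch_register_context_flush;
		ops->flush_iotlb   = sketch_register_iotlb_flush;
	} else {
		ops->flush_context = sketch_queued_context_flush;
		ops->flush_iotlb   = sketch_queued_iotlb_flush;
	}
}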
3012
3013 static int copy_context_table(struct intel_iommu *iommu,
3014                               struct root_entry *old_re,
3015                               struct context_entry **tbl,
3016                               int bus, bool ext)
3017 {
3018         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3019         struct context_entry *new_ce = NULL, ce;
3020         struct context_entry *old_ce = NULL;
3021         struct root_entry re;
3022         phys_addr_t old_ce_phys;
3023
3024         tbl_idx = ext ? bus * 2 : bus;
3025         memcpy(&re, old_re, sizeof(re));
3026
3027         for (devfn = 0; devfn < 256; devfn++) {
3028                 /* First calculate the correct index */
3029                 idx = (ext ? devfn * 2 : devfn) % 256;
3030
3031                 if (idx == 0) {
3032                         /* First save what we may have and clean up */
3033                         if (new_ce) {
3034                                 tbl[tbl_idx] = new_ce;
3035                                 __iommu_flush_cache(iommu, new_ce,
3036                                                     VTD_PAGE_SIZE);
3037                                 pos = 1;
3038                         }
3039
3040                         if (old_ce)
3041                                 memunmap(old_ce);
3042
3043                         ret = 0;
3044                         if (devfn < 0x80)
3045                                 old_ce_phys = root_entry_lctp(&re);
3046                         else
3047                                 old_ce_phys = root_entry_uctp(&re);
3048
3049                         if (!old_ce_phys) {
3050                                 if (ext && devfn == 0) {
3051                                         /* No LCTP, try UCTP */
3052                                         devfn = 0x7f;
3053                                         continue;
3054                                 } else {
3055                                         goto out;
3056                                 }
3057                         }
3058
3059                         ret = -ENOMEM;
3060                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3061                                         MEMREMAP_WB);
3062                         if (!old_ce)
3063                                 goto out;
3064
3065                         new_ce = alloc_pgtable_page(iommu->node);
3066                         if (!new_ce)
3067                                 goto out_unmap;
3068
3069                         ret = 0;
3070                 }
3071
3072                 /* Now copy the context entry */
3073                 memcpy(&ce, old_ce + idx, sizeof(ce));
3074
3075                 if (!__context_present(&ce))
3076                         continue;
3077
3078                 did = context_domain_id(&ce);
3079                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3080                         set_bit(did, iommu->domain_ids);
3081
3082                 /*
3083                  * We need a marker for copied context entries. This
3084                  * marker needs to work for the old format as well as
3085                  * for extended context entries.
3086                  *
3087                  * Bit 67 of the context entry is used. In the old
3088                  * format this bit is available to software, in the
3089                  * extended format it is the PGE bit, but PGE is ignored
3090                  * by HW if PASIDs are disabled (and thus still
3091                  * available).
3092                  *
3093                  * So disable PASIDs first and then mark the entry
3094                  * copied. This means that we don't copy PASID
3095                  * translations from the old kernel, but this is fine as
3096                  * faults there are not fatal.
3097                  */
3098                 context_clear_pasid_enable(&ce);
3099                 context_set_copied(&ce);
3100
3101                 new_ce[idx] = ce;
3102         }
3103
3104         tbl[tbl_idx + pos] = new_ce;
3105
3106         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3107
3108 out_unmap:
3109         memunmap(old_ce);
3110
3111 out:
3112         return ret;
3113 }
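
/*
 * The "copied" marker described in the comment above is bit 67 of the
 * 128-bit context entry, i.e. bit 3 of the upper 64-bit word (67 - 64 = 3).
 * A minimal sketch of setting and testing such a marker on a two-word entry
 * follows; the struct is an illustrative stand-in, not the driver's
 * context_entry layout.
 */
#include <stdbool.h>
#include <stdint.h>

struct sketch_entry128 {
	uint64_t lo;	/* bits   0..63  */
	uint64_t hi;	/* bits  64..127 */
};

#define SKETCH_COPIED_BIT	67

static void sketch_entry_set_copied(struct sketch_entry128 *e)
{
	e->hi |= 1ULL << (SKETCH_COPIED_BIT - 64);
}

static bool sketch_entry_is_copied(const struct sketch_entry128 *e)
{
	return e->hi & (1ULL << (SKETCH_COPIED_BIT - 64));
}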
3114
3115 static int copy_translation_tables(struct intel_iommu *iommu)
3116 {
3117         struct context_entry **ctxt_tbls;
3118         struct root_entry *old_rt;
3119         phys_addr_t old_rt_phys;
3120         int ctxt_table_entries;
3121         unsigned long flags;
3122         u64 rtaddr_reg;
3123         int bus, ret;
3124         bool new_ext, ext;
3125
3126         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3127         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3128         new_ext    = !!ecap_ecs(iommu->ecap);
3129
3130         /*
3131          * The RTT bit can only be changed when translation is disabled,
3132          * but disabling translation would open a window for data
3133          * corruption. So bail out and don't copy anything if we would
3134          * have to change the bit.
3135          */
3136         if (new_ext != ext)
3137                 return -EINVAL;
3138
3139         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3140         if (!old_rt_phys)
3141                 return -EINVAL;
3142
3143         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3144         if (!old_rt)
3145                 return -ENOMEM;
3146
3147         /* This is too big for the stack - allocate it from slab */
3148         ctxt_table_entries = ext ? 512 : 256;
3149         ret = -ENOMEM;
3150         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3151         if (!ctxt_tbls)
3152                 goto out_unmap;
3153
3154         for (bus = 0; bus < 256; bus++) {
3155                 ret = copy_context_table(iommu, &old_rt[bus],
3156                                          ctxt_tbls, bus, ext);
3157                 if (ret) {
3158                         pr_err("%s: Failed to copy context table for bus %d\n",
3159                                 iommu->name, bus);
3160                         continue;
3161                 }
3162         }
3163
3164         spin_lock_irqsave(&iommu->lock, flags);
3165
3166         /* Context tables are copied, now write them to the root_entry table */
3167         for (bus = 0; bus < 256; bus++) {
3168                 int idx = ext ? bus * 2 : bus;
3169                 u64 val;
3170
3171                 if (ctxt_tbls[idx]) {
3172                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3173                         iommu->root_entry[bus].lo = val;
3174                 }
3175
3176                 if (!ext || !ctxt_tbls[idx + 1])
3177                         continue;
3178
3179                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3180                 iommu->root_entry[bus].hi = val;
3181         }
3182
3183         spin_unlock_irqrestore(&iommu->lock, flags);
3184
3185         kfree(ctxt_tbls);
3186
3187         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3188
3189         ret = 0;
3190
3191 out_unmap:
3192         memunmap(old_rt);
3193
3194         return ret;
3195 }
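
/*
 * The loop above programs each root entry by OR-ing the copied context
 * table's physical address with bit 0, which serves as the present flag;
 * in extended mode bus N takes ctxt_tbls[2N] in the low half and
 * ctxt_tbls[2N + 1] in the high half.  A small sketch of that packing,
 * assuming 4 KiB-aligned table addresses as the driver uses; the names
 * below are illustrative only.
 */
#include <stdint.h>

#define SKETCH_ROOT_PRESENT	1ULL
#define SKETCH_PAGE_MASK	(~0xfffULL)

static uint64_t sketch_pack_root_half(uint64_t ctx_table_phys)
{
	/* The low 12 bits of a page-aligned address are zero, so bit 0 is
	 * free to carry the present flag. */
	return (ctx_table_phys & SKETCH_PAGE_MASK) | SKETCH_ROOT_PRESENT;
}

static uint64_t sketch_unpack_ctx_table(uint64_t root_half)
{
	return root_half & SKETCH_PAGE_MASK;
}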
3196
3197 static int __init init_dmars(void)
3198 {
3199         struct dmar_drhd_unit *drhd;
3200         struct intel_iommu *iommu;
3201         int ret;
3202
3203         /*
3204          * for each drhd
3205          *    allocate root
3206          *    initialize and program root entry to not present
3207          * endfor
3208          */
3209         for_each_drhd_unit(drhd) {
3210                 /*
3211                  * Lock not needed as this is only incremented in the
3212                  * single-threaded kernel __init code path; all other
3213                  * accesses are read only.
3214                  */
3215                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3216                         g_num_of_iommus++;
3217                         continue;
3218                 }
3219                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3220         }
3221
3222         /* Preallocate enough resources for IOMMU hot-addition */
3223         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3224                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3225
3226         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3227                         GFP_KERNEL);
3228         if (!g_iommus) {
3229                 pr_err("Allocating global iommu array failed\n");
3230                 ret = -ENOMEM;
3231                 goto error;
3232         }
3233
3234         for_each_active_iommu(iommu, drhd) {
3235                 /*
3236                  * Find the max PASID size of all IOMMUs in the system.
3237                  * We need to ensure the system PASID table is no bigger
3238                  * than the smallest supported size.
3239                  */
3240                 if (pasid_supported(iommu)) {
3241                         u32 temp = 2 << ecap_pss(iommu->ecap);
3242
3243                         intel_pasid_max_id = min_t(u32, temp,
3244                                                    intel_pasid_max_id);
3245                 }
3246
3247                 g_iommus[iommu->seq_id] = iommu;
3248
3249                 intel_iommu_init_qi(iommu);
3250
3251                 ret = iommu_init_domains(iommu);
3252                 if (ret)
3253                         goto free_iommu;
3254
3255                 init_translation_status(iommu);
3256
3257                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3258                         iommu_disable_translation(iommu);
3259                         clear_translation_pre_enabled(iommu);
3260                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3261                                 iommu->name);
3262                 }
3263
3264                 /*
3265                  * TBD:
3266                  * we could share the same root & context tables
3267                  * among all IOMMUs. Need to split it later.
3268                  */
3269                 ret = iommu_alloc_root_entry(iommu);
3270                 if (ret)
3271                         goto free_iommu;
3272
3273                 if (translation_pre_enabled(iommu)) {
3274                         pr_info("Translation already enabled - trying to copy translation structures\n");
3275
3276                         ret = copy_translation_tables(iommu);
3277                         if (ret) {
3278                                 /*
3279                                  * We found the IOMMU with translation
3280                                  * enabled - but failed to copy over the
3281                                  * old root-entry table. Try to proceed
3282                                  * by disabling translation now and
3283                                  * allocating a clean root-entry table.
3284                                  * This might cause DMAR faults, but
3285                                  * probably the dump will still succeed.
3286                                  */
3287                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3288                                        iommu->name);
3289                                 iommu_disable_translation(iommu);
3290                                 clear_translation_pre_enabled(iommu);
3291                         } else {
3292                                 pr_info("Copied translation tables from previous kernel for %s\n",
3293                                         iommu->name);
3294                         }
3295                 }
3296
3297                 if (!ecap_pass_through(iommu->ecap))
3298                         hw_pass_through = 0;
3299 #ifdef CONFIG_INTEL_IOMMU_SVM
3300                 if (pasid_supported(iommu))
3301                         intel_svm_init(iommu);
3302 #endif
3303         }
3304
3305         /*
3306          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3307          * caches. This is required on some Intel X58 chipsets; otherwise the
3308          * flush_context function will loop forever and the boot hangs.
3309          */
3310         for_each_active_iommu(iommu, drhd) {
3311                 iommu_flush_write_buffer(iommu);
3312                 iommu_set_root_entry(iommu);
3313                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3314                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3315         }
3316
3317         if (iommu_pass_through)
3318                 iommu_identity_mapping |= IDENTMAP_ALL;
3319
3320 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3321         dmar_map_gfx = 0;
3322 #endif
3323
3324         if (!dmar_map_gfx)
3325                 iommu_identity_mapping |= IDENTMAP_GFX;
3326
3327         check_tylersburg_isoch();
3328
3329         ret = si_domain_init(hw_pass_through);
3330         if (ret)
3331                 goto free_iommu;
3332
3333         /*
3334          * for each drhd
3335          *   enable fault log
3336          *   global invalidate context cache
3337          *   global invalidate iotlb
3338          *   enable translation
3339          */
3340         for_each_iommu(iommu, drhd) {
3341                 if (drhd->ignored) {
3342                         /*
3343                          * we always have to disable PMRs or DMA may fail on
3344                          * this device
3345                          */
3346                         if (force_on)
3347                                 iommu_disable_protect_mem_regions(iommu);
3348                         continue;
3349                 }
3350
3351                 iommu_flush_write_buffer(iommu);
3352
3353 #ifdef CONFIG_INTEL_IOMMU_SVM
3354                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3355                         /*
3356                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3357                          * could cause a lock race, so drop the lock around the call.
3358                          */
3359                         up_write(&dmar_global_lock);
3360                         ret = intel_svm_enable_prq(iommu);
3361                         down_write(&dmar_global_lock);
3362                         if (ret)
3363                                 goto free_iommu;
3364                 }
3365 #endif
3366                 ret = dmar_set_interrupt(iommu);
3367                 if (ret)
3368                         goto free_iommu;
3369         }
3370
3371         return 0;
3372
3373 free_iommu:
3374         for_each_active_iommu(iommu, drhd) {
3375                 disable_dmar_iommu(iommu);
3376                 free_dmar_iommu(iommu);
3377         }
3378
3379         kfree(g_iommus);
3380
3381 error:
3382         return ret;
3383 }
3384
3385 /* This takes a number of _MM_ pages, not VTD pages */
3386 static unsigned long intel_alloc_iova(struct device *dev,
3387                                      struct dmar_domain *domain,
3388                                      unsigned long nrpages, uint64_t dma_mask)
3389 {
3390         unsigned long iova_pfn;
3391
3392         /* Restrict dma_mask to the width that the iommu can handle */
3393         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3394         /* Ensure we reserve the whole size-aligned region */
3395         nrpages = __roundup_pow_of_two(nrpages);
3396
3397         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3398                 /*
3399                  * First try to allocate an IO virtual address below
3400                  * DMA_BIT_MASK(32); if that fails, then try allocating
3401                  * from the higher range.
3402                  */
3403                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3404                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3405                 if (iova_pfn)
3406                         return iova_pfn;
3407         }
3408         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3409                                    IOVA_PFN(dma_mask), true);
3410         if (unlikely(!iova_pfn)) {
3411                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3412                 return 0;
3413         }
3414
3415         return iova_pfn;
3416 }
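
/*
 * The allocation policy above is two-staged: when the device mask allows
 * addresses above 4 GiB (and forcedac is not set), a sub-32-bit IOVA is
 * tried first so the device can usually be addressed without dual-address
 * cycles, and only if that range is exhausted does the allocator retry with
 * the full mask.  A standalone sketch of the policy follows;
 * sketch_alloc_range() is a hypothetical allocator, not a kernel API.
 */
#include <stdint.h>

#define SKETCH_MASK_32BIT	0xffffffffULL

/* Hypothetical: allocate nrpages ending at or below limit, 0 on failure. */
extern unsigned long sketch_alloc_range(unsigned long nrpages, uint64_t limit);

static unsigned long sketch_alloc_iova(unsigned long nrpages, uint64_t dma_mask)
{
	unsigned long pfn = 0;

	if (dma_mask > SKETCH_MASK_32BIT) {
		/* Prefer an IOVA below 4 GiB first. */
		pfn = sketch_alloc_range(nrpages, SKETCH_MASK_32BIT);
	}
	if (!pfn)
		pfn = sketch_alloc_range(nrpages, dma_mask);

	return pfn;	/* 0 means the allocation failed */
}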
3417
3418 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3419 {
3420         struct dmar_domain *domain, *tmp;
3421         struct dmar_rmrr_unit *rmrr;
3422         struct device *i_dev;
3423         int i, ret;
3424
3425         /* Device shouldn't be attached to any domain. */
3426         domain = find_domain(dev);
3427         if (domain)
3428                 return NULL;
3429
3430         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3431         if (!domain)
3432                 goto out;
3433
3434         /* We have a new domain - set up possible RMRRs for the device */
3435         rcu_read_lock();
3436         for_each_rmrr_units(rmrr) {
3437                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3438                                           i, i_dev) {
3439                         if (i_dev != dev)
3440                                 continue;
3441
3442                         ret = domain_prepare_identity_map(dev, domain,
3443                                                           rmrr->base_address,
3444                                                           rmrr->end_address);
3445                         if (ret)
3446                                 dev_err(dev, "Mapping reserved region failed\n");
3447                 }
3448         }
3449         rcu_read_unlock();
3450
3451         tmp = set_domain_for_dev(dev, domain);
3452         if (!tmp || domain != tmp) {
3453                 domain_exit(domain);
3454                 domain = tmp;
3455         }
3456
3457 out:
3458         if (!domain)
3459                 dev_err(dev, "Allocating domain failed\n");
3460
3461         return domain;
3462 }
3463
3464 /* Check if the dev needs to go through the non-identity map and unmap process. */
3465 static bool iommu_need_mapping(struct device *dev)
3466 {
3467         int ret;
3468
3469         if (iommu_dummy(dev))
3470                 return false;
3471
3472         ret = identity_mapping(dev);
3473         if (ret) {
3474                 u64 dma_mask = *dev->dma_mask;
3475
3476                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3477                         dma_mask = dev->coherent_dma_mask;
3478
3479                 if (dma_mask >= dma_get_required_mask(dev))
3480                         return false;
3481
3482                 /*
3483                  * The 32-bit DMA-limited device is removed from si_domain
3484                  * and falls back to a non-identity mapping.
3485                  */
3486                 dmar_remove_one_dev_info(dev);
3487                 ret = iommu_request_dma_domain_for_dev(dev);
3488                 if (ret) {
3489                         struct iommu_domain *domain;
3490                         struct dmar_domain *dmar_domain;
3491
3492                         domain = iommu_get_domain_for_dev(dev);
3493                         if (domain) {
3494                                 dmar_domain = to_dmar_domain(domain);
3495                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3496                         }
3497                         get_private_domain_for_dev(dev);
3498                 }
3499
3500                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3501         }
3502
3503         return true;
3504 }
3505
3506 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3507                                      size_t size, int dir, u64 dma_mask)
3508 {
3509         struct dmar_domain *domain;
3510         phys_addr_t start_paddr;
3511         unsigned long iova_pfn;
3512         int prot = 0;
3513         int ret;
3514         struct intel_iommu *iommu;
3515         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3516
3517         BUG_ON(dir == DMA_NONE);
3518
3519         domain = find_domain(dev);
3520         if (!domain)
3521                 return DMA_MAPPING_ERROR;
3522
3523         iommu = domain_get_iommu(domain);
3524         size = aligned_nrpages(paddr, size);
3525
3526         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3527         if (!iova_pfn)
3528                 goto error;
3529
3530         /*
3531          * Check if DMAR supports zero-length reads on write-only
3532          * mappings.
3533          */
3534         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3535                         !cap_zlr(iommu->cap))
3536                 prot |= DMA_PTE_READ;
3537         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3538                 prot |= DMA_PTE_WRITE;
3539         /*
3540          * paddr to (paddr + size) might cover only part of a page; we should
3541          * map the whole page.  Note: if two parts of one page are mapped
3542          * separately, we might have two guest_addr mappings to the same host
3543          * paddr, but this is not a big problem.
3544          */
3545         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3546                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3547         if (ret)
3548                 goto error;
3549
3550         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3551         start_paddr += paddr & ~PAGE_MASK;
3552         return start_paddr;
3553
3554 error:
3555         if (iova_pfn)
3556                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3557         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3558                 size, (unsigned long long)paddr, dir);
3559         return DMA_MAPPING_ERROR;
3560 }
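
/*
 * A worked example of the page math in __intel_map_single(): the mapping is
 * made in whole VT-d pages and the sub-page offset of paddr is added back to
 * the returned handle.  The numbers below are made up for illustration, and
 * the helper mirrors aligned_nrpages() only in effect, not in name.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_SHIFT	12
#define SKETCH_PAGE_SIZE	(1ULL << SKETCH_PAGE_SHIFT)
#define SKETCH_PAGE_MASK_LOW	(SKETCH_PAGE_SIZE - 1)

/* Number of whole pages needed to cover [addr, addr + size). */
static unsigned long sketch_nrpages(uint64_t addr, uint64_t size)
{
	uint64_t first = addr >> SKETCH_PAGE_SHIFT;
	uint64_t last  = (addr + size - 1) >> SKETCH_PAGE_SHIFT;

	return (unsigned long)(last - first + 1);
}

int main(void)
{
	uint64_t paddr = 0x12345a80ULL;		/* arbitrary physical address */
	uint64_t size  = 0x2100;		/* arbitrary length */
	unsigned long iova_pfn = 0x80000;	/* pretend allocation result */
	uint64_t dma_addr;

	dma_addr = ((uint64_t)iova_pfn << SKETCH_PAGE_SHIFT) +
		   (paddr & SKETCH_PAGE_MASK_LOW);

	/* Prints: pages mapped: 3, dma handle: 0x80000a80 */
	printf("pages mapped: %lu, dma handle: %#llx\n",
	       sketch_nrpages(paddr, size),
	       (unsigned long long)dma_addr);
	return 0;
}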
3561
3562 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3563                                  unsigned long offset, size_t size,
3564                                  enum dma_data_direction dir,
3565                                  unsigned long attrs)
3566 {
3567         if (iommu_need_mapping(dev))
3568                 return __intel_map_single(dev, page_to_phys(page) + offset,
3569                                 size, dir, *dev->dma_mask);
3570         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3571 }
3572
3573 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3574                                      size_t size, enum dma_data_direction dir,
3575                                      unsigned long attrs)
3576 {
3577         if (iommu_need_mapping(dev))
3578                 return __intel_map_single(dev, phys_addr, size, dir,
3579                                 *dev->dma_mask);
3580         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3581 }
3582
3583 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3584 {
3585         struct dmar_domain *domain;
3586         unsigned long start_pfn, last_pfn;
3587         unsigned long nrpages;
3588         unsigned long iova_pfn;
3589         struct intel_iommu *iommu;
3590         struct page *freelist;
3591         struct pci_dev *pdev = NULL;
3592
3593         domain = find_domain(dev);
3594         BUG_ON(!domain);
3595
3596         iommu = domain_get_iommu(domain);
3597
3598         iova_pfn = IOVA_PFN(dev_addr);
3599
3600         nrpages = aligned_nrpages(dev_addr, size);
3601         start_pfn = mm_to_dma_pfn(iova_pfn);
3602         last_pfn = start_pfn + nrpages - 1;
3603
3604         if (dev_is_pci(dev))
3605                 pdev = to_pci_dev(dev);
3606
3607         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3608
3609         freelist = domain_unmap(domain, start_pfn, last_pfn);
3610
3611         if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3612                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3613                                       nrpages, !freelist, 0);
3614                 /* free iova */
3615                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3616                 dma_free_pagelist(freelist);
3617         } else {
3618                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3619                            (unsigned long)freelist);
3620                 /*
3621                  * Queue up the release of the unmap to save the roughly 1/6th
3622                  * of the CPU time used up by the IOTLB flush operation...
3623                  */
3624         }
3625 }
3626
3627 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3628                              size_t size, enum dma_data_direction dir,
3629                              unsigned long attrs)
3630 {
3631         if (iommu_need_mapping(dev))
3632                 intel_unmap(dev, dev_addr, size);
3633         else
3634                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3635 }
3636
3637 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3638                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3639 {
3640         if (iommu_need_mapping(dev))
3641                 intel_unmap(dev, dev_addr, size);
3642 }
3643
3644 static void *intel_alloc_coherent(struct device *dev, size_t size,
3645                                   dma_addr_t *dma_handle, gfp_t flags,
3646                                   unsigned long attrs)
3647 {
3648         struct page *page = NULL;
3649         int order;
3650
3651         if (!iommu_need_mapping(dev))
3652                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3653
3654         size = PAGE_ALIGN(size);
3655         order = get_order(size);
3656
3657         if (gfpflags_allow_blocking(flags)) {
3658                 unsigned int count = size >> PAGE_SHIFT;
3659
3660                 page = dma_alloc_from_contiguous(dev, count, order,
3661                                                  flags & __GFP_NOWARN);
3662         }
3663
3664         if (!page)
3665                 page = alloc_pages(flags, order);
3666         if (!page)
3667                 return NULL;
3668         memset(page_address(page), 0, size);
3669
3670         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3671                                          DMA_BIDIRECTIONAL,
3672                                          dev->coherent_dma_mask);
3673         if (*dma_handle != DMA_MAPPING_ERROR)
3674                 return page_address(page);
3675         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3676                 __free_pages(page, order);
3677
3678         return NULL;
3679 }
3680
3681 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3682                                 dma_addr_t dma_handle, unsigned long attrs)
3683 {
3684         int order;
3685         struct page *page = virt_to_page(vaddr);
3686
3687         if (!iommu_need_mapping(dev))
3688                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3689
3690         size = PAGE_ALIGN(size);
3691         order = get_order(size);
3692
3693         intel_unmap(dev, dma_handle, size);
3694         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3695                 __free_pages(page, order);
3696 }
3697
3698 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3699                            int nelems, enum dma_data_direction dir,
3700                            unsigned long attrs)
3701 {
3702         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3703         unsigned long nrpages = 0;
3704         struct scatterlist *sg;
3705         int i;
3706
3707         if (!iommu_need_mapping(dev))
3708                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3709
3710         for_each_sg(sglist, sg, nelems, i) {
3711                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3712         }
3713
3714         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3715 }
3716
3717 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3718                         enum dma_data_direction dir, unsigned long attrs)
3719 {
3720         int i;
3721         struct dmar_domain *domain;
3722         size_t size = 0;
3723         int prot = 0;
3724         unsigned long iova_pfn;
3725         int ret;
3726         struct scatterlist *sg;
3727         unsigned long start_vpfn;
3728         struct intel_iommu *iommu;
3729
3730         BUG_ON(dir == DMA_NONE);
3731         if (!iommu_need_mapping(dev))
3732                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3733
3734         domain = find_domain(dev);
3735         if (!domain)
3736                 return 0;
3737
3738         iommu = domain_get_iommu(domain);
3739
3740         for_each_sg(sglist, sg, nelems, i)
3741                 size += aligned_nrpages(sg->offset, sg->length);
3742
3743         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3744                                 *dev->dma_mask);
3745         if (!iova_pfn) {
3746                 sglist->dma_length = 0;
3747                 return 0;
3748         }
3749
3750         /*
3751          * Check if DMAR supports zero-length reads on write-only
3752          * mappings.
3753          */
3754         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3755                         !cap_zlr(iommu->cap))
3756                 prot |= DMA_PTE_READ;
3757         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3758                 prot |= DMA_PTE_WRITE;
3759
3760         start_vpfn = mm_to_dma_pfn(iova_pfn);
3761
3762         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3763         if (unlikely(ret)) {
3764                 dma_pte_free_pagetable(domain, start_vpfn,
3765                                        start_vpfn + size - 1,
3766                                        agaw_to_level(domain->agaw) + 1);
3767                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3768                 return 0;
3769         }
3770
3771         return nelems;
3772 }
3773
3774 static const struct dma_map_ops intel_dma_ops = {
3775         .alloc = intel_alloc_coherent,
3776         .free = intel_free_coherent,
3777         .map_sg = intel_map_sg,
3778         .unmap_sg = intel_unmap_sg,
3779         .map_page = intel_map_page,
3780         .unmap_page = intel_unmap_page,
3781         .map_resource = intel_map_resource,
3782         .unmap_resource = intel_unmap_resource,
3783         .dma_supported = dma_direct_supported,
3784 };
3785
3786 static inline int iommu_domain_cache_init(void)
3787 {
3788         int ret = 0;
3789
3790         iommu_domain_cache = kmem_cache_create("iommu_domain",
3791                                          sizeof(struct dmar_domain),
3792                                          0,
3793                                          SLAB_HWCACHE_ALIGN,
3794                                          NULL);
3795
3796         if (!iommu_domain_cache) {
3797                 pr_err("Couldn't create iommu_domain cache\n");
3798                 ret = -ENOMEM;
3799         }
3800
3801         return ret;
3802 }
3803
3804 static inline int iommu_devinfo_cache_init(void)
3805 {
3806         int ret = 0;
3807
3808         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3809                                          sizeof(struct device_domain_info),
3810                                          0,
3811                                          SLAB_HWCACHE_ALIGN,
3812                                          NULL);
3813         if (!iommu_devinfo_cache) {
3814                 pr_err("Couldn't create devinfo cache\n");
3815                 ret = -ENOMEM;
3816         }
3817
3818         return ret;
3819 }
3820
3821 static int __init iommu_init_mempool(void)
3822 {
3823         int ret;
3824         ret = iova_cache_get();
3825         if (ret)
3826                 return ret;
3827
3828         ret = iommu_domain_cache_init();
3829         if (ret)
3830                 goto domain_error;
3831
3832         ret = iommu_devinfo_cache_init();
3833         if (!ret)
3834                 return ret;
3835
3836         kmem_cache_destroy(iommu_domain_cache);
3837 domain_error:
3838         iova_cache_put();
3839
3840         return -ENOMEM;
3841 }
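
/*
 * iommu_init_mempool() uses the usual reverse-order unwind idiom: each setup
 * step that can fail jumps to a label that tears down only what has already
 * succeeded.  A generic sketch of the idiom follows; the step and undo
 * helpers are hypothetical.
 */
extern int sketch_step_a(void), sketch_step_b(void), sketch_step_c(void);
extern void sketch_undo_a(void), sketch_undo_b(void);

static int sketch_setup_all(void)
{
	int ret;

	ret = sketch_step_a();
	if (ret)
		return ret;

	ret = sketch_step_b();
	if (ret)
		goto undo_a;

	ret = sketch_step_c();
	if (ret)
		goto undo_b;

	return 0;

undo_b:
	sketch_undo_b();
undo_a:
	sketch_undo_a();
	return ret;
}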
3842
3843 static void __init iommu_exit_mempool(void)
3844 {
3845         kmem_cache_destroy(iommu_devinfo_cache);
3846         kmem_cache_destroy(iommu_domain_cache);
3847         iova_cache_put();
3848 }
3849
3850 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3851 {
3852         struct dmar_drhd_unit *drhd;
3853         u32 vtbar;
3854         int rc;
3855
3856         /* We know that this device on this chipset has its own IOMMU.
3857          * If we find it under a different IOMMU, then the BIOS is lying
3858          * to us. Hope that the IOMMU for this device is actually
3859          * disabled, and it needs no translation...
3860          */
3861         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3862         if (rc) {
3863                 /* "can't" happen */
3864                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3865                 return;
3866         }
3867         vtbar &= 0xffff0000;
3868
3869         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3870         drhd = dmar_find_matched_drhd_unit(pdev);
3871         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3872                             TAINT_FIRMWARE_WORKAROUND,
3873                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3874                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3875 }
3876 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3877
3878 static void __init init_no_remapping_devices(void)
3879 {
3880         struct dmar_drhd_unit *drhd;
3881         struct device *dev;
3882         int i;
3883
3884         for_each_drhd_unit(drhd) {
3885                 if (!drhd->include_all) {
3886                         for_each_active_dev_scope(drhd->devices,
3887                                                   drhd->devices_cnt, i, dev)
3888                                 break;
3889                         /* ignore DMAR unit if no devices exist */
3890                         if (i == drhd->devices_cnt)
3891                                 drhd->ignored = 1;
3892                 }
3893         }
3894
3895         for_each_active_drhd_unit(drhd) {
3896                 if (drhd->include_all)
3897                         continue;
3898
3899                 for_each_active_dev_scope(drhd->devices,
3900                                           drhd->devices_cnt, i, dev)
3901                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3902                                 break;
3903                 if (i < drhd->devices_cnt)
3904                         continue;
3905
3906                 /* This IOMMU has *only* gfx devices. Either bypass it or
3907                    set the gfx_mapped flag, as appropriate */
3908                 if (!dmar_map_gfx) {
3909                         drhd->ignored = 1;
3910                         for_each_active_dev_scope(drhd->devices,
3911                                                   drhd->devices_cnt, i, dev)
3912                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3913                 }
3914         }
3915 }
3916
3917 #ifdef CONFIG_SUSPEND
3918 static int init_iommu_hw(void)
3919 {
3920         struct dmar_drhd_unit *drhd;
3921         struct intel_iommu *iommu = NULL;
3922
3923         for_each_active_iommu(iommu, drhd)
3924                 if (iommu->qi)
3925                         dmar_reenable_qi(iommu);
3926
3927         for_each_iommu(iommu, drhd) {
3928                 if (drhd->ignored) {
3929                         /*
3930                          * we always have to disable PMRs or DMA may fail on
3931                          * this device
3932                          */
3933                         if (force_on)
3934                                 iommu_disable_protect_mem_regions(iommu);
3935                         continue;
3936                 }
3937
3938                 iommu_flush_write_buffer(iommu);
3939
3940                 iommu_set_root_entry(iommu);
3941
3942                 iommu->flush.flush_context(iommu, 0, 0, 0,
3943                                            DMA_CCMD_GLOBAL_INVL);
3944                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3945                 iommu_enable_translation(iommu);
3946                 iommu_disable_protect_mem_regions(iommu);
3947         }
3948
3949         return 0;
3950 }
3951
3952 static void iommu_flush_all(void)
3953 {
3954         struct dmar_drhd_unit *drhd;
3955         struct intel_iommu *iommu;
3956
3957         for_each_active_iommu(iommu, drhd) {
3958                 iommu->flush.flush_context(iommu, 0, 0, 0,
3959                                            DMA_CCMD_GLOBAL_INVL);
3960                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3961                                          DMA_TLB_GLOBAL_FLUSH);
3962         }
3963 }
3964
3965 static int iommu_suspend(void)
3966 {
3967         struct dmar_drhd_unit *drhd;
3968         struct intel_iommu *iommu = NULL;
3969         unsigned long flag;
3970
3971         for_each_active_iommu(iommu, drhd) {
3972                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3973                                                  GFP_ATOMIC);
3974                 if (!iommu->iommu_state)
3975                         goto nomem;
3976         }
3977
3978         iommu_flush_all();
3979
3980         for_each_active_iommu(iommu, drhd) {
3981                 iommu_disable_translation(iommu);
3982
3983                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3984
3985                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3986                         readl(iommu->reg + DMAR_FECTL_REG);
3987                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3988                         readl(iommu->reg + DMAR_FEDATA_REG);
3989                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3990                         readl(iommu->reg + DMAR_FEADDR_REG);
3991                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3992                         readl(iommu->reg + DMAR_FEUADDR_REG);
3993
3994                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3995         }
3996         return 0;
3997
3998 nomem:
3999         for_each_active_iommu(iommu, drhd)
4000                 kfree(iommu->iommu_state);
4001
4002         return -ENOMEM;
4003 }
4004
4005 static void iommu_resume(void)
4006 {
4007         struct dmar_drhd_unit *drhd;
4008         struct intel_iommu *iommu = NULL;
4009         unsigned long flag;
4010
4011         if (init_iommu_hw()) {
4012                 if (force_on)
4013                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4014                 else
4015                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4016                 return;
4017         }
4018
4019         for_each_active_iommu(iommu, drhd) {
4020
4021                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4022
4023                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4024                         iommu->reg + DMAR_FECTL_REG);
4025                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4026                         iommu->reg + DMAR_FEDATA_REG);
4027                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4028                         iommu->reg + DMAR_FEADDR_REG);
4029                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4030                         iommu->reg + DMAR_FEUADDR_REG);
4031
4032                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4033         }
4034
4035         for_each_active_iommu(iommu, drhd)
4036                 kfree(iommu->iommu_state);
4037 }
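
/*
 * iommu_suspend() snapshots the four fault-event registers of each IOMMU
 * into a small per-IOMMU array and iommu_resume() writes them back in the
 * same order under the register lock.  A standalone sketch of that
 * save/restore pattern follows; the struct and register indices below are
 * hypothetical stand-ins, not the driver's layout.
 */
#include <stdint.h>

enum sketch_saved_reg {
	SKETCH_FECTL, SKETCH_FEDATA, SKETCH_FEADDR, SKETCH_FEUADDR,
	SKETCH_NR_SAVED_REGS
};

struct sketch_iommu {
	volatile uint32_t regs[SKETCH_NR_SAVED_REGS];	/* stand-in for MMIO space */
	uint32_t saved[SKETCH_NR_SAVED_REGS];
};

static void sketch_save_fault_regs(struct sketch_iommu *iommu)
{
	for (int i = 0; i < SKETCH_NR_SAVED_REGS; i++)
		iommu->saved[i] = iommu->regs[i];
}

static void sketch_restore_fault_regs(struct sketch_iommu *iommu)
{
	for (int i = 0; i < SKETCH_NR_SAVED_REGS; i++)
		iommu->regs[i] = iommu->saved[i];
}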
4038
4039 static struct syscore_ops iommu_syscore_ops = {
4040         .resume         = iommu_resume,
4041         .suspend        = iommu_suspend,
4042 };
4043
4044 static void __init init_iommu_pm_ops(void)
4045 {
4046         register_syscore_ops(&iommu_syscore_ops);
4047 }
4048
4049 #else
4050 static inline void init_iommu_pm_ops(void) {}
4051 #endif  /* CONFIG_SUSPEND */
4052
4053
4054 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4055 {
4056         struct acpi_dmar_reserved_memory *rmrr;
4057         int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4058         struct dmar_rmrr_unit *rmrru;
4059         size_t length;
4060
4061         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4062         if (!rmrru)
4063                 goto out;
4064
4065         rmrru->hdr = header;
4066         rmrr = (struct acpi_dmar_reserved_memory *)header;
4067         rmrru->base_address = rmrr->base_address;
4068         rmrru->end_address = rmrr->end_address;
4069
4070         length = rmrr->end_address - rmrr->base_address + 1;
4071         rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4072                                               IOMMU_RESV_DIRECT);
4073         if (!rmrru->resv)
4074                 goto free_rmrru;
4075
4076         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4077                                 ((void *)rmrr) + rmrr->header.length,
4078                                 &rmrru->devices_cnt);
4079         if (rmrru->devices_cnt && rmrru->devices == NULL)
4080                 goto free_all;
4081
4082         list_add(&rmrru->list, &dmar_rmrr_units);
4083
4084         return 0;
4085 free_all:
4086         kfree(rmrru->resv);
4087 free_rmrru:
4088         kfree(rmrru);
4089 out:
4090         return -ENOMEM;
4091 }
4092
4093 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4094 {
4095         struct dmar_atsr_unit *atsru;
4096         struct acpi_dmar_atsr *tmp;
4097
4098         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4099                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4100                 if (atsr->segment != tmp->segment)
4101                         continue;
4102                 if (atsr->header.length != tmp->header.length)
4103                         continue;
4104                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4105                         return atsru;
4106         }
4107
4108         return NULL;
4109 }
4110
4111 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4112 {
4113         struct acpi_dmar_atsr *atsr;
4114         struct dmar_atsr_unit *atsru;
4115
4116         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4117                 return 0;
4118
4119         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4120         atsru = dmar_find_atsr(atsr);
4121         if (atsru)
4122                 return 0;
4123
4124         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4125         if (!atsru)
4126                 return -ENOMEM;
4127
4128         /*
4129          * If the memory is allocated from slab by the ACPI _DSM method, we
4130          * need to copy the memory content because the memory buffer will be
4131          * freed on return.
4132          */
4133         atsru->hdr = (void *)(atsru + 1);
4134         memcpy(atsru->hdr, hdr, hdr->length);
4135         atsru->include_all = atsr->flags & 0x1;
4136         if (!atsru->include_all) {
4137                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4138                                 (void *)atsr + atsr->header.length,
4139                                 &atsru->devices_cnt);
4140                 if (atsru->devices_cnt && atsru->devices == NULL) {
4141                         kfree(atsru);
4142                         return -ENOMEM;
4143                 }
4144         }
4145
4146         list_add_rcu(&atsru->list, &dmar_atsr_units);
4147
4148         return 0;
4149 }
4150
4151 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4152 {
4153         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4154         kfree(atsru);
4155 }
4156
4157 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4158 {
4159         struct acpi_dmar_atsr *atsr;
4160         struct dmar_atsr_unit *atsru;
4161
4162         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4163         atsru = dmar_find_atsr(atsr);
4164         if (atsru) {
4165                 list_del_rcu(&atsru->list);
4166                 synchronize_rcu();
4167                 intel_iommu_free_atsr(atsru);
4168         }
4169
4170         return 0;
4171 }
4172
4173 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4174 {
4175         int i;
4176         struct device *dev;
4177         struct acpi_dmar_atsr *atsr;
4178         struct dmar_atsr_unit *atsru;
4179
4180         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4181         atsru = dmar_find_atsr(atsr);
4182         if (!atsru)
4183                 return 0;
4184
4185         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4186                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4187                                           i, dev)
4188                         return -EBUSY;
4189         }
4190
4191         return 0;
4192 }
4193
4194 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4195 {
4196         int sp, ret;
4197         struct intel_iommu *iommu = dmaru->iommu;
4198
4199         if (g_iommus[iommu->seq_id])
4200                 return 0;
4201
4202         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4203                 pr_warn("%s: Doesn't support hardware pass through.\n",
4204                         iommu->name);
4205                 return -ENXIO;
4206         }
4207         if (!ecap_sc_support(iommu->ecap) &&
4208             domain_update_iommu_snooping(iommu)) {
4209                 pr_warn("%s: Doesn't support snooping.\n",
4210                         iommu->name);
4211                 return -ENXIO;
4212         }
4213         sp = domain_update_iommu_superpage(iommu) - 1;
4214         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4215                 pr_warn("%s: Doesn't support large page.\n",
4216                         iommu->name);
4217                 return -ENXIO;
4218         }
4219
4220         /*
4221          * Disable translation if already enabled prior to OS handover.
4222          */
4223         if (iommu->gcmd & DMA_GCMD_TE)
4224                 iommu_disable_translation(iommu);
4225
4226         g_iommus[iommu->seq_id] = iommu;
4227         ret = iommu_init_domains(iommu);
4228         if (ret == 0)
4229                 ret = iommu_alloc_root_entry(iommu);
4230         if (ret)
4231                 goto out;
4232
4233 #ifdef CONFIG_INTEL_IOMMU_SVM
4234         if (pasid_supported(iommu))
4235                 intel_svm_init(iommu);
4236 #endif
4237
4238         if (dmaru->ignored) {
4239                 /*
4240                  * we always have to disable PMRs or DMA may fail on this device
4241                  */
4242                 if (force_on)
4243                         iommu_disable_protect_mem_regions(iommu);
4244                 return 0;
4245         }
4246
4247         intel_iommu_init_qi(iommu);
4248         iommu_flush_write_buffer(iommu);
4249
4250 #ifdef CONFIG_INTEL_IOMMU_SVM
4251         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4252                 ret = intel_svm_enable_prq(iommu);
4253                 if (ret)
4254                         goto disable_iommu;
4255         }
4256 #endif
4257         ret = dmar_set_interrupt(iommu);
4258         if (ret)
4259                 goto disable_iommu;
4260
4261         iommu_set_root_entry(iommu);
4262         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4263         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4264         iommu_enable_translation(iommu);
4265
4266         iommu_disable_protect_mem_regions(iommu);
4267         return 0;
4268
4269 disable_iommu:
4270         disable_dmar_iommu(iommu);
4271 out:
4272         free_dmar_iommu(iommu);
4273         return ret;
4274 }
4275
4276 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4277 {
4278         int ret = 0;
4279         struct intel_iommu *iommu = dmaru->iommu;
4280
4281         if (!intel_iommu_enabled)
4282                 return 0;
4283         if (iommu == NULL)
4284                 return -EINVAL;
4285
4286         if (insert) {
4287                 ret = intel_iommu_add(dmaru);
4288         } else {
4289                 disable_dmar_iommu(iommu);
4290                 free_dmar_iommu(iommu);
4291         }
4292
4293         return ret;
4294 }
4295
4296 static void intel_iommu_free_dmars(void)
4297 {
4298         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4299         struct dmar_atsr_unit *atsru, *atsr_n;
4300
4301         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4302                 list_del(&rmrru->list);
4303                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4304                 kfree(rmrru->resv);
4305                 kfree(rmrru);
4306         }
4307
4308         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4309                 list_del(&atsru->list);
4310                 intel_iommu_free_atsr(atsru);
4311         }
4312 }
4313
4314 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4315 {
4316         int i, ret = 1;
4317         struct pci_bus *bus;
4318         struct pci_dev *bridge = NULL;
4319         struct device *tmp;
4320         struct acpi_dmar_atsr *atsr;
4321         struct dmar_atsr_unit *atsru;
4322
4323         dev = pci_physfn(dev);
4324         for (bus = dev->bus; bus; bus = bus->parent) {
4325                 bridge = bus->self;
4326                 /* If it's an integrated device, allow ATS */
4327                 if (!bridge)
4328                         return 1;
4329                 /* Connected via non-PCIe: no ATS */
4330                 if (!pci_is_pcie(bridge) ||
4331                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4332                         return 0;
4333                 /* If we found the root port, look it up in the ATSR */
4334                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4335                         break;
4336         }
4337
4338         rcu_read_lock();
4339         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4340                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4341                 if (atsr->segment != pci_domain_nr(dev->bus))
4342                         continue;
4343
4344                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4345                         if (tmp == &bridge->dev)
4346                                 goto out;
4347
4348                 if (atsru->include_all)
4349                         goto out;
4350         }
4351         ret = 0;
4352 out:
4353         rcu_read_unlock();
4354
4355         return ret;
4356 }
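
/*
 * The walk above goes upstream from the physical function: a device with no
 * upstream bridge is integrated and simply allowed to use ATS, any
 * conventional-PCI hop denies ATS, and the walk stops at the PCIe root port,
 * which is then matched against the ATSR device scopes.  A standalone sketch
 * of that upstream walk follows; struct sketch_hop is a hypothetical
 * stand-in for the pci_bus parent chain.
 */
#include <stdbool.h>
#include <stddef.h>

enum sketch_hop_type {
	SKETCH_HOP_PCIE_SWITCH,
	SKETCH_HOP_PCIE_ROOT_PORT,
	SKETCH_HOP_LEGACY_PCI_BRIDGE,
};

struct sketch_hop {
	enum sketch_hop_type type;
	struct sketch_hop *parent;	/* NULL above the root bus */
};

/*
 * Returns the root port to look up in the ATSR scope, or NULL with *denied
 * cleared for an integrated device (ATS allowed) and *denied set when a
 * legacy hop forbids ATS.
 */
static struct sketch_hop *sketch_find_root_port(struct sketch_hop *h, bool *denied)
{
	*denied = false;
	for (; h; h = h->parent) {
		if (h->type == SKETCH_HOP_LEGACY_PCI_BRIDGE) {
			*denied = true;
			return NULL;
		}
		if (h->type == SKETCH_HOP_PCIE_ROOT_PORT)
			return h;
	}
	return NULL;	/* no upstream bridge at all: integrated device */
}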
4357
4358 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4359 {
4360         int ret;
4361         struct dmar_rmrr_unit *rmrru;
4362         struct dmar_atsr_unit *atsru;
4363         struct acpi_dmar_atsr *atsr;
4364         struct acpi_dmar_reserved_memory *rmrr;
4365
4366         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4367                 return 0;
4368
4369         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4370                 rmrr = container_of(rmrru->hdr,
4371                                     struct acpi_dmar_reserved_memory, header);
4372                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4373                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4374                                 ((void *)rmrr) + rmrr->header.length,
4375                                 rmrr->segment, rmrru->devices,
4376                                 rmrru->devices_cnt);
4377                         if (ret < 0)
4378                                 return ret;
4379                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4380                         dmar_remove_dev_scope(info, rmrr->segment,
4381                                 rmrru->devices, rmrru->devices_cnt);
4382                 }
4383         }
4384
4385         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4386                 if (atsru->include_all)
4387                         continue;
4388
4389                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4390                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4391                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4392                                         (void *)atsr + atsr->header.length,
4393                                         atsr->segment, atsru->devices,
4394                                         atsru->devices_cnt);
4395                         if (ret > 0)
4396                                 break;
4397                         else if (ret < 0)
4398                                 return ret;
4399                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4400                         if (dmar_remove_dev_scope(info, atsr->segment,
4401                                         atsru->devices, atsru->devices_cnt))
4402                                 break;
4403                 }
4404         }
4405
4406         return 0;
4407 }
4408
4409 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4410                                        unsigned long val, void *v)
4411 {
4412         struct memory_notify *mhp = v;
4413         unsigned long long start, end;
4414         unsigned long start_vpfn, last_vpfn;
4415
4416         switch (val) {
4417         case MEM_GOING_ONLINE:
4418                 start = mhp->start_pfn << PAGE_SHIFT;
4419                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4420                 if (iommu_domain_identity_map(si_domain, start, end)) {
4421                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4422                                 start, end);
4423                         return NOTIFY_BAD;
4424                 }
4425                 break;
4426
4427         case MEM_OFFLINE:
4428         case MEM_CANCEL_ONLINE:
4429                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4430                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4431                 while (start_vpfn <= last_vpfn) {
4432                         struct iova *iova;
4433                         struct dmar_drhd_unit *drhd;
4434                         struct intel_iommu *iommu;
4435                         struct page *freelist;
4436
4437                         iova = find_iova(&si_domain->iovad, start_vpfn);
4438                         if (iova == NULL) {
4439                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4440                                          start_vpfn);
4441                                 break;
4442                         }
4443
4444                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4445                                                      start_vpfn, last_vpfn);
4446                         if (iova == NULL) {
4447                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4448                                         start_vpfn, last_vpfn);
4449                                 return NOTIFY_BAD;
4450                         }
4451
4452                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4453                                                iova->pfn_hi);
4454
4455                         rcu_read_lock();
4456                         for_each_active_iommu(iommu, drhd)
4457                                 iommu_flush_iotlb_psi(iommu, si_domain,
4458                                         iova->pfn_lo, iova_size(iova),
4459                                         !freelist, 0);
4460                         rcu_read_unlock();
4461                         dma_free_pagelist(freelist);
4462
4463                         start_vpfn = iova->pfn_hi + 1;
4464                         free_iova_mem(iova);
4465                 }
4466                 break;
4467         }
4468
4469         return NOTIFY_OK;
4470 }
4471
4472 static struct notifier_block intel_iommu_memory_nb = {
4473         .notifier_call = intel_iommu_memory_notifier,
4474         .priority = 0
4475 };
4476
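/*
 * Free the IOVAs cached on @cpu for every domain on every IOMMU; used by
 * the CPU hotplug "dead" callback below.
 */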
4477 static void free_all_cpu_cached_iovas(unsigned int cpu)
4478 {
4479         int i;
4480
4481         for (i = 0; i < g_num_of_iommus; i++) {
4482                 struct intel_iommu *iommu = g_iommus[i];
4483                 struct dmar_domain *domain;
4484                 int did;
4485
4486                 if (!iommu)
4487                         continue;
4488
4489                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4490                         domain = get_iommu_domain(iommu, (u16)did);
4491
4492                         if (!domain)
4493                                 continue;
4494                         free_cpu_cached_iovas(cpu, &domain->iovad);
4495                 }
4496         }
4497 }
4498
4499 static int intel_iommu_cpu_dead(unsigned int cpu)
4500 {
4501         free_all_cpu_cached_iovas(cpu);
4502         return 0;
4503 }
4504
4505 static void intel_disable_iommus(void)
4506 {
4507         struct intel_iommu *iommu = NULL;
4508         struct dmar_drhd_unit *drhd;
4509
4510         for_each_iommu(iommu, drhd)
4511                 iommu_disable_translation(iommu);
4512 }
4513
4514 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4515 {
4516         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4517
4518         return container_of(iommu_dev, struct intel_iommu, iommu);
4519 }
4520
4521 static ssize_t intel_iommu_show_version(struct device *dev,
4522                                         struct device_attribute *attr,
4523                                         char *buf)
4524 {
4525         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4526         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4527         return sprintf(buf, "%d:%d\n",
4528                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4529 }
4530 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4531
4532 static ssize_t intel_iommu_show_address(struct device *dev,
4533                                         struct device_attribute *attr,
4534                                         char *buf)
4535 {
4536         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4537         return sprintf(buf, "%llx\n", iommu->reg_phys);
4538 }
4539 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4540
4541 static ssize_t intel_iommu_show_cap(struct device *dev,
4542                                     struct device_attribute *attr,
4543                                     char *buf)
4544 {
4545         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4546         return sprintf(buf, "%llx\n", iommu->cap);
4547 }
4548 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4549
4550 static ssize_t intel_iommu_show_ecap(struct device *dev,
4551                                     struct device_attribute *attr,
4552                                     char *buf)
4553 {
4554         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4555         return sprintf(buf, "%llx\n", iommu->ecap);
4556 }
4557 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4558
4559 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4560                                       struct device_attribute *attr,
4561                                       char *buf)
4562 {
4563         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4564         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4565 }
4566 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4567
4568 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4569                                            struct device_attribute *attr,
4570                                            char *buf)
4571 {
4572         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4573         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4574                                                   cap_ndoms(iommu->cap)));
4575 }
4576 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4577
4578 static struct attribute *intel_iommu_attrs[] = {
4579         &dev_attr_version.attr,
4580         &dev_attr_address.attr,
4581         &dev_attr_cap.attr,
4582         &dev_attr_ecap.attr,
4583         &dev_attr_domains_supported.attr,
4584         &dev_attr_domains_used.attr,
4585         NULL,
4586 };
4587
4588 static struct attribute_group intel_iommu_group = {
4589         .name = "intel-iommu",
4590         .attrs = intel_iommu_attrs,
4591 };
4592
4593 const struct attribute_group *intel_iommu_groups[] = {
4594         &intel_iommu_group,
4595         NULL,
4596 };
4597
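/*
 * Force the IOMMU on when the platform opts in via the DMAR table and at
 * least one untrusted PCI device is present, overriding "no_iommu" and
 * "dmar_disabled".  Returns 1 if the IOMMU was force enabled, 0 otherwise.
 */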
4598 static int __init platform_optin_force_iommu(void)
4599 {
4600         struct pci_dev *pdev = NULL;
4601         bool has_untrusted_dev = false;
4602
4603         if (!dmar_platform_optin() || no_platform_optin)
4604                 return 0;
4605
4606         for_each_pci_dev(pdev) {
4607                 if (pdev->untrusted) {
4608                         has_untrusted_dev = true;
4609                         break;
4610                 }
4611         }
4612
4613         if (!has_untrusted_dev)
4614                 return 0;
4615
4616         if (no_iommu || dmar_disabled)
4617                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4618
4619         /*
4620          * If Intel-IOMMU is disabled by default, we will apply identity
4621          * map for all devices except those marked as being untrusted.
4622          */
4623         if (dmar_disabled)
4624                 iommu_identity_mapping |= IDENTMAP_ALL;
4625
4626         dmar_disabled = 0;
4627 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4628         swiotlb = 0;
4629 #endif
4630         no_iommu = 0;
4631
4632         return 1;
4633 }
4634
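/*
 * Walk the device scopes of the active DRHD units and probe the physical
 * nodes of any ACPI name-space devices that are not yet part of an IOMMU
 * group.
 */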
4635 static int __init probe_acpi_namespace_devices(void)
4636 {
4637         struct dmar_drhd_unit *drhd;
4638         struct intel_iommu *iommu;
4639         struct device *dev;
4640         int i, ret = 0;
4641
4642         for_each_active_iommu(iommu, drhd) {
4643                 for_each_active_dev_scope(drhd->devices,
4644                                           drhd->devices_cnt, i, dev) {
4645                         struct acpi_device_physical_node *pn;
4646                         struct iommu_group *group;
4647                         struct acpi_device *adev;
4648
4649                         if (dev->bus != &acpi_bus_type)
4650                                 continue;
4651
4652                         adev = to_acpi_device(dev);
4653                         mutex_lock(&adev->physical_node_lock);
4654                         list_for_each_entry(pn,
4655                                             &adev->physical_node_list, node) {
4656                                 group = iommu_group_get(pn->dev);
4657                                 if (group) {
4658                                         iommu_group_put(group);
4659                                         continue;
4660                                 }
4661
4662                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4663                                 ret = iommu_probe_device(pn->dev);
4664                                 if (ret)
4665                                         break;
4666                         }
4667                         mutex_unlock(&adev->physical_node_lock);
4668
4669                         if (ret)
4670                                 return ret;
4671                 }
4672         }
4673
4674         return 0;
4675 }
4676
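/*
 * Main initialization entry point: parse the DMAR table and device
 * scopes, set up DMA remapping via init_dmars(), register the sysfs
 * entries, notifiers and IOMMU ops, and finally enable translation on
 * each IOMMU unit.
 */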
4677 int __init intel_iommu_init(void)
4678 {
4679         int ret = -ENODEV;
4680         struct dmar_drhd_unit *drhd;
4681         struct intel_iommu *iommu;
4682
4683         /*
4684          * Intel IOMMU is required for a TXT/tboot launch or platform
4685          * opt in, so enforce that.
4686          */
4687         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4688
4689         if (iommu_init_mempool()) {
4690                 if (force_on)
4691                         panic("tboot: Failed to initialize iommu memory\n");
4692                 return -ENOMEM;
4693         }
4694
4695         down_write(&dmar_global_lock);
4696         if (dmar_table_init()) {
4697                 if (force_on)
4698                         panic("tboot: Failed to initialize DMAR table\n");
4699                 goto out_free_dmar;
4700         }
4701
4702         if (dmar_dev_scope_init() < 0) {
4703                 if (force_on)
4704                         panic("tboot: Failed to initialize DMAR device scope\n");
4705                 goto out_free_dmar;
4706         }
4707
4708         up_write(&dmar_global_lock);
4709
4710         /*
4711          * The bus notifier takes the dmar_global_lock, so lockdep will
4712          * complain later when we register it under the lock.
4713          */
4714         dmar_register_bus_notifier();
4715
4716         down_write(&dmar_global_lock);
4717
4718         if (no_iommu || dmar_disabled) {
4719                 /*
4720                  * We exit the function here to ensure the IOMMU's remapping and
4721                  * mempool aren't set up, which means that the IOMMU's PMRs
4722                  * won't be disabled via the call to init_dmars(). So disable
4723                  * them explicitly here. The PMRs were set up by tboot prior to
4724                  * calling SENTER, but the kernel is expected to reset/tear
4725                  * down the PMRs.
4726                  */
4727                 if (intel_iommu_tboot_noforce) {
4728                         for_each_iommu(iommu, drhd)
4729                                 iommu_disable_protect_mem_regions(iommu);
4730                 }
4731
4732                 /*
4733                  * Make sure the IOMMUs are switched off, even when we
4734                  * boot into a kexec kernel and the previous kernel left
4735                  * them enabled
4736                  */
4737                 intel_disable_iommus();
4738                 goto out_free_dmar;
4739         }
4740
4741         if (list_empty(&dmar_rmrr_units))
4742                 pr_info("No RMRR found\n");
4743
4744         if (list_empty(&dmar_atsr_units))
4745                 pr_info("No ATSR found\n");
4746
4747         if (dmar_init_reserved_ranges()) {
4748                 if (force_on)
4749                         panic("tboot: Failed to reserve iommu ranges\n");
4750                 goto out_free_reserved_range;
4751         }
4752
4753         if (dmar_map_gfx)
4754                 intel_iommu_gfx_mapped = 1;
4755
4756         init_no_remapping_devices();
4757
4758         ret = init_dmars();
4759         if (ret) {
4760                 if (force_on)
4761                         panic("tboot: Failed to initialize DMARs\n");
4762                 pr_err("Initialization failed\n");
4763                 goto out_free_reserved_range;
4764         }
4765         up_write(&dmar_global_lock);
4766
4767 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4768         swiotlb = 0;
4769 #endif
4770         dma_ops = &intel_dma_ops;
4771
4772         init_iommu_pm_ops();
4773
4774         for_each_active_iommu(iommu, drhd) {
4775                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4776                                        intel_iommu_groups,
4777                                        "%s", iommu->name);
4778                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4779                 iommu_device_register(&iommu->iommu);
4780         }
4781
4782         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4783         if (si_domain && !hw_pass_through)
4784                 register_memory_notifier(&intel_iommu_memory_nb);
4785         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4786                           intel_iommu_cpu_dead);
4787
4788         if (probe_acpi_namespace_devices())
4789                 pr_warn("ACPI name space devices didn't probe correctly\n");
4790
4791         /* Finally, we enable the DMA remapping hardware. */
4792         for_each_iommu(iommu, drhd) {
4793                 if (!translation_pre_enabled(iommu))
4794                         iommu_enable_translation(iommu);
4795
4796                 iommu_disable_protect_mem_regions(iommu);
4797         }
4798         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4799
4800         intel_iommu_enabled = 1;
4801         intel_iommu_debugfs_init();
4802
4803         return 0;
4804
4805 out_free_reserved_range:
4806         put_iova_domain(&reserved_iova_list);
4807 out_free_dmar:
4808         intel_iommu_free_dmars();
4809         up_write(&dmar_global_lock);
4810         iommu_exit_mempool();
4811         return ret;
4812 }
4813
4814 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4815 {
4816         struct intel_iommu *iommu = opaque;
4817
4818         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4819         return 0;
4820 }
4821
4822 /*
4823  * NB - intel-iommu lacks any sort of reference counting for the users of
4824  * dependent devices.  If multiple endpoints have intersecting dependent
4825  * devices, unbinding the driver from any one of them will possibly leave
4826  * the others unable to operate.
4827  */
4828 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4829 {
4830         if (!iommu || !dev || !dev_is_pci(dev))
4831                 return;
4832
4833         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4834 }
4835
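/*
 * Unlink a device from its domain: tear down its PASID entry and device
 * IOTLB, clear its context entries, drop the per-IOMMU domain reference
 * and, for devices using a private domain, release that domain as well.
 * Caller must hold device_domain_lock.
 */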
4836 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4837 {
4838         struct dmar_domain *domain;
4839         struct intel_iommu *iommu;
4840         unsigned long flags;
4841
4842         assert_spin_locked(&device_domain_lock);
4843
4844         if (WARN_ON(!info))
4845                 return;
4846
4847         iommu = info->iommu;
4848         domain = info->domain;
4849
4850         if (info->dev) {
4851                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4852                         intel_pasid_tear_down_entry(iommu, info->dev,
4853                                         PASID_RID2PASID);
4854
4855                 iommu_disable_dev_iotlb(info);
4856                 domain_context_clear(iommu, info->dev);
4857                 intel_pasid_free_table(info->dev);
4858         }
4859
4860         unlink_domain_info(info);
4861
4862         spin_lock_irqsave(&iommu->lock, flags);
4863         domain_detach_iommu(domain, iommu);
4864         spin_unlock_irqrestore(&iommu->lock, flags);
4865
4866         /* free the private domain */
4867         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4868             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4869                 domain_exit(info->domain);
4870
4871         free_devinfo_mem(info);
4872 }
4873
4874 static void dmar_remove_one_dev_info(struct device *dev)
4875 {
4876         struct device_domain_info *info;
4877         unsigned long flags;
4878
4879         spin_lock_irqsave(&device_domain_lock, flags);
4880         info = dev->archdata.iommu;
4881         __dmar_remove_one_dev_info(info);
4882         spin_unlock_irqrestore(&device_domain_lock, flags);
4883 }
4884
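/*
 * Initialize a domain allocated through the iommu_ops domain_alloc path:
 * set up its IOVA allocator, derive the adjusted guest address width and
 * allocate the top-level page directory.
 */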
4885 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4886 {
4887         int adjust_width;
4888
4889         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4890         domain_reserve_special_ranges(domain);
4891
4892         /* calculate AGAW */
4893         domain->gaw = guest_width;
4894         adjust_width = guestwidth_to_adjustwidth(guest_width);
4895         domain->agaw = width_to_agaw(adjust_width);
4896
4897         domain->iommu_coherency = 0;
4898         domain->iommu_snooping = 0;
4899         domain->iommu_superpage = 0;
4900         domain->max_addr = 0;
4901
4902         /* always allocate the top pgd */
4903         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4904         if (!domain->pgd)
4905                 return -ENOMEM;
4906         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4907         return 0;
4908 }
4909
4910 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4911 {
4912         struct dmar_domain *dmar_domain;
4913         struct iommu_domain *domain;
4914
4915         switch (type) {
4916         case IOMMU_DOMAIN_DMA:
4917         /* fallthrough */
4918         case IOMMU_DOMAIN_UNMANAGED:
4919                 dmar_domain = alloc_domain(0);
4920                 if (!dmar_domain) {
4921                         pr_err("Can't allocate dmar_domain\n");
4922                         return NULL;
4923                 }
4924                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4925                         pr_err("Domain initialization failed\n");
4926                         domain_exit(dmar_domain);
4927                         return NULL;
4928                 }
4929
4930                 if (type == IOMMU_DOMAIN_DMA &&
4931                     init_iova_flush_queue(&dmar_domain->iovad,
4932                                           iommu_flush_iova, iova_entry_free)) {
4933                         pr_warn("iova flush queue initialization failed\n");
4934                         intel_iommu_strict = 1;
4935                 }
4936
4937                 domain_update_iommu_cap(dmar_domain);
4938
4939                 domain = &dmar_domain->domain;
4940                 domain->geometry.aperture_start = 0;
4941                 domain->geometry.aperture_end   =
4942                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4943                 domain->geometry.force_aperture = true;
4944
4945                 return domain;
4946         case IOMMU_DOMAIN_IDENTITY:
4947                 return &si_domain->domain;
4948         default:
4949                 return NULL;
4950         }
4951
4952         return NULL;
4953 }
4954
4955 static void intel_iommu_domain_free(struct iommu_domain *domain)
4956 {
4957         if (domain != &si_domain->domain)
4958                 domain_exit(to_dmar_domain(domain));
4959 }
4960
4961 /*
4962  * Check whether a @domain could be attached to the @dev through the
4963  * aux-domain attach/detach APIs.
4964  */
4965 static inline bool
4966 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4967 {
4968         struct device_domain_info *info = dev->archdata.iommu;
4969
4970         return info && info->auxd_enabled &&
4971                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4972 }
4973
4974 static void auxiliary_link_device(struct dmar_domain *domain,
4975                                   struct device *dev)
4976 {
4977         struct device_domain_info *info = dev->archdata.iommu;
4978
4979         assert_spin_locked(&device_domain_lock);
4980         if (WARN_ON(!info))
4981                 return;
4982
4983         domain->auxd_refcnt++;
4984         list_add(&domain->auxd, &info->auxiliary_domains);
4985 }
4986
4987 static void auxiliary_unlink_device(struct dmar_domain *domain,
4988                                     struct device *dev)
4989 {
4990         struct device_domain_info *info = dev->archdata.iommu;
4991
4992         assert_spin_locked(&device_domain_lock);
4993         if (WARN_ON(!info))
4994                 return;
4995
4996         list_del(&domain->auxd);
4997         domain->auxd_refcnt--;
4998
4999         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5000                 intel_pasid_free_id(domain->default_pasid);
5001 }
5002
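/*
 * Attach @domain to @dev as an auxiliary domain: allocate a default PASID
 * for the domain if it does not have one yet, attach the domain to the
 * IOMMU and install a second-level PASID entry for it.
 */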
5003 static int aux_domain_add_dev(struct dmar_domain *domain,
5004                               struct device *dev)
5005 {
5006         int ret;
5007         u8 bus, devfn;
5008         unsigned long flags;
5009         struct intel_iommu *iommu;
5010
5011         iommu = device_to_iommu(dev, &bus, &devfn);
5012         if (!iommu)
5013                 return -ENODEV;
5014
5015         if (domain->default_pasid <= 0) {
5016                 int pasid;
5017
5018                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
5019                                              pci_max_pasids(to_pci_dev(dev)),
5020                                              GFP_KERNEL);
5021                 if (pasid <= 0) {
5022                         pr_err("Can't allocate default pasid\n");
5023                         return -ENODEV;
5024                 }
5025                 domain->default_pasid = pasid;
5026         }
5027
5028         spin_lock_irqsave(&device_domain_lock, flags);
5029         /*
5030          * iommu->lock must be held to attach domain to iommu and setup the
5031          * pasid entry for second level translation.
5032          */
5033         spin_lock(&iommu->lock);
5034         ret = domain_attach_iommu(domain, iommu);
5035         if (ret)
5036                 goto attach_failed;
5037
5038         /* Set up the PASID entry for mediated devices: */
5039         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5040                                              domain->default_pasid);
5041         if (ret)
5042                 goto table_failed;
5043         spin_unlock(&iommu->lock);
5044
5045         auxiliary_link_device(domain, dev);
5046
5047         spin_unlock_irqrestore(&device_domain_lock, flags);
5048
5049         return 0;
5050
5051 table_failed:
5052         domain_detach_iommu(domain, iommu);
5053 attach_failed:
5054         spin_unlock(&iommu->lock);
5055         spin_unlock_irqrestore(&device_domain_lock, flags);
5056         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5057                 intel_pasid_free_id(domain->default_pasid);
5058
5059         return ret;
5060 }
5061
5062 static void aux_domain_remove_dev(struct dmar_domain *domain,
5063                                   struct device *dev)
5064 {
5065         struct device_domain_info *info;
5066         struct intel_iommu *iommu;
5067         unsigned long flags;
5068
5069         if (!is_aux_domain(dev, &domain->domain))
5070                 return;
5071
5072         spin_lock_irqsave(&device_domain_lock, flags);
5073         info = dev->archdata.iommu;
5074         iommu = info->iommu;
5075
5076         auxiliary_unlink_device(domain, dev);
5077
5078         spin_lock(&iommu->lock);
5079         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5080         domain_detach_iommu(domain, iommu);
5081         spin_unlock(&iommu->lock);
5082
5083         spin_unlock_irqrestore(&device_domain_lock, flags);
5084 }
5085
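/*
 * Check that the IOMMU's address width can cover the domain's maximum
 * mapped address, and strip surplus page-table levels when the domain was
 * built with a wider agaw than this IOMMU supports.
 */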
5086 static int prepare_domain_attach_device(struct iommu_domain *domain,
5087                                         struct device *dev)
5088 {
5089         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5090         struct intel_iommu *iommu;
5091         int addr_width;
5092         u8 bus, devfn;
5093
5094         iommu = device_to_iommu(dev, &bus, &devfn);
5095         if (!iommu)
5096                 return -ENODEV;
5097
5098         /* check if this iommu agaw is sufficient for max mapped address */
5099         addr_width = agaw_to_width(iommu->agaw);
5100         if (addr_width > cap_mgaw(iommu->cap))
5101                 addr_width = cap_mgaw(iommu->cap);
5102
5103         if (dmar_domain->max_addr > (1LL << addr_width)) {
5104                 dev_err(dev, "%s: iommu width (%d) is not "
5105                         "sufficient for the mapped address (%llx)\n",
5106                         __func__, addr_width, dmar_domain->max_addr);
5107                 return -EFAULT;
5108         }
5109         dmar_domain->gaw = addr_width;
5110
5111         /*
5112          * Knock out extra levels of page tables if necessary
5113          */
5114         while (iommu->agaw < dmar_domain->agaw) {
5115                 struct dma_pte *pte;
5116
5117                 pte = dmar_domain->pgd;
5118                 if (dma_pte_present(pte)) {
5119                         dmar_domain->pgd = (struct dma_pte *)
5120                                 phys_to_virt(dma_pte_addr(pte));
5121                         free_pgtable_page(pte);
5122                 }
5123                 dmar_domain->agaw--;
5124         }
5125
5126         return 0;
5127 }
5128
5129 static int intel_iommu_attach_device(struct iommu_domain *domain,
5130                                      struct device *dev)
5131 {
5132         int ret;
5133
5134         if (device_is_rmrr_locked(dev)) {
5135                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5136                 return -EPERM;
5137         }
5138
5139         if (is_aux_domain(dev, domain))
5140                 return -EPERM;
5141
5142         /* normally dev is not mapped */
5143         if (unlikely(domain_context_mapped(dev))) {
5144                 struct dmar_domain *old_domain;
5145
5146                 old_domain = find_domain(dev);
5147                 if (old_domain)
5148                         dmar_remove_one_dev_info(dev);
5149         }
5150
5151         ret = prepare_domain_attach_device(domain, dev);
5152         if (ret)
5153                 return ret;
5154
5155         return domain_add_dev_info(to_dmar_domain(domain), dev);
5156 }
5157
5158 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5159                                          struct device *dev)
5160 {
5161         int ret;
5162
5163         if (!is_aux_domain(dev, domain))
5164                 return -EPERM;
5165
5166         ret = prepare_domain_attach_device(domain, dev);
5167         if (ret)
5168                 return ret;
5169
5170         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5171 }
5172
5173 static void intel_iommu_detach_device(struct iommu_domain *domain,
5174                                       struct device *dev)
5175 {
5176         dmar_remove_one_dev_info(dev);
5177 }
5178
5179 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5180                                           struct device *dev)
5181 {
5182         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5183 }
5184
5185 static int intel_iommu_map(struct iommu_domain *domain,
5186                            unsigned long iova, phys_addr_t hpa,
5187                            size_t size, int iommu_prot)
5188 {
5189         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5190         u64 max_addr;
5191         int prot = 0;
5192         int ret;
5193
5194         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5195                 return -EINVAL;
5196
5197         if (iommu_prot & IOMMU_READ)
5198                 prot |= DMA_PTE_READ;
5199         if (iommu_prot & IOMMU_WRITE)
5200                 prot |= DMA_PTE_WRITE;
5201         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5202                 prot |= DMA_PTE_SNP;
5203
5204         max_addr = iova + size;
5205         if (dmar_domain->max_addr < max_addr) {
5206                 u64 end;
5207
5208                 /* check if minimum agaw is sufficient for mapped address */
5209                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5210                 if (end < max_addr) {
5211                         pr_err("%s: iommu width (%d) is not "
5212                                "sufficient for the mapped address (%llx)\n",
5213                                __func__, dmar_domain->gaw, max_addr);
5214                         return -EFAULT;
5215                 }
5216                 dmar_domain->max_addr = max_addr;
5217         }
5218         /* Round up size to next multiple of PAGE_SIZE, if it and
5219            the low bits of hpa would take us onto the next page */
5220         size = aligned_nrpages(hpa, size);
5221         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5222                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5223         return ret;
5224 }
5225
5226 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5227                                 unsigned long iova, size_t size)
5228 {
5229         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5230         struct page *freelist = NULL;
5231         unsigned long start_pfn, last_pfn;
5232         unsigned int npages;
5233         int iommu_id, level = 0;
5234
5235         /* Cope with horrid API which requires us to unmap more than the
5236            size argument if it happens to be a large-page mapping. */
5237         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5238         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5239                 return 0;
5240
5241         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5242                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5243
5244         start_pfn = iova >> VTD_PAGE_SHIFT;
5245         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5246
5247         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5248
5249         npages = last_pfn - start_pfn + 1;
5250
5251         for_each_domain_iommu(iommu_id, dmar_domain)
5252                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5253                                       start_pfn, npages, !freelist, 0);
5254
5255         dma_free_pagelist(freelist);
5256
5257         if (dmar_domain->max_addr == iova + size)
5258                 dmar_domain->max_addr = iova;
5259
5260         return size;
5261 }
5262
5263 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5264                                             dma_addr_t iova)
5265 {
5266         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5267         struct dma_pte *pte;
5268         int level = 0;
5269         u64 phys = 0;
5270
5271         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5272                 return 0;
5273
5274         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5275         if (pte)
5276                 phys = dma_pte_addr(pte);
5277
5278         return phys;
5279 }
5280
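/* Return true only if every active IOMMU supports scalable mode. */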
5281 static inline bool scalable_mode_support(void)
5282 {
5283         struct dmar_drhd_unit *drhd;
5284         struct intel_iommu *iommu;
5285         bool ret = true;
5286
5287         rcu_read_lock();
5288         for_each_active_iommu(iommu, drhd) {
5289                 if (!sm_supported(iommu)) {
5290                         ret = false;
5291                         break;
5292                 }
5293         }
5294         rcu_read_unlock();
5295
5296         return ret;
5297 }
5298
5299 static inline bool iommu_pasid_support(void)
5300 {
5301         struct dmar_drhd_unit *drhd;
5302         struct intel_iommu *iommu;
5303         bool ret = true;
5304
5305         rcu_read_lock();
5306         for_each_active_iommu(iommu, drhd) {
5307                 if (!pasid_supported(iommu)) {
5308                         ret = false;
5309                         break;
5310                 }
5311         }
5312         rcu_read_unlock();
5313
5314         return ret;
5315 }
5316
5317 static bool intel_iommu_capable(enum iommu_cap cap)
5318 {
5319         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5320                 return domain_update_iommu_snooping(NULL) == 1;
5321         if (cap == IOMMU_CAP_INTR_REMAP)
5322                 return irq_remapping_enabled == 1;
5323
5324         return false;
5325 }
5326
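/*
 * iommu_ops add_device callback: link the device to its IOMMU in sysfs,
 * place it in an IOMMU group and, when the group's default domain type
 * does not match what the device requires, either ask the IOMMU core to
 * change it or fall back to a private identity/DMA domain.
 */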
5327 static int intel_iommu_add_device(struct device *dev)
5328 {
5329         struct dmar_domain *dmar_domain;
5330         struct iommu_domain *domain;
5331         struct intel_iommu *iommu;
5332         struct iommu_group *group;
5333         u8 bus, devfn;
5334         int ret;
5335
5336         iommu = device_to_iommu(dev, &bus, &devfn);
5337         if (!iommu)
5338                 return -ENODEV;
5339
5340         iommu_device_link(&iommu->iommu, dev);
5341
5342         if (translation_pre_enabled(iommu))
5343                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5344
5345         group = iommu_group_get_for_dev(dev);
5346
5347         if (IS_ERR(group))
5348                 return PTR_ERR(group);
5349
5350         iommu_group_put(group);
5351
5352         domain = iommu_get_domain_for_dev(dev);
5353         dmar_domain = to_dmar_domain(domain);
5354         if (domain->type == IOMMU_DOMAIN_DMA) {
5355                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5356                         ret = iommu_request_dm_for_dev(dev);
5357                         if (ret) {
5358                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5359                                 domain_add_dev_info(si_domain, dev);
5360                                 dev_info(dev,
5361                                          "Device uses a private identity domain.\n");
5362                                 return 0;
5363                         }
5364
5365                         return -ENODEV;
5366                 }
5367         } else {
5368                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5369                         ret = iommu_request_dma_domain_for_dev(dev);
5370                         if (ret) {
5371                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5372                                 if (!get_private_domain_for_dev(dev)) {
5373                                         dev_warn(dev,
5374                                                  "Failed to get a private domain.\n");
5375                                         return -ENOMEM;
5376                                 }
5377
5378                                 dev_info(dev,
5379                                          "Device uses a private dma domain.\n");
5380                                 return 0;
5381                         }
5382
5383                         return -ENODEV;
5384                 }
5385         }
5386
5387         return 0;
5388 }
5389
5390 static void intel_iommu_remove_device(struct device *dev)
5391 {
5392         struct intel_iommu *iommu;
5393         u8 bus, devfn;
5394
5395         iommu = device_to_iommu(dev, &bus, &devfn);
5396         if (!iommu)
5397                 return;
5398
5399         iommu_group_remove_device(dev);
5400
5401         iommu_device_unlink(&iommu->iommu, dev);
5402 }
5403
5404 static void intel_iommu_get_resv_regions(struct device *device,
5405                                          struct list_head *head)
5406 {
5407         struct iommu_resv_region *reg;
5408         struct dmar_rmrr_unit *rmrr;
5409         struct device *i_dev;
5410         int i;
5411
5412         rcu_read_lock();
5413         for_each_rmrr_units(rmrr) {
5414                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5415                                           i, i_dev) {
5416                         if (i_dev != device)
5417                                 continue;
5418
5419                         list_add_tail(&rmrr->resv->list, head);
5420                 }
5421         }
5422         rcu_read_unlock();
5423
5424 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5425         if (dev_is_pci(device)) {
5426                 struct pci_dev *pdev = to_pci_dev(device);
5427
5428                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5429                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5430                                                       IOMMU_RESV_DIRECT);
5431                         if (reg)
5432                                 list_add_tail(&reg->list, head);
5433                 }
5434         }
5435 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5436
5437         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5438                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5439                                       0, IOMMU_RESV_MSI);
5440         if (!reg)
5441                 return;
5442         list_add_tail(&reg->list, head);
5443 }
5444
5445 static void intel_iommu_put_resv_regions(struct device *dev,
5446                                          struct list_head *head)
5447 {
5448         struct iommu_resv_region *entry, *next;
5449
5450         list_for_each_entry_safe(entry, next, head, list) {
5451                 if (entry->type == IOMMU_RESV_MSI)
5452                         kfree(entry);
5453         }
5454 }
5455
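/*
 * Enable PASID support for @dev: set the PASID-enable bit in its context
 * entry (flushing the context cache) and enable PASID support in the
 * device itself if it was not enabled already.
 */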
5456 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5457 {
5458         struct device_domain_info *info;
5459         struct context_entry *context;
5460         struct dmar_domain *domain;
5461         unsigned long flags;
5462         u64 ctx_lo;
5463         int ret;
5464
5465         domain = find_domain(dev);
5466         if (!domain)
5467                 return -EINVAL;
5468
5469         spin_lock_irqsave(&device_domain_lock, flags);
5470         spin_lock(&iommu->lock);
5471
5472         ret = -EINVAL;
5473         info = dev->archdata.iommu;
5474         if (!info || !info->pasid_supported)
5475                 goto out;
5476
5477         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5478         if (WARN_ON(!context))
5479                 goto out;
5480
5481         ctx_lo = context[0].lo;
5482
5483         if (!(ctx_lo & CONTEXT_PASIDE)) {
5484                 ctx_lo |= CONTEXT_PASIDE;
5485                 context[0].lo = ctx_lo;
5486                 wmb();
5487                 iommu->flush.flush_context(iommu,
5488                                            domain->iommu_did[iommu->seq_id],
5489                                            PCI_DEVID(info->bus, info->devfn),
5490                                            DMA_CCMD_MASK_NOBIT,
5491                                            DMA_CCMD_DEVICE_INVL);
5492         }
5493
5494         /* Enable PASID support in the device, if it wasn't already */
5495         if (!info->pasid_enabled)
5496                 iommu_enable_dev_iotlb(info);
5497
5498         ret = 0;
5499
5500  out:
5501         spin_unlock(&iommu->lock);
5502         spin_unlock_irqrestore(&device_domain_lock, flags);
5503
5504         return ret;
5505 }
5506
5507 static void intel_iommu_apply_resv_region(struct device *dev,
5508                                           struct iommu_domain *domain,
5509                                           struct iommu_resv_region *region)
5510 {
5511         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5512         unsigned long start, end;
5513
5514         start = IOVA_PFN(region->start);
5515         end   = IOVA_PFN(region->start + region->length - 1);
5516
5517         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5518 }
5519
5520 #ifdef CONFIG_INTEL_IOMMU_SVM
5521 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5522 {
5523         struct intel_iommu *iommu;
5524         u8 bus, devfn;
5525
5526         if (iommu_dummy(dev)) {
5527                 dev_warn(dev,
5528                          "No IOMMU translation for device; cannot enable SVM\n");
5529                 return NULL;
5530         }
5531
5532         iommu = device_to_iommu(dev, &bus, &devfn);
5533         if (!iommu) {
5534                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5535                 return NULL;
5536         }
5537
5538         return iommu;
5539 }
5540 #endif /* CONFIG_INTEL_IOMMU_SVM */
5541
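/*
 * Enable auxiliary domain support for @dev: requires a scalable-mode
 * IOMMU with PASID support; enables PASID for the device and flags it in
 * its device_domain_info.
 */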
5542 static int intel_iommu_enable_auxd(struct device *dev)
5543 {
5544         struct device_domain_info *info;
5545         struct intel_iommu *iommu;
5546         unsigned long flags;
5547         u8 bus, devfn;
5548         int ret;
5549
5550         iommu = device_to_iommu(dev, &bus, &devfn);
5551         if (!iommu || dmar_disabled)
5552                 return -EINVAL;
5553
5554         if (!sm_supported(iommu) || !pasid_supported(iommu))
5555                 return -EINVAL;
5556
5557         ret = intel_iommu_enable_pasid(iommu, dev);
5558         if (ret)
5559                 return -ENODEV;
5560
5561         spin_lock_irqsave(&device_domain_lock, flags);
5562         info = dev->archdata.iommu;
5563         info->auxd_enabled = 1;
5564         spin_unlock_irqrestore(&device_domain_lock, flags);
5565
5566         return 0;
5567 }
5568
5569 static int intel_iommu_disable_auxd(struct device *dev)
5570 {
5571         struct device_domain_info *info;
5572         unsigned long flags;
5573
5574         spin_lock_irqsave(&device_domain_lock, flags);
5575         info = dev->archdata.iommu;
5576         if (!WARN_ON(!info))
5577                 info->auxd_enabled = 0;
5578         spin_unlock_irqrestore(&device_domain_lock, flags);
5579
5580         return 0;
5581 }
5582
5583 /*
5584  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5585  * in section 3.7 of the Intel scalable I/O virtualization technical spec
5586  * so that system software and tools can detect endpoint devices supporting
5587  * Intel scalable I/O virtualization without a host driver dependency.
5588  *
5589  * Returns the config space offset of the matching extended capability
5590  * structure within the device's PCI configuration space, or 0 if the
5591  * device does not support it.
5592  */
5593 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5594 {
5595         int pos;
5596         u16 vendor, id;
5597
5598         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5599         while (pos) {
5600                 pci_read_config_word(pdev, pos + 4, &vendor);
5601                 pci_read_config_word(pdev, pos + 8, &id);
5602                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5603                         return pos;
5604
5605                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5606         }
5607
5608         return 0;
5609 }
5610
5611 static bool
5612 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5613 {
5614         if (feat == IOMMU_DEV_FEAT_AUX) {
5615                 int ret;
5616
5617                 if (!dev_is_pci(dev) || dmar_disabled ||
5618                     !scalable_mode_support() || !iommu_pasid_support())
5619                         return false;
5620
5621                 ret = pci_pasid_features(to_pci_dev(dev));
5622                 if (ret < 0)
5623                         return false;
5624
5625                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5626         }
5627
5628         return false;
5629 }
5630
5631 static int
5632 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5633 {
5634         if (feat == IOMMU_DEV_FEAT_AUX)
5635                 return intel_iommu_enable_auxd(dev);
5636
5637         return -ENODEV;
5638 }
5639
5640 static int
5641 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5642 {
5643         if (feat == IOMMU_DEV_FEAT_AUX)
5644                 return intel_iommu_disable_auxd(dev);
5645
5646         return -ENODEV;
5647 }
5648
5649 static bool
5650 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5651 {
5652         struct device_domain_info *info = dev->archdata.iommu;
5653
5654         if (feat == IOMMU_DEV_FEAT_AUX)
5655                 return scalable_mode_support() && info && info->auxd_enabled;
5656
5657         return false;
5658 }
5659
5660 static int
5661 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5662 {
5663         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5664
5665         return dmar_domain->default_pasid > 0 ?
5666                         dmar_domain->default_pasid : -EINVAL;
5667 }
5668
5669 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5670                                            struct device *dev)
5671 {
5672         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5673 }
5674
5675 const struct iommu_ops intel_iommu_ops = {
5676         .capable                = intel_iommu_capable,
5677         .domain_alloc           = intel_iommu_domain_alloc,
5678         .domain_free            = intel_iommu_domain_free,
5679         .attach_dev             = intel_iommu_attach_device,
5680         .detach_dev             = intel_iommu_detach_device,
5681         .aux_attach_dev         = intel_iommu_aux_attach_device,
5682         .aux_detach_dev         = intel_iommu_aux_detach_device,
5683         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5684         .map                    = intel_iommu_map,
5685         .unmap                  = intel_iommu_unmap,
5686         .iova_to_phys           = intel_iommu_iova_to_phys,
5687         .add_device             = intel_iommu_add_device,
5688         .remove_device          = intel_iommu_remove_device,
5689         .get_resv_regions       = intel_iommu_get_resv_regions,
5690         .put_resv_regions       = intel_iommu_put_resv_regions,
5691         .apply_resv_region      = intel_iommu_apply_resv_region,
5692         .device_group           = pci_device_group,
5693         .dev_has_feat           = intel_iommu_dev_has_feat,
5694         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5695         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5696         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5697         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5698         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5699 };
5700
5701 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5702 {
5703         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5704         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5705         dmar_map_gfx = 0;
5706 }
5707
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5715
5716 static void quirk_iommu_rwbf(struct pci_dev *dev)
5717 {
5718         /*
5719          * Mobile 4 Series Chipset neglects to set RWBF capability,
5720          * but needs it. Same seems to hold for the desktop versions.
5721          */
5722         pci_info(dev, "Forcing write-buffer flush capability\n");
5723         rwbf_quirk = 1;
5724 }
5725
5726 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5727 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5728 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5729 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5730 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5731 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5732 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5733
5734 #define GGC 0x52
5735 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5736 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5737 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5738 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5739 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5740 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5741 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5742 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5743
5744 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5745 {
5746         unsigned short ggc;
5747
5748         if (pci_read_config_word(dev, GGC, &ggc))
5749                 return;
5750
5751         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5752                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5753                 dmar_map_gfx = 0;
5754         } else if (dmar_map_gfx) {
5755                 /* we have to ensure the gfx device is idle before we flush */
5756                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5757                 intel_iommu_strict = 1;
5758         }
5759 }
5760 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5761 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5762 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5763 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5764
5765 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5766    ISOCH DMAR unit for the Azalia sound device, but not give it any
5767    TLB entries, which causes it to deadlock. Check for that.  We do
5768    this in a function called from init_dmars(), instead of in a PCI
5769    quirk, because we don't want to print the obnoxious "BIOS broken"
5770    message if VT-d is actually disabled.
5771 */
5772 static void __init check_tylersburg_isoch(void)
5773 {
5774         struct pci_dev *pdev;
5775         uint32_t vtisochctrl;
5776
5777         /* If there's no Azalia in the system anyway, forget it. */
5778         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5779         if (!pdev)
5780                 return;
5781         pci_dev_put(pdev);
5782
5783         /* System Management Registers. Might be hidden, in which case
5784            we can't do the sanity check. But that's OK, because the
5785            known-broken BIOSes _don't_ actually hide it, so far. */
5786         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5787         if (!pdev)
5788                 return;
5789
5790         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5791                 pci_dev_put(pdev);
5792                 return;
5793         }
5794
5795         pci_dev_put(pdev);
5796
5797         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5798         if (vtisochctrl & 1)
5799                 return;
5800
5801         /* Drop all bits other than the number of TLB entries */
5802         vtisochctrl &= 0x1c;
5803
5804         /* If we have the recommended number of TLB entries (16), fine. */
5805         if (vtisochctrl == 0x10)
5806                 return;
5807
5808         /* Zero TLB entries? You get to ride the short bus to school. */
5809         if (!vtisochctrl) {
5810                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5811                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5812                      dmi_get_system_info(DMI_BIOS_VENDOR),
5813                      dmi_get_system_info(DMI_BIOS_VERSION),
5814                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5815                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5816                 return;
5817         }
5818
5819         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5820                vtisochctrl);
5821 }