linux.git: drivers/iommu/intel-iommu.c (blob ca0a1d5d2983aa12e514318de03e8e3a26304bb5)
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22 #define dev_fmt(fmt)    pr_fmt(fmt)
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
39 #include <linux/io.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <linux/numa.h>
52 #include <asm/irq_remapping.h>
53 #include <asm/cacheflush.h>
54 #include <asm/iommu.h>
55
56 #include "irq_remapping.h"
57 #include "intel-pasid.h"
58
59 #define ROOT_SIZE               VTD_PAGE_SIZE
60 #define CONTEXT_SIZE            VTD_PAGE_SIZE
61
62 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
63 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
64 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
65 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66
67 #define IOAPIC_RANGE_START      (0xfee00000)
68 #define IOAPIC_RANGE_END        (0xfeefffff)
69 #define IOVA_START_ADDR         (0x1000)
70
71 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72
73 #define MAX_AGAW_WIDTH 64
74 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75
76 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
77 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78
79 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
80    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
81 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
82                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
83 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
84
85 /* IO virtual address start page frame number */
86 #define IOVA_START_PFN          (1)
87
88 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
89
90 /* page table handling */
91 #define LEVEL_STRIDE            (9)
92 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
93
94 /*
95  * This bitmap is used to advertise the page sizes our hardware supports
96  * to the IOMMU core, which will then use this information to split
97  * physically contiguous memory regions it is mapping into page sizes
98  * that we support.
99  *
100  * Traditionally the IOMMU core just handed us the mappings directly,
101  * after making sure the size is an order of a 4KiB page and that the
102  * mapping has natural alignment.
103  *
104  * To retain this behavior, we currently advertise that we support
105  * all page sizes that are an order of 4KiB.
106  *
107  * If at some point we'd like to utilize the IOMMU core's new behavior,
108  * we could change this to advertise the real page sizes we support.
109  */
110 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
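
/*
 * Editor's sketch (not part of the original driver): ~0xFFFUL has every
 * bit from 12 upward set, so the core sees support for every power-of-two
 * multiple of 4KiB.  A variant advertising only the page sizes VT-d
 * hardware can actually use (4KiB plus the optional 2MiB/1GiB superpages,
 * SZ_* constants from <linux/sizes.h>) might look like this hypothetical
 * define instead:
 */
#if 0
#define INTEL_IOMMU_REAL_PGSIZES	(SZ_4K | SZ_2M | SZ_1G)
#endif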
111
112 static inline int agaw_to_level(int agaw)
113 {
114         return agaw + 2;
115 }
116
117 static inline int agaw_to_width(int agaw)
118 {
119         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 }
121
122 static inline int width_to_agaw(int width)
123 {
124         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 }
126
127 static inline unsigned int level_to_offset_bits(int level)
128 {
129         return (level - 1) * LEVEL_STRIDE;
130 }
131
132 static inline int pfn_level_offset(unsigned long pfn, int level)
133 {
134         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 }
136
137 static inline unsigned long level_mask(int level)
138 {
139         return -1UL << level_to_offset_bits(level);
140 }
141
142 static inline unsigned long level_size(int level)
143 {
144         return 1UL << level_to_offset_bits(level);
145 }
146
147 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 {
149         return (pfn + level_size(level) - 1) & level_mask(level);
150 }
151
152 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 {
154         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
155 }
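
/*
 * Worked example (editor's note): for the common 4-level table, agaw = 2,
 * so agaw_to_level(2) = 4 and agaw_to_width(2) = 30 + 2 * 9 = 48 bits.
 * The 48-bit address is split into four 9-bit indexes above the 12-bit
 * page offset: pfn_level_offset(pfn, 4) picks pfn bits 27..35 (address
 * bits 39..47), and lvl_to_nr_pages(2) = 512 is the number of 4KiB pages
 * covered by one level-2 (2MiB) entry.
 */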
156
157 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
158    are never going to work. */
159 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 {
161         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 }
163
164 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 {
166         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 }
168 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 {
170         return mm_to_dma_pfn(page_to_pfn(pg));
171 }
172 static inline unsigned long virt_to_dma_pfn(void *p)
173 {
174         return page_to_dma_pfn(virt_to_page(p));
175 }
176
177 /* global iommu list, set NULL for ignored DMAR units */
178 static struct intel_iommu **g_iommus;
179
180 static void __init check_tylersburg_isoch(void);
181 static int rwbf_quirk;
182
183 /*
184  * set to 1 to panic kernel if can't successfully enable VT-d
185  * (used when kernel is launched w/ TXT)
186  */
187 static int force_on = 0;
188 int intel_iommu_tboot_noforce;
189 static int no_platform_optin;
190
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193 /*
194  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
195  * if marked present.
196  */
197 static phys_addr_t root_entry_lctp(struct root_entry *re)
198 {
199         if (!(re->lo & 1))
200                 return 0;
201
202         return re->lo & VTD_PAGE_MASK;
203 }
204
205 /*
206  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
207  * if marked present.
208  */
209 static phys_addr_t root_entry_uctp(struct root_entry *re)
210 {
211         if (!(re->hi & 1))
212                 return 0;
213
214         return re->hi & VTD_PAGE_MASK;
215 }
216
217 static inline void context_clear_pasid_enable(struct context_entry *context)
218 {
219         context->lo &= ~(1ULL << 11);
220 }
221
222 static inline bool context_pasid_enabled(struct context_entry *context)
223 {
224         return !!(context->lo & (1ULL << 11));
225 }
226
227 static inline void context_set_copied(struct context_entry *context)
228 {
229         context->hi |= (1ull << 3);
230 }
231
232 static inline bool context_copied(struct context_entry *context)
233 {
234         return !!(context->hi & (1ULL << 3));
235 }
236
237 static inline bool __context_present(struct context_entry *context)
238 {
239         return (context->lo & 1);
240 }
241
242 bool context_present(struct context_entry *context)
243 {
244         return context_pasid_enabled(context) ?
245              __context_present(context) :
246              __context_present(context) && !context_copied(context);
247 }
248
249 static inline void context_set_present(struct context_entry *context)
250 {
251         context->lo |= 1;
252 }
253
254 static inline void context_set_fault_enable(struct context_entry *context)
255 {
256         context->lo &= (((u64)-1) << 2) | 1;
257 }
258
259 static inline void context_set_translation_type(struct context_entry *context,
260                                                 unsigned long value)
261 {
262         context->lo &= (((u64)-1) << 4) | 3;
263         context->lo |= (value & 3) << 2;
264 }
265
266 static inline void context_set_address_root(struct context_entry *context,
267                                             unsigned long value)
268 {
269         context->lo &= ~VTD_PAGE_MASK;
270         context->lo |= value & VTD_PAGE_MASK;
271 }
272
273 static inline void context_set_address_width(struct context_entry *context,
274                                              unsigned long value)
275 {
276         context->hi |= value & 7;
277 }
278
279 static inline void context_set_domain_id(struct context_entry *context,
280                                          unsigned long value)
281 {
282         context->hi |= (value & ((1 << 16) - 1)) << 8;
283 }
284
285 static inline int context_domain_id(struct context_entry *c)
286 {
287         return((c->hi >> 8) & 0xffff);
288 }
289
290 static inline void context_clear_entry(struct context_entry *context)
291 {
292         context->lo = 0;
293         context->hi = 0;
294 }
295
296 /*
297  * This domain is a static identity-mapping domain.
298  *      1. This domain creates a static 1:1 mapping to all usable memory.
299  *      2. It maps to each iommu if successful.
300  *      3. Each iommu maps to this domain if successful.
301  */
302 static struct dmar_domain *si_domain;
303 static int hw_pass_through = 1;
304
305 /* si_domain contains multiple devices */
306 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
307
308 /*
309  * This is a DMA domain allocated through the iommu domain allocation
310  * interface. But one or more devices belonging to this domain have
311  * been chosen to use a private domain. We should avoid to use the
312  * map/unmap/iova_to_phys APIs on it.
313  */
314 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
315
316 #define for_each_domain_iommu(idx, domain)                      \
317         for (idx = 0; idx < g_num_of_iommus; idx++)             \
318                 if (domain->iommu_refcnt[idx])
319
320 struct dmar_rmrr_unit {
321         struct list_head list;          /* list of rmrr units   */
322         struct acpi_dmar_header *hdr;   /* ACPI header          */
323         u64     base_address;           /* reserved base address*/
324         u64     end_address;            /* reserved end address */
325         struct dmar_dev_scope *devices; /* target devices */
326         int     devices_cnt;            /* target device count */
327 };
328
329 struct dmar_atsr_unit {
330         struct list_head list;          /* list of ATSR units */
331         struct acpi_dmar_header *hdr;   /* ACPI header */
332         struct dmar_dev_scope *devices; /* target devices */
333         int devices_cnt;                /* target device count */
334         u8 include_all:1;               /* include all ports */
335 };
336
337 static LIST_HEAD(dmar_atsr_units);
338 static LIST_HEAD(dmar_rmrr_units);
339
340 #define for_each_rmrr_units(rmrr) \
341         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
342
343 /* number of intel_iommus, used to index g_iommus */
344 static int g_num_of_iommus;
345
346 static void domain_exit(struct dmar_domain *domain);
347 static void domain_remove_dev_info(struct dmar_domain *domain);
348 static void dmar_remove_one_dev_info(struct device *dev);
349 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
350 static void domain_context_clear(struct intel_iommu *iommu,
351                                  struct device *dev);
352 static int domain_detach_iommu(struct dmar_domain *domain,
353                                struct intel_iommu *iommu);
354 static bool device_is_rmrr_locked(struct device *dev);
355 static int intel_iommu_attach_device(struct iommu_domain *domain,
356                                      struct device *dev);
357
358 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
359 int dmar_disabled = 0;
360 #else
361 int dmar_disabled = 1;
362 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
363
364 int intel_iommu_sm;
365 int intel_iommu_enabled = 0;
366 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
367
368 static int dmar_map_gfx = 1;
369 static int dmar_forcedac;
370 static int intel_iommu_strict;
371 static int intel_iommu_superpage = 1;
372 static int iommu_identity_mapping;
373
374 #define IDENTMAP_ALL            1
375 #define IDENTMAP_GFX            2
376 #define IDENTMAP_AZALIA         4
377
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
380
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
383 static DEFINE_SPINLOCK(device_domain_lock);
384 static LIST_HEAD(device_domain_list);
385
386 /*
387  * Iterate over elements in device_domain_list and call the specified
388  * callback @fn against each element.
389  */
390 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
391                                      void *data), void *data)
392 {
393         int ret = 0;
394         unsigned long flags;
395         struct device_domain_info *info;
396
397         spin_lock_irqsave(&device_domain_lock, flags);
398         list_for_each_entry(info, &device_domain_list, global) {
399                 ret = fn(info, data);
400                 if (ret) {
401                         spin_unlock_irqrestore(&device_domain_lock, flags);
402                         return ret;
403                 }
404         }
405         spin_unlock_irqrestore(&device_domain_lock, flags);
406
407         return 0;
408 }
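
/*
 * Usage sketch (editor's note, hypothetical names): the callback is
 * invoked under device_domain_lock for every tracked device; returning a
 * non-zero value stops the walk and is propagated to the caller.
 */
#if 0
static int example_count_cb(struct device_domain_info *info, void *data)
{
	(*(int *)data)++;	/* count every device/domain binding */
	return 0;		/* 0 == keep iterating */
}

static int example_count_bindings(void)
{
	int count = 0;

	for_each_device_domain(example_count_cb, &count);
	return count;
}
#endif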
409
410 const struct iommu_ops intel_iommu_ops;
411
412 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 {
414         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
415 }
416
417 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 {
419         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
420 }
421
422 static void init_translation_status(struct intel_iommu *iommu)
423 {
424         u32 gsts;
425
426         gsts = readl(iommu->reg + DMAR_GSTS_REG);
427         if (gsts & DMA_GSTS_TES)
428                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
429 }
430
431 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
432 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 {
434         return container_of(dom, struct dmar_domain, domain);
435 }
436
437 static int __init intel_iommu_setup(char *str)
438 {
439         if (!str)
440                 return -EINVAL;
441         while (*str) {
442                 if (!strncmp(str, "on", 2)) {
443                         dmar_disabled = 0;
444                         pr_info("IOMMU enabled\n");
445                 } else if (!strncmp(str, "off", 3)) {
446                         dmar_disabled = 1;
447                         no_platform_optin = 1;
448                         pr_info("IOMMU disabled\n");
449                 } else if (!strncmp(str, "igfx_off", 8)) {
450                         dmar_map_gfx = 0;
451                         pr_info("Disable GFX device mapping\n");
452                 } else if (!strncmp(str, "forcedac", 8)) {
453                         pr_info("Forcing DAC for PCI devices\n");
454                         dmar_forcedac = 1;
455                 } else if (!strncmp(str, "strict", 6)) {
456                         pr_info("Disable batched IOTLB flush\n");
457                         intel_iommu_strict = 1;
458                 } else if (!strncmp(str, "sp_off", 6)) {
459                         pr_info("Disable supported super page\n");
460                         intel_iommu_superpage = 0;
461                 } else if (!strncmp(str, "sm_on", 5)) {
462                         pr_info("Intel-IOMMU: scalable mode supported\n");
463                         intel_iommu_sm = 1;
464                 } else if (!strncmp(str, "tboot_noforce", 13)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
467                         intel_iommu_tboot_noforce = 1;
468                 }
469
470                 str += strcspn(str, ",");
471                 while (*str == ',')
472                         str++;
473         }
474         return 0;
475 }
476 __setup("intel_iommu=", intel_iommu_setup);
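
/*
 * Example (editor's note): options are comma-separated on the kernel
 * command line, e.g.
 *
 *	intel_iommu=on,sm_on,strict
 *
 * enables DMA remapping, opts in to scalable mode and disables batched
 * IOTLB flushing, while
 *
 *	intel_iommu=off
 *
 * disables DMA remapping even if the firmware requests it (no_platform_optin).
 */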
477
478 static struct kmem_cache *iommu_domain_cache;
479 static struct kmem_cache *iommu_devinfo_cache;
480
481 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 {
483         struct dmar_domain **domains;
484         int idx = did >> 8;
485
486         domains = iommu->domains[idx];
487         if (!domains)
488                 return NULL;
489
490         return domains[did & 0xff];
491 }
492
493 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
494                              struct dmar_domain *domain)
495 {
496         struct dmar_domain **domains;
497         int idx = did >> 8;
498
499         if (!iommu->domains[idx]) {
500                 size_t size = 256 * sizeof(struct dmar_domain *);
501                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
502         }
503
504         domains = iommu->domains[idx];
505         if (WARN_ON(!domains))
506                 return;
507         else
508                 domains[did & 0xff] = domain;
509 }
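
/*
 * Worked example (editor's note): the per-IOMMU domain table is a
 * two-level array of 256-entry pages, so domain-id 0x1234 lives at
 * iommu->domains[0x12][0x34]; set_iommu_domain() allocates the
 * second-level page lazily the first time any id in that 256-id block
 * is used.
 */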
510
511 void *alloc_pgtable_page(int node)
512 {
513         struct page *page;
514         void *vaddr = NULL;
515
516         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517         if (page)
518                 vaddr = page_address(page);
519         return vaddr;
520 }
521
522 void free_pgtable_page(void *vaddr)
523 {
524         free_page((unsigned long)vaddr);
525 }
526
527 static inline void *alloc_domain_mem(void)
528 {
529         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
530 }
531
532 static void free_domain_mem(void *vaddr)
533 {
534         kmem_cache_free(iommu_domain_cache, vaddr);
535 }
536
537 static inline void *alloc_devinfo_mem(void)
538 {
539         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
540 }
541
542 static inline void free_devinfo_mem(void *vaddr)
543 {
544         kmem_cache_free(iommu_devinfo_cache, vaddr);
545 }
546
547 static inline int domain_type_is_si(struct dmar_domain *domain)
548 {
549         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
550 }
551
552 static inline int domain_pfn_supported(struct dmar_domain *domain,
553                                        unsigned long pfn)
554 {
555         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
556
557         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
558 }
559
560 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
561 {
562         unsigned long sagaw;
563         int agaw = -1;
564
565         sagaw = cap_sagaw(iommu->cap);
566         for (agaw = width_to_agaw(max_gaw);
567              agaw >= 0; agaw--) {
568                 if (test_bit(agaw, &sagaw))
569                         break;
570         }
571
572         return agaw;
573 }
574
575 /*
576  * Calculate max SAGAW for each iommu.
577  */
578 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
579 {
580         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
581 }
582
583 /*
584  * Calculate the agaw for each iommu.
585  * "SAGAW" may differ across iommus, so use a default agaw and fall back
586  * to a smaller supported agaw for iommus that don't support the default.
587  */
588 int iommu_calculate_agaw(struct intel_iommu *iommu)
589 {
590         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
591 }
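
/*
 * Worked example (editor's note): cap_sagaw() yields a bitmap of the AGAW
 * values the unit supports.  With sagaw = 0x4 (only bit 2 set, i.e.
 * 4-level/48-bit tables), iommu_calculate_agaw() starts at
 * width_to_agaw(57) = 3 and walks down until it hits a set bit,
 * returning agaw = 2.
 */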
592
593 /* This function only returns a single iommu in a domain */
594 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
595 {
596         int iommu_id;
597
598         /* si_domain and vm domain should not get here. */
599         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
600                 return NULL;
601
602         for_each_domain_iommu(iommu_id, domain)
603                 break;
604
605         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
606                 return NULL;
607
608         return g_iommus[iommu_id];
609 }
610
611 static void domain_update_iommu_coherency(struct dmar_domain *domain)
612 {
613         struct dmar_drhd_unit *drhd;
614         struct intel_iommu *iommu;
615         bool found = false;
616         int i;
617
618         domain->iommu_coherency = 1;
619
620         for_each_domain_iommu(i, domain) {
621                 found = true;
622                 if (!ecap_coherent(g_iommus[i]->ecap)) {
623                         domain->iommu_coherency = 0;
624                         break;
625                 }
626         }
627         if (found)
628                 return;
629
630         /* No hardware attached; use lowest common denominator */
631         rcu_read_lock();
632         for_each_active_iommu(iommu, drhd) {
633                 if (!ecap_coherent(iommu->ecap)) {
634                         domain->iommu_coherency = 0;
635                         break;
636                 }
637         }
638         rcu_read_unlock();
639 }
640
641 static int domain_update_iommu_snooping(struct intel_iommu *skip)
642 {
643         struct dmar_drhd_unit *drhd;
644         struct intel_iommu *iommu;
645         int ret = 1;
646
647         rcu_read_lock();
648         for_each_active_iommu(iommu, drhd) {
649                 if (iommu != skip) {
650                         if (!ecap_sc_support(iommu->ecap)) {
651                                 ret = 0;
652                                 break;
653                         }
654                 }
655         }
656         rcu_read_unlock();
657
658         return ret;
659 }
660
661 static int domain_update_iommu_superpage(struct intel_iommu *skip)
662 {
663         struct dmar_drhd_unit *drhd;
664         struct intel_iommu *iommu;
665         int mask = 0xf;
666
667         if (!intel_iommu_superpage) {
668                 return 0;
669         }
670
671         /* set iommu_superpage to the smallest common denominator */
672         rcu_read_lock();
673         for_each_active_iommu(iommu, drhd) {
674                 if (iommu != skip) {
675                         mask &= cap_super_page_val(iommu->cap);
676                         if (!mask)
677                                 break;
678                 }
679         }
680         rcu_read_unlock();
681
682         return fls(mask);
683 }
684
685 /* Some capabilities may be different across iommus */
686 static void domain_update_iommu_cap(struct dmar_domain *domain)
687 {
688         domain_update_iommu_coherency(domain);
689         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
690         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
691 }
692
693 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
694                                          u8 devfn, int alloc)
695 {
696         struct root_entry *root = &iommu->root_entry[bus];
697         struct context_entry *context;
698         u64 *entry;
699
700         entry = &root->lo;
701         if (sm_supported(iommu)) {
702                 if (devfn >= 0x80) {
703                         devfn -= 0x80;
704                         entry = &root->hi;
705                 }
706                 devfn *= 2;
707         }
708         if (*entry & 1)
709                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
710         else {
711                 unsigned long phy_addr;
712                 if (!alloc)
713                         return NULL;
714
715                 context = alloc_pgtable_page(iommu->node);
716                 if (!context)
717                         return NULL;
718
719                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
720                 phy_addr = virt_to_phys((void *)context);
721                 *entry = phy_addr | 1;
722                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
723         }
724         return &context[devfn];
725 }
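
/*
 * Worked example (editor's note, per the scalable-mode handling above):
 * each half of the 128-bit root entry points at its own context table,
 * the low half covering devfn 0x00-0x7f and the high half 0x80-0xff.
 * Scalable-mode context entries are twice the size of the legacy 128-bit
 * struct context_entry, hence the devfn doubling: devfn 0x85 selects
 * root->hi and resolves to index (0x85 - 0x80) * 2 = 10 of the
 * legacy-sized array, i.e. scalable-mode entry number 5.
 */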
726
727 static int iommu_dummy(struct device *dev)
728 {
729         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
730 }
731
732 /**
733  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
734  *                               sub-hierarchy of a candidate PCI-PCI bridge
735  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
736  * @bridge: the candidate PCI-PCI bridge
737  *
738  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
739  */
740 static bool
741 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
742 {
743         struct pci_dev *pdev, *pbridge;
744
745         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
746                 return false;
747
748         pdev = to_pci_dev(dev);
749         pbridge = to_pci_dev(bridge);
750
751         if (pbridge->subordinate &&
752             pbridge->subordinate->number <= pdev->bus->number &&
753             pbridge->subordinate->busn_res.end >= pdev->bus->number)
754                 return true;
755
756         return false;
757 }
758
759 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
760 {
761         struct dmar_drhd_unit *drhd = NULL;
762         struct intel_iommu *iommu;
763         struct device *tmp;
764         struct pci_dev *pdev = NULL;
765         u16 segment = 0;
766         int i;
767
768         if (iommu_dummy(dev))
769                 return NULL;
770
771         if (dev_is_pci(dev)) {
772                 struct pci_dev *pf_pdev;
773
774                 pdev = to_pci_dev(dev);
775
776 #ifdef CONFIG_X86
777                 /* VMD child devices currently cannot be handled individually */
778                 if (is_vmd(pdev->bus))
779                         return NULL;
780 #endif
781
782                 /* VFs aren't listed in scope tables; we need to look up
783                  * the PF instead to find the IOMMU. */
784                 pf_pdev = pci_physfn(pdev);
785                 dev = &pf_pdev->dev;
786                 segment = pci_domain_nr(pdev->bus);
787         } else if (has_acpi_companion(dev))
788                 dev = &ACPI_COMPANION(dev)->dev;
789
790         rcu_read_lock();
791         for_each_active_iommu(iommu, drhd) {
792                 if (pdev && segment != drhd->segment)
793                         continue;
794
795                 for_each_active_dev_scope(drhd->devices,
796                                           drhd->devices_cnt, i, tmp) {
797                         if (tmp == dev) {
798                                 /* For a VF use its original BDF# not that of the PF
799                                  * which we used for the IOMMU lookup. Strictly speaking
800                                  * we could do this for all PCI devices; we only need to
801                                  * get the BDF# from the scope table for ACPI matches. */
802                                 if (pdev && pdev->is_virtfn)
803                                         goto got_pdev;
804
805                                 *bus = drhd->devices[i].bus;
806                                 *devfn = drhd->devices[i].devfn;
807                                 goto out;
808                         }
809
810                         if (is_downstream_to_pci_bridge(dev, tmp))
811                                 goto got_pdev;
812                 }
813
814                 if (pdev && drhd->include_all) {
815                 got_pdev:
816                         *bus = pdev->bus->number;
817                         *devfn = pdev->devfn;
818                         goto out;
819                 }
820         }
821         iommu = NULL;
822  out:
823         rcu_read_unlock();
824
825         return iommu;
826 }
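
/*
 * Usage sketch (editor's note, hypothetical wrapper): callers pass the
 * addresses of a bus/devfn pair and get back the covering IOMMU, or NULL
 * if the device is not managed by any DMAR unit.
 */
#if 0
static struct intel_iommu *example_lookup_iommu(struct device *dev)
{
	u8 bus, devfn;

	return device_to_iommu(dev, &bus, &devfn);
}
#endif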
827
828 static void domain_flush_cache(struct dmar_domain *domain,
829                                void *addr, int size)
830 {
831         if (!domain->iommu_coherency)
832                 clflush_cache_range(addr, size);
833 }
834
835 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
836 {
837         struct context_entry *context;
838         int ret = 0;
839         unsigned long flags;
840
841         spin_lock_irqsave(&iommu->lock, flags);
842         context = iommu_context_addr(iommu, bus, devfn, 0);
843         if (context)
844                 ret = context_present(context);
845         spin_unlock_irqrestore(&iommu->lock, flags);
846         return ret;
847 }
848
849 static void free_context_table(struct intel_iommu *iommu)
850 {
851         int i;
852         unsigned long flags;
853         struct context_entry *context;
854
855         spin_lock_irqsave(&iommu->lock, flags);
856         if (!iommu->root_entry) {
857                 goto out;
858         }
859         for (i = 0; i < ROOT_ENTRY_NR; i++) {
860                 context = iommu_context_addr(iommu, i, 0, 0);
861                 if (context)
862                         free_pgtable_page(context);
863
864                 if (!sm_supported(iommu))
865                         continue;
866
867                 context = iommu_context_addr(iommu, i, 0x80, 0);
868                 if (context)
869                         free_pgtable_page(context);
870
871         }
872         free_pgtable_page(iommu->root_entry);
873         iommu->root_entry = NULL;
874 out:
875         spin_unlock_irqrestore(&iommu->lock, flags);
876 }
877
878 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
879                                       unsigned long pfn, int *target_level)
880 {
881         struct dma_pte *parent, *pte;
882         int level = agaw_to_level(domain->agaw);
883         int offset;
884
885         BUG_ON(!domain->pgd);
886
887         if (!domain_pfn_supported(domain, pfn))
888                 /* Address beyond IOMMU's addressing capabilities. */
889                 return NULL;
890
891         parent = domain->pgd;
892
893         while (1) {
894                 void *tmp_page;
895
896                 offset = pfn_level_offset(pfn, level);
897                 pte = &parent[offset];
898                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
899                         break;
900                 if (level == *target_level)
901                         break;
902
903                 if (!dma_pte_present(pte)) {
904                         uint64_t pteval;
905
906                         tmp_page = alloc_pgtable_page(domain->nid);
907
908                         if (!tmp_page)
909                                 return NULL;
910
911                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
912                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
913                         if (cmpxchg64(&pte->val, 0ULL, pteval))
914                                 /* Someone else set it while we were thinking; use theirs. */
915                                 free_pgtable_page(tmp_page);
916                         else
917                                 domain_flush_cache(domain, pte, sizeof(*pte));
918                 }
919                 if (level == 1)
920                         break;
921
922                 parent = phys_to_virt(dma_pte_addr(pte));
923                 level--;
924         }
925
926         if (!*target_level)
927                 *target_level = level;
928
929         return pte;
930 }
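
/*
 * Worked example (editor's note): with a 4-level table (agaw = 2) and
 * *target_level = 1, a lookup of pfn 0x12345 walks the offsets
 * pfn_level_offset(0x12345, 4..1), allocating any missing intermediate
 * table with alloc_pgtable_page() and publishing it via cmpxchg64() so
 * racing walkers agree on a single copy; the returned pte is the 4KiB
 * leaf slot for that pfn.
 */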
931
932 /* return the pte of an address at a specific level */
933 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
934                                          unsigned long pfn,
935                                          int level, int *large_page)
936 {
937         struct dma_pte *parent, *pte;
938         int total = agaw_to_level(domain->agaw);
939         int offset;
940
941         parent = domain->pgd;
942         while (level <= total) {
943                 offset = pfn_level_offset(pfn, total);
944                 pte = &parent[offset];
945                 if (level == total)
946                         return pte;
947
948                 if (!dma_pte_present(pte)) {
949                         *large_page = total;
950                         break;
951                 }
952
953                 if (dma_pte_superpage(pte)) {
954                         *large_page = total;
955                         return pte;
956                 }
957
958                 parent = phys_to_virt(dma_pte_addr(pte));
959                 total--;
960         }
961         return NULL;
962 }
963
964 /* clear last level pte, a tlb flush should follow */
965 static void dma_pte_clear_range(struct dmar_domain *domain,
966                                 unsigned long start_pfn,
967                                 unsigned long last_pfn)
968 {
969         unsigned int large_page;
970         struct dma_pte *first_pte, *pte;
971
972         BUG_ON(!domain_pfn_supported(domain, start_pfn));
973         BUG_ON(!domain_pfn_supported(domain, last_pfn));
974         BUG_ON(start_pfn > last_pfn);
975
976         /* we don't need lock here; nobody else touches the iova range */
977         do {
978                 large_page = 1;
979                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
980                 if (!pte) {
981                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
982                         continue;
983                 }
984                 do {
985                         dma_clear_pte(pte);
986                         start_pfn += lvl_to_nr_pages(large_page);
987                         pte++;
988                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
989
990                 domain_flush_cache(domain, first_pte,
991                                    (void *)pte - (void *)first_pte);
992
993         } while (start_pfn && start_pfn <= last_pfn);
994 }
995
996 static void dma_pte_free_level(struct dmar_domain *domain, int level,
997                                int retain_level, struct dma_pte *pte,
998                                unsigned long pfn, unsigned long start_pfn,
999                                unsigned long last_pfn)
1000 {
1001         pfn = max(start_pfn, pfn);
1002         pte = &pte[pfn_level_offset(pfn, level)];
1003
1004         do {
1005                 unsigned long level_pfn;
1006                 struct dma_pte *level_pte;
1007
1008                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1009                         goto next;
1010
1011                 level_pfn = pfn & level_mask(level);
1012                 level_pte = phys_to_virt(dma_pte_addr(pte));
1013
1014                 if (level > 2) {
1015                         dma_pte_free_level(domain, level - 1, retain_level,
1016                                            level_pte, level_pfn, start_pfn,
1017                                            last_pfn);
1018                 }
1019
1020                 /*
1021                  * Free the page table if we're below the level we want to
1022                  * retain and the range covers the entire table.
1023                  */
1024                 if (level < retain_level && !(start_pfn > level_pfn ||
1025                       last_pfn < level_pfn + level_size(level) - 1)) {
1026                         dma_clear_pte(pte);
1027                         domain_flush_cache(domain, pte, sizeof(*pte));
1028                         free_pgtable_page(level_pte);
1029                 }
1030 next:
1031                 pfn += level_size(level);
1032         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1033 }
1034
1035 /*
1036  * clear last level (leaf) ptes and free page table pages below the
1037  * level we wish to keep intact.
1038  */
1039 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1040                                    unsigned long start_pfn,
1041                                    unsigned long last_pfn,
1042                                    int retain_level)
1043 {
1044         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1045         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1046         BUG_ON(start_pfn > last_pfn);
1047
1048         dma_pte_clear_range(domain, start_pfn, last_pfn);
1049
1050         /* We don't need lock here; nobody else touches the iova range */
1051         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1052                            domain->pgd, 0, start_pfn, last_pfn);
1053
1054         /* free pgd */
1055         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1056                 free_pgtable_page(domain->pgd);
1057                 domain->pgd = NULL;
1058         }
1059 }
1060
1061 /* When a page at a given level is being unlinked from its parent, we don't
1062    need to *modify* it at all. All we need to do is make a list of all the
1063    pages which can be freed just as soon as we've flushed the IOTLB and we
1064    know the hardware page-walk will no longer touch them.
1065    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1066    be freed. */
1067 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1068                                             int level, struct dma_pte *pte,
1069                                             struct page *freelist)
1070 {
1071         struct page *pg;
1072
1073         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1074         pg->freelist = freelist;
1075         freelist = pg;
1076
1077         if (level == 1)
1078                 return freelist;
1079
1080         pte = page_address(pg);
1081         do {
1082                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1083                         freelist = dma_pte_list_pagetables(domain, level - 1,
1084                                                            pte, freelist);
1085                 pte++;
1086         } while (!first_pte_in_page(pte));
1087
1088         return freelist;
1089 }
1090
1091 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1092                                         struct dma_pte *pte, unsigned long pfn,
1093                                         unsigned long start_pfn,
1094                                         unsigned long last_pfn,
1095                                         struct page *freelist)
1096 {
1097         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1098
1099         pfn = max(start_pfn, pfn);
1100         pte = &pte[pfn_level_offset(pfn, level)];
1101
1102         do {
1103                 unsigned long level_pfn;
1104
1105                 if (!dma_pte_present(pte))
1106                         goto next;
1107
1108                 level_pfn = pfn & level_mask(level);
1109
1110                 /* If range covers entire pagetable, free it */
1111                 if (start_pfn <= level_pfn &&
1112                     last_pfn >= level_pfn + level_size(level) - 1) {
1113                         /* These subordinate page tables are going away entirely. Don't
1114                            bother to clear them; we're just going to *free* them. */
1115                         if (level > 1 && !dma_pte_superpage(pte))
1116                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1117
1118                         dma_clear_pte(pte);
1119                         if (!first_pte)
1120                                 first_pte = pte;
1121                         last_pte = pte;
1122                 } else if (level > 1) {
1123                         /* Recurse down into a level that isn't *entirely* obsolete */
1124                         freelist = dma_pte_clear_level(domain, level - 1,
1125                                                        phys_to_virt(dma_pte_addr(pte)),
1126                                                        level_pfn, start_pfn, last_pfn,
1127                                                        freelist);
1128                 }
1129 next:
1130                 pfn += level_size(level);
1131         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1132
1133         if (first_pte)
1134                 domain_flush_cache(domain, first_pte,
1135                                    (void *)++last_pte - (void *)first_pte);
1136
1137         return freelist;
1138 }
1139
1140 /* We can't just free the pages because the IOMMU may still be walking
1141    the page tables, and may have cached the intermediate levels. The
1142    pages can only be freed after the IOTLB flush has been done. */
1143 static struct page *domain_unmap(struct dmar_domain *domain,
1144                                  unsigned long start_pfn,
1145                                  unsigned long last_pfn)
1146 {
1147         struct page *freelist;
1148
1149         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1150         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1151         BUG_ON(start_pfn > last_pfn);
1152
1153         /* we don't need lock here; nobody else touches the iova range */
1154         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1155                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1156
1157         /* free pgd */
1158         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1159                 struct page *pgd_page = virt_to_page(domain->pgd);
1160                 pgd_page->freelist = freelist;
1161                 freelist = pgd_page;
1162
1163                 domain->pgd = NULL;
1164         }
1165
1166         return freelist;
1167 }
1168
1169 static void dma_free_pagelist(struct page *freelist)
1170 {
1171         struct page *pg;
1172
1173         while ((pg = freelist)) {
1174                 freelist = pg->freelist;
1175                 free_pgtable_page(page_address(pg));
1176         }
1177 }
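
/*
 * Usage sketch (editor's note, hypothetical helper): the comment above
 * domain_unmap() dictates the ordering: detach the page tables first,
 * flush the IOTLB, and only then return the pages to the allocator.
 * iommu_flush_iotlb_psi() is defined further down in this file.
 */
#if 0
static void example_unmap_range(struct intel_iommu *iommu,
				struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	struct page *freelist;

	freelist = domain_unmap(domain, start_pfn, last_pfn);
	iommu_flush_iotlb_psi(iommu, domain, start_pfn,
			      last_pfn - start_pfn + 1, 0, 0);
	dma_free_pagelist(freelist);	/* safe only after the flush */
}
#endif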
1178
1179 static void iova_entry_free(unsigned long data)
1180 {
1181         struct page *freelist = (struct page *)data;
1182
1183         dma_free_pagelist(freelist);
1184 }
1185
1186 /* iommu handling */
1187 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1188 {
1189         struct root_entry *root;
1190         unsigned long flags;
1191
1192         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1193         if (!root) {
1194                 pr_err("Allocating root entry for %s failed\n",
1195                         iommu->name);
1196                 return -ENOMEM;
1197         }
1198
1199         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1200
1201         spin_lock_irqsave(&iommu->lock, flags);
1202         iommu->root_entry = root;
1203         spin_unlock_irqrestore(&iommu->lock, flags);
1204
1205         return 0;
1206 }
1207
1208 static void iommu_set_root_entry(struct intel_iommu *iommu)
1209 {
1210         u64 addr;
1211         u32 sts;
1212         unsigned long flag;
1213
1214         addr = virt_to_phys(iommu->root_entry);
1215         if (sm_supported(iommu))
1216                 addr |= DMA_RTADDR_SMT;
1217
1218         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1219         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1220
1221         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1222
1223         /* Make sure hardware complete it */
1224         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225                       readl, (sts & DMA_GSTS_RTPS), sts);
1226
1227         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1228 }
1229
1230 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1231 {
1232         u32 val;
1233         unsigned long flag;
1234
1235         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1236                 return;
1237
1238         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1239         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1240
1241         /* Make sure hardware complete it */
1242         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1243                       readl, (!(val & DMA_GSTS_WBFS)), val);
1244
1245         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1246 }
1247
1248 /* return value determines if we need a write buffer flush */
1249 static void __iommu_flush_context(struct intel_iommu *iommu,
1250                                   u16 did, u16 source_id, u8 function_mask,
1251                                   u64 type)
1252 {
1253         u64 val = 0;
1254         unsigned long flag;
1255
1256         switch (type) {
1257         case DMA_CCMD_GLOBAL_INVL:
1258                 val = DMA_CCMD_GLOBAL_INVL;
1259                 break;
1260         case DMA_CCMD_DOMAIN_INVL:
1261                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1262                 break;
1263         case DMA_CCMD_DEVICE_INVL:
1264                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1265                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1266                 break;
1267         default:
1268                 BUG();
1269         }
1270         val |= DMA_CCMD_ICC;
1271
1272         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1273         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1274
1275         /* Make sure hardware complete it */
1276         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1277                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1278
1279         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1280 }
1281
1282 /* return value determines if we need a write buffer flush */
1283 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1284                                 u64 addr, unsigned int size_order, u64 type)
1285 {
1286         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1287         u64 val = 0, val_iva = 0;
1288         unsigned long flag;
1289
1290         switch (type) {
1291         case DMA_TLB_GLOBAL_FLUSH:
1292                 /* global flush doesn't need to set IVA_REG */
1293                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1294                 break;
1295         case DMA_TLB_DSI_FLUSH:
1296                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1297                 break;
1298         case DMA_TLB_PSI_FLUSH:
1299                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1300                 /* IH bit is passed in as part of address */
1301                 val_iva = size_order | addr;
1302                 break;
1303         default:
1304                 BUG();
1305         }
1306         /* Note: set drain read/write */
1307 #if 0
1308         /*
1309          * This is probably meant to be extra safe.  It looks like we can
1310          * ignore it without any impact.
1311          */
1312         if (cap_read_drain(iommu->cap))
1313                 val |= DMA_TLB_READ_DRAIN;
1314 #endif
1315         if (cap_write_drain(iommu->cap))
1316                 val |= DMA_TLB_WRITE_DRAIN;
1317
1318         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1319         /* Note: Only uses first TLB reg currently */
1320         if (val_iva)
1321                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1322         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1323
1324         /* Make sure hardware complete it */
1325         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1326                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1327
1328         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1329
1330         /* check IOTLB invalidation granularity */
1331         if (DMA_TLB_IAIG(val) == 0)
1332                 pr_err("Flush IOTLB failed\n");
1333         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1334                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1335                         (unsigned long long)DMA_TLB_IIRG(type),
1336                         (unsigned long long)DMA_TLB_IAIG(val));
1337 }
1338
1339 static struct device_domain_info *
1340 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1341                          u8 bus, u8 devfn)
1342 {
1343         struct device_domain_info *info;
1344
1345         assert_spin_locked(&device_domain_lock);
1346
1347         if (!iommu->qi)
1348                 return NULL;
1349
1350         list_for_each_entry(info, &domain->devices, link)
1351                 if (info->iommu == iommu && info->bus == bus &&
1352                     info->devfn == devfn) {
1353                         if (info->ats_supported && info->dev)
1354                                 return info;
1355                         break;
1356                 }
1357
1358         return NULL;
1359 }
1360
1361 static void domain_update_iotlb(struct dmar_domain *domain)
1362 {
1363         struct device_domain_info *info;
1364         bool has_iotlb_device = false;
1365
1366         assert_spin_locked(&device_domain_lock);
1367
1368         list_for_each_entry(info, &domain->devices, link) {
1369                 struct pci_dev *pdev;
1370
1371                 if (!info->dev || !dev_is_pci(info->dev))
1372                         continue;
1373
1374                 pdev = to_pci_dev(info->dev);
1375                 if (pdev->ats_enabled) {
1376                         has_iotlb_device = true;
1377                         break;
1378                 }
1379         }
1380
1381         domain->has_iotlb_device = has_iotlb_device;
1382 }
1383
1384 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1385 {
1386         struct pci_dev *pdev;
1387
1388         assert_spin_locked(&device_domain_lock);
1389
1390         if (!info || !dev_is_pci(info->dev))
1391                 return;
1392
1393         pdev = to_pci_dev(info->dev);
1394         /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1395          * the PFSID to a VF's invalidation descriptors so that the IOMMU HW can
1396          * gauge queue depth at the PF level. If DIT is not set, the PFSID field
1397          * is treated as reserved and should be set to 0.
1398          */
1399         if (!ecap_dit(info->iommu->ecap))
1400                 info->pfsid = 0;
1401         else {
1402                 struct pci_dev *pf_pdev;
1403
1404                 /* pdev will be returned if the device is not a VF */
1405                 pf_pdev = pci_physfn(pdev);
1406                 info->pfsid = pci_dev_id(pf_pdev);
1407         }
1408
1409 #ifdef CONFIG_INTEL_IOMMU_SVM
1410         /* The PCIe spec, in its wisdom, declares that the behaviour of
1411            the device if you enable PASID support after ATS support is
1412            undefined. So always enable PASID support on devices which
1413            have it, even if we can't yet know if we're ever going to
1414            use it. */
1415         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1416                 info->pasid_enabled = 1;
1417
1418         if (info->pri_supported &&
1419             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1420             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1421                 info->pri_enabled = 1;
1422 #endif
1423         if (!pdev->untrusted && info->ats_supported &&
1424             pci_ats_page_aligned(pdev) &&
1425             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1426                 info->ats_enabled = 1;
1427                 domain_update_iotlb(info->domain);
1428                 info->ats_qdep = pci_ats_queue_depth(pdev);
1429         }
1430 }
1431
1432 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1433 {
1434         struct pci_dev *pdev;
1435
1436         assert_spin_locked(&device_domain_lock);
1437
1438         if (!dev_is_pci(info->dev))
1439                 return;
1440
1441         pdev = to_pci_dev(info->dev);
1442
1443         if (info->ats_enabled) {
1444                 pci_disable_ats(pdev);
1445                 info->ats_enabled = 0;
1446                 domain_update_iotlb(info->domain);
1447         }
1448 #ifdef CONFIG_INTEL_IOMMU_SVM
1449         if (info->pri_enabled) {
1450                 pci_disable_pri(pdev);
1451                 info->pri_enabled = 0;
1452         }
1453         if (info->pasid_enabled) {
1454                 pci_disable_pasid(pdev);
1455                 info->pasid_enabled = 0;
1456         }
1457 #endif
1458 }
1459
1460 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1461                                   u64 addr, unsigned mask)
1462 {
1463         u16 sid, qdep;
1464         unsigned long flags;
1465         struct device_domain_info *info;
1466
1467         if (!domain->has_iotlb_device)
1468                 return;
1469
1470         spin_lock_irqsave(&device_domain_lock, flags);
1471         list_for_each_entry(info, &domain->devices, link) {
1472                 if (!info->ats_enabled)
1473                         continue;
1474
1475                 sid = info->bus << 8 | info->devfn;
1476                 qdep = info->ats_qdep;
1477                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1478                                 qdep, addr, mask);
1479         }
1480         spin_unlock_irqrestore(&device_domain_lock, flags);
1481 }
1482
1483 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1484                                   struct dmar_domain *domain,
1485                                   unsigned long pfn, unsigned int pages,
1486                                   int ih, int map)
1487 {
1488         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1489         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1490         u16 did = domain->iommu_did[iommu->seq_id];
1491
1492         BUG_ON(pages == 0);
1493
1494         if (ih)
1495                 ih = 1 << 6;
1496         /*
1497          * Fall back to a domain-selective flush if there is no PSI support or
1498          * the size is too big.
1499          * PSI requires the page count to be a power of two and the base address
1500          * to be naturally aligned to that size.
1501          */
1502         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1503                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1504                                                 DMA_TLB_DSI_FLUSH);
1505         else
1506                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1507                                                 DMA_TLB_PSI_FLUSH);
1508
1509         /*
1510          * In caching mode, changes from non-present to present mappings require
1511          * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1512          */
1513         if (!cap_caching_mode(iommu->cap) || !map)
1514                 iommu_flush_dev_iotlb(domain, addr, mask);
1515 }
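
/*
 * Worked example (editor's note): for pages = 3,
 * mask = ilog2(__roundup_pow_of_two(3)) = 2, so the PSI covers 2^2 = 4
 * pages starting at the (naturally aligned) addr; if mask exceeded
 * cap_max_amask_val() the code above would fall back to the
 * domain-selective flush instead.
 */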
1516
1517 /* Notification for newly created mappings */
1518 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1519                                         struct dmar_domain *domain,
1520                                         unsigned long pfn, unsigned int pages)
1521 {
1522         /* It's a non-present to present mapping. Only flush if caching mode */
1523         if (cap_caching_mode(iommu->cap))
1524                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1525         else
1526                 iommu_flush_write_buffer(iommu);
1527 }
1528
1529 static void iommu_flush_iova(struct iova_domain *iovad)
1530 {
1531         struct dmar_domain *domain;
1532         int idx;
1533
1534         domain = container_of(iovad, struct dmar_domain, iovad);
1535
1536         for_each_domain_iommu(idx, domain) {
1537                 struct intel_iommu *iommu = g_iommus[idx];
1538                 u16 did = domain->iommu_did[iommu->seq_id];
1539
1540                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1541
1542                 if (!cap_caching_mode(iommu->cap))
1543                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1544                                               0, MAX_AGAW_PFN_WIDTH);
1545         }
1546 }
1547
1548 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1549 {
1550         u32 pmen;
1551         unsigned long flags;
1552
1553         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1554                 return;
1555
1556         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1557         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1558         pmen &= ~DMA_PMEN_EPM;
1559         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1560
1561         /* wait for the protected region status bit to clear */
1562         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1563                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1564
1565         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1566 }
1567
1568 static void iommu_enable_translation(struct intel_iommu *iommu)
1569 {
1570         u32 sts;
1571         unsigned long flags;
1572
1573         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1574         iommu->gcmd |= DMA_GCMD_TE;
1575         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1576
1577         /* Make sure hardware completes it */
1578         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1579                       readl, (sts & DMA_GSTS_TES), sts);
1580
1581         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1582 }
1583
1584 static void iommu_disable_translation(struct intel_iommu *iommu)
1585 {
1586         u32 sts;
1587         unsigned long flag;
1588
1589         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1590         iommu->gcmd &= ~DMA_GCMD_TE;
1591         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1592
1593         /* Make sure hardware completes it */
1594         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1595                       readl, (!(sts & DMA_GSTS_TES)), sts);
1596
1597         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1598 }
1599
1600 static int iommu_init_domains(struct intel_iommu *iommu)
1601 {
1602         u32 ndomains, nlongs;
1603         size_t size;
1604
1605         ndomains = cap_ndoms(iommu->cap);
1606         pr_debug("%s: Number of Domains supported <%d>\n",
1607                  iommu->name, ndomains);
1608         nlongs = BITS_TO_LONGS(ndomains);
1609
1610         spin_lock_init(&iommu->lock);
1611
1612         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1613         if (!iommu->domain_ids) {
1614                 pr_err("%s: Allocating domain id array failed\n",
1615                        iommu->name);
1616                 return -ENOMEM;
1617         }
1618
1619         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1620         iommu->domains = kzalloc(size, GFP_KERNEL);
1621
1622         if (iommu->domains) {
1623                 size = 256 * sizeof(struct dmar_domain *);
1624                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1625         }
1626
1627         if (!iommu->domains || !iommu->domains[0]) {
1628                 pr_err("%s: Allocating domain array failed\n",
1629                        iommu->name);
1630                 kfree(iommu->domain_ids);
1631                 kfree(iommu->domains);
1632                 iommu->domain_ids = NULL;
1633                 iommu->domains    = NULL;
1634                 return -ENOMEM;
1635         }
1636
1637         /*
1638          * If Caching mode is set, then invalid translations are tagged
1639          * with domain-id 0, hence we need to pre-allocate it. We also
1640          * use domain-id 0 as a marker for non-allocated domain-id, so
1641          * make sure it is not used for a real domain.
1642          */
1643         set_bit(0, iommu->domain_ids);
1644
1645         /*
1646          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1647          * entry for first-level or pass-through translation modes should
1648          * be programmed with a domain id different from those used for
1649          * second-level or nested translation. We reserve a domain id for
1650          * this purpose.
1651          */
1652         if (sm_supported(iommu))
1653                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1654
1655         return 0;
1656 }
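/*
 * A sketch of the two-level iommu->domains layout allocated above, for
 * ndomains = 65536 (the architectural maximum reported by cap_ndoms):
 *
 *      top level:  ALIGN(65536, 256) >> 8 == 256 slots of dmar_domain **
 *      domains[0]: 256 slots of dmar_domain *, preallocated here
 *
 * Domain-id N is then resolved as domains[N >> 8][N & 0xff]; only the
 * first 256-entry chunk is allocated up front, the remaining chunks are
 * allocated on demand when a domain id in their range is first used.
 */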
1657
1658 static void disable_dmar_iommu(struct intel_iommu *iommu)
1659 {
1660         struct device_domain_info *info, *tmp;
1661         unsigned long flags;
1662
1663         if (!iommu->domains || !iommu->domain_ids)
1664                 return;
1665
1666         spin_lock_irqsave(&device_domain_lock, flags);
1667         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1668                 if (info->iommu != iommu)
1669                         continue;
1670
1671                 if (!info->dev || !info->domain)
1672                         continue;
1673
1674                 __dmar_remove_one_dev_info(info);
1675         }
1676         spin_unlock_irqrestore(&device_domain_lock, flags);
1677
1678         if (iommu->gcmd & DMA_GCMD_TE)
1679                 iommu_disable_translation(iommu);
1680 }
1681
1682 static void free_dmar_iommu(struct intel_iommu *iommu)
1683 {
1684         if ((iommu->domains) && (iommu->domain_ids)) {
1685                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1686                 int i;
1687
1688                 for (i = 0; i < elems; i++)
1689                         kfree(iommu->domains[i]);
1690                 kfree(iommu->domains);
1691                 kfree(iommu->domain_ids);
1692                 iommu->domains = NULL;
1693                 iommu->domain_ids = NULL;
1694         }
1695
1696         g_iommus[iommu->seq_id] = NULL;
1697
1698         /* free context mapping */
1699         free_context_table(iommu);
1700
1701 #ifdef CONFIG_INTEL_IOMMU_SVM
1702         if (pasid_supported(iommu)) {
1703                 if (ecap_prs(iommu->ecap))
1704                         intel_svm_finish_prq(iommu);
1705         }
1706 #endif
1707 }
1708
1709 static struct dmar_domain *alloc_domain(int flags)
1710 {
1711         struct dmar_domain *domain;
1712
1713         domain = alloc_domain_mem();
1714         if (!domain)
1715                 return NULL;
1716
1717         memset(domain, 0, sizeof(*domain));
1718         domain->nid = NUMA_NO_NODE;
1719         domain->flags = flags;
1720         domain->has_iotlb_device = false;
1721         INIT_LIST_HEAD(&domain->devices);
1722
1723         return domain;
1724 }
1725
1726 /* Must be called with iommu->lock */
1727 static int domain_attach_iommu(struct dmar_domain *domain,
1728                                struct intel_iommu *iommu)
1729 {
1730         unsigned long ndomains;
1731         int num;
1732
1733         assert_spin_locked(&device_domain_lock);
1734         assert_spin_locked(&iommu->lock);
1735
1736         domain->iommu_refcnt[iommu->seq_id] += 1;
1737         domain->iommu_count += 1;
1738         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1739                 ndomains = cap_ndoms(iommu->cap);
1740                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1741
1742                 if (num >= ndomains) {
1743                         pr_err("%s: No free domain ids\n", iommu->name);
1744                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1745                         domain->iommu_count -= 1;
1746                         return -ENOSPC;
1747                 }
1748
1749                 set_bit(num, iommu->domain_ids);
1750                 set_iommu_domain(iommu, num, domain);
1751
1752                 domain->iommu_did[iommu->seq_id] = num;
1753                 domain->nid                      = iommu->node;
1754
1755                 domain_update_iommu_cap(domain);
1756         }
1757
1758         return 0;
1759 }
1760
1761 static int domain_detach_iommu(struct dmar_domain *domain,
1762                                struct intel_iommu *iommu)
1763 {
1764         int num, count;
1765
1766         assert_spin_locked(&device_domain_lock);
1767         assert_spin_locked(&iommu->lock);
1768
1769         domain->iommu_refcnt[iommu->seq_id] -= 1;
1770         count = --domain->iommu_count;
1771         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1772                 num = domain->iommu_did[iommu->seq_id];
1773                 clear_bit(num, iommu->domain_ids);
1774                 set_iommu_domain(iommu, num, NULL);
1775
1776                 domain_update_iommu_cap(domain);
1777                 domain->iommu_did[iommu->seq_id] = 0;
1778         }
1779
1780         return count;
1781 }
1782
1783 static struct iova_domain reserved_iova_list;
1784 static struct lock_class_key reserved_rbtree_key;
1785
1786 static int dmar_init_reserved_ranges(void)
1787 {
1788         struct pci_dev *pdev = NULL;
1789         struct iova *iova;
1790         int i;
1791
1792         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1793
1794         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1795                 &reserved_rbtree_key);
1796
1797         /* IOAPIC ranges shouldn't be accessed by DMA */
1798         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1799                 IOVA_PFN(IOAPIC_RANGE_END));
1800         if (!iova) {
1801                 pr_err("Reserve IOAPIC range failed\n");
1802                 return -ENODEV;
1803         }
1804
1805         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1806         for_each_pci_dev(pdev) {
1807                 struct resource *r;
1808
1809                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1810                         r = &pdev->resource[i];
1811                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1812                                 continue;
1813                         iova = reserve_iova(&reserved_iova_list,
1814                                             IOVA_PFN(r->start),
1815                                             IOVA_PFN(r->end));
1816                         if (!iova) {
1817                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1818                                 return -ENODEV;
1819                         }
1820                 }
1821         }
1822         return 0;
1823 }
1824
1825 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1826 {
1827         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1828 }
1829
1830 static inline int guestwidth_to_adjustwidth(int gaw)
1831 {
1832         int agaw;
1833         int r = (gaw - 12) % 9;
1834
1835         if (r == 0)
1836                 agaw = gaw;
1837         else
1838                 agaw = gaw + 9 - r;
1839         if (agaw > 64)
1840                 agaw = 64;
1841         return agaw;
1842 }
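/*
 * A few worked values for the adjustment above: a usable AGAW is of the
 * form 12 + 9 * n (30, 39, 48, 57, ...), so the guest width is rounded up
 * to the next such value and capped at 64:
 *
 *      guestwidth_to_adjustwidth(36) == 39     // r == 6, 36 + 9 - 6
 *      guestwidth_to_adjustwidth(40) == 48     // r == 1, 40 + 9 - 1
 *      guestwidth_to_adjustwidth(48) == 48     // r == 0, already aligned
 *      guestwidth_to_adjustwidth(62) == 64     // 66 clamped to the cap
 */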
1843
1844 static void domain_exit(struct dmar_domain *domain)
1845 {
1846         struct page *freelist;
1847
1848         /* Remove associated devices and clear attached or cached domains */
1849         domain_remove_dev_info(domain);
1850
1851         /* destroy iovas */
1852         put_iova_domain(&domain->iovad);
1853
1854         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1855
1856         dma_free_pagelist(freelist);
1857
1858         free_domain_mem(domain);
1859 }
1860
1861 /*
1862  * Get the PASID directory size for a scalable mode context entry.
1863  * A value of X in the PDTS field of a scalable mode context entry
1864  * indicates a PASID directory with 2^(X + 7) entries.
1865  */
1866 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1867 {
1868         int pds, max_pde;
1869
1870         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1871         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1872         if (pds < 7)
1873                 return 0;
1874
1875         return pds - 7;
1876 }
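/*
 * A worked example (a sketch, assuming PASID_PDE_SHIFT is 6, i.e. 64
 * PASID-table entries per directory entry): for table->max_pasid = 1 << 20,
 *
 *      max_pde = (1 << 20) >> 6;       // 1 << 14
 *      pds     = 14;                   // lowest set bit of max_pde
 *      return value                    // 14 - 7 == 7
 *
 * so PDTS = 7 encodes a directory of 2^(7 + 7) = 16384 entries, covering
 * 16384 * 64 = 2^20 PASIDs. Anything needing fewer than 2^7 directory
 * entries is clamped to the minimum encoding of 0.
 */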
1877
1878 /*
1879  * Set the RID_PASID field of a scalable mode context entry. The
1880  * IOMMU hardware will use the PASID value set in this field for
1881  * DMA translations of DMA requests without PASID.
1882  */
1883 static inline void
1884 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1885 {
1886         context->hi |= pasid & ((1 << 20) - 1);
1887         context->hi |= (1 << 20);
1888 }
1889
1890 /*
1891  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1892  * entry.
1893  */
1894 static inline void context_set_sm_dte(struct context_entry *context)
1895 {
1896         context->lo |= (1 << 2);
1897 }
1898
1899 /*
1900  * Set the PRE(Page Request Enable) field of a scalable mode context
1901  * entry.
1902  */
1903 static inline void context_set_sm_pre(struct context_entry *context)
1904 {
1905         context->lo |= (1 << 4);
1906 }
1907
1908 /* Convert value to context PASID directory size field coding. */
1909 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1910
1911 static int domain_context_mapping_one(struct dmar_domain *domain,
1912                                       struct intel_iommu *iommu,
1913                                       struct pasid_table *table,
1914                                       u8 bus, u8 devfn)
1915 {
1916         u16 did = domain->iommu_did[iommu->seq_id];
1917         int translation = CONTEXT_TT_MULTI_LEVEL;
1918         struct device_domain_info *info = NULL;
1919         struct context_entry *context;
1920         unsigned long flags;
1921         int ret;
1922
1923         WARN_ON(did == 0);
1924
1925         if (hw_pass_through && domain_type_is_si(domain))
1926                 translation = CONTEXT_TT_PASS_THROUGH;
1927
1928         pr_debug("Set context mapping for %02x:%02x.%d\n",
1929                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1930
1931         BUG_ON(!domain->pgd);
1932
1933         spin_lock_irqsave(&device_domain_lock, flags);
1934         spin_lock(&iommu->lock);
1935
1936         ret = -ENOMEM;
1937         context = iommu_context_addr(iommu, bus, devfn, 1);
1938         if (!context)
1939                 goto out_unlock;
1940
1941         ret = 0;
1942         if (context_present(context))
1943                 goto out_unlock;
1944
1945         /*
1946          * For kdump cases, old valid entries may be cached due to
1947          * in-flight DMA and the copied pgtable, but there is no unmapping
1948          * behaviour for them, so we need an explicit cache flush for the
1949          * newly-mapped device. By this point in kdump the device is
1950          * expected to have finished its reset during driver probe, so no
1951          * in-flight DMA will exist and we don't need to worry about it
1952          * after this.
1953          */
1954         if (context_copied(context)) {
1955                 u16 did_old = context_domain_id(context);
1956
1957                 if (did_old < cap_ndoms(iommu->cap)) {
1958                         iommu->flush.flush_context(iommu, did_old,
1959                                                    (((u16)bus) << 8) | devfn,
1960                                                    DMA_CCMD_MASK_NOBIT,
1961                                                    DMA_CCMD_DEVICE_INVL);
1962                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1963                                                  DMA_TLB_DSI_FLUSH);
1964                 }
1965         }
1966
1967         context_clear_entry(context);
1968
1969         if (sm_supported(iommu)) {
1970                 unsigned long pds;
1971
1972                 WARN_ON(!table);
1973
1974                 /* Setup the PASID DIR pointer: */
1975                 pds = context_get_sm_pds(table);
1976                 context->lo = (u64)virt_to_phys(table->table) |
1977                                 context_pdts(pds);
1978
1979                 /* Setup the RID_PASID field: */
1980                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
1981
1982                 /*
1983                  * Setup the Device-TLB enable bit and Page request
1984                  * Enable bit:
1985                  */
1986                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1987                 if (info && info->ats_supported)
1988                         context_set_sm_dte(context);
1989                 if (info && info->pri_supported)
1990                         context_set_sm_pre(context);
1991         } else {
1992                 struct dma_pte *pgd = domain->pgd;
1993                 int agaw;
1994
1995                 context_set_domain_id(context, did);
1996
1997                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1998                         /*
1999                          * Skip top levels of page tables for iommu which has
2000                          * less agaw than default. Unnecessary for PT mode.
2001                          */
2002                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2003                                 ret = -ENOMEM;
2004                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2005                                 if (!dma_pte_present(pgd))
2006                                         goto out_unlock;
2007                         }
2008
2009                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2010                         if (info && info->ats_supported)
2011                                 translation = CONTEXT_TT_DEV_IOTLB;
2012                         else
2013                                 translation = CONTEXT_TT_MULTI_LEVEL;
2014
2015                         context_set_address_root(context, virt_to_phys(pgd));
2016                         context_set_address_width(context, agaw);
2017                 } else {
2018                         /*
2019                          * In pass through mode, AW must be programmed to
2020                          * indicate the largest AGAW value supported by
2021                          * hardware. And ASR is ignored by hardware.
2022                          */
2023                         context_set_address_width(context, iommu->msagaw);
2024                 }
2025
2026                 context_set_translation_type(context, translation);
2027         }
2028
2029         context_set_fault_enable(context);
2030         context_set_present(context);
2031         domain_flush_cache(domain, context, sizeof(*context));
2032
2033         /*
2034          * It's a non-present to present mapping. If the hardware doesn't cache
2035          * non-present entries we only need to flush the write-buffer. If it
2036          * _does_ cache non-present entries, then it does so in the special
2037          * domain #0, which we have to flush:
2038          */
2039         if (cap_caching_mode(iommu->cap)) {
2040                 iommu->flush.flush_context(iommu, 0,
2041                                            (((u16)bus) << 8) | devfn,
2042                                            DMA_CCMD_MASK_NOBIT,
2043                                            DMA_CCMD_DEVICE_INVL);
2044                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2045         } else {
2046                 iommu_flush_write_buffer(iommu);
2047         }
2048         iommu_enable_dev_iotlb(info);
2049
2050         ret = 0;
2051
2052 out_unlock:
2053         spin_unlock(&iommu->lock);
2054         spin_unlock_irqrestore(&device_domain_lock, flags);
2055
2056         return ret;
2057 }
2058
2059 struct domain_context_mapping_data {
2060         struct dmar_domain *domain;
2061         struct intel_iommu *iommu;
2062         struct pasid_table *table;
2063 };
2064
2065 static int domain_context_mapping_cb(struct pci_dev *pdev,
2066                                      u16 alias, void *opaque)
2067 {
2068         struct domain_context_mapping_data *data = opaque;
2069
2070         return domain_context_mapping_one(data->domain, data->iommu,
2071                                           data->table, PCI_BUS_NUM(alias),
2072                                           alias & 0xff);
2073 }
2074
2075 static int
2076 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2077 {
2078         struct domain_context_mapping_data data;
2079         struct pasid_table *table;
2080         struct intel_iommu *iommu;
2081         u8 bus, devfn;
2082
2083         iommu = device_to_iommu(dev, &bus, &devfn);
2084         if (!iommu)
2085                 return -ENODEV;
2086
2087         table = intel_pasid_get_table(dev);
2088
2089         if (!dev_is_pci(dev))
2090                 return domain_context_mapping_one(domain, iommu, table,
2091                                                   bus, devfn);
2092
2093         data.domain = domain;
2094         data.iommu = iommu;
2095         data.table = table;
2096
2097         return pci_for_each_dma_alias(to_pci_dev(dev),
2098                                       &domain_context_mapping_cb, &data);
2099 }
2100
2101 static int domain_context_mapped_cb(struct pci_dev *pdev,
2102                                     u16 alias, void *opaque)
2103 {
2104         struct intel_iommu *iommu = opaque;
2105
2106         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2107 }
2108
2109 static int domain_context_mapped(struct device *dev)
2110 {
2111         struct intel_iommu *iommu;
2112         u8 bus, devfn;
2113
2114         iommu = device_to_iommu(dev, &bus, &devfn);
2115         if (!iommu)
2116                 return -ENODEV;
2117
2118         if (!dev_is_pci(dev))
2119                 return device_context_mapped(iommu, bus, devfn);
2120
2121         return !pci_for_each_dma_alias(to_pci_dev(dev),
2122                                        domain_context_mapped_cb, iommu);
2123 }
2124
2125 /* Returns the number of VT-d pages, but aligned to the MM page size */
2126 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2127                                             size_t size)
2128 {
2129         host_addr &= ~PAGE_MASK;
2130         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2131 }
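/*
 * A worked example (assuming PAGE_SHIFT == VTD_PAGE_SHIFT == 12): for
 * host_addr = 0x1100 and size = 0x3000,
 *
 *      host_addr &= ~PAGE_MASK;                // 0x100, offset into page
 *      PAGE_ALIGN(0x100 + 0x3000)              // 0x4000
 *              >> VTD_PAGE_SHIFT               // 4 VT-d pages
 *
 * With a larger MM page size the count is still returned in 4KiB VT-d
 * pages, but rounded up to cover a whole number of MM pages.
 */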
2132
2133 /* Return largest possible superpage level for a given mapping */
2134 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2135                                           unsigned long iov_pfn,
2136                                           unsigned long phy_pfn,
2137                                           unsigned long pages)
2138 {
2139         int support, level = 1;
2140         unsigned long pfnmerge;
2141
2142         support = domain->iommu_superpage;
2143
2144         /* To use a large page, the virtual *and* physical addresses
2145            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2146            of them will mean we have to use smaller pages. So just
2147            merge them and check both at once. */
2148         pfnmerge = iov_pfn | phy_pfn;
2149
2150         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2151                 pages >>= VTD_STRIDE_SHIFT;
2152                 if (!pages)
2153                         break;
2154                 pfnmerge >>= VTD_STRIDE_SHIFT;
2155                 level++;
2156                 support--;
2157         }
2158         return level;
2159 }
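/*
 * Worked examples (assuming VTD_STRIDE_SHIFT is 9): with both iov_pfn and
 * phy_pfn 2MiB aligned (low 9 PFN bits clear), pages >= 512 and
 * domain->iommu_superpage >= 1, the loop runs once and returns level 2,
 * i.e. a 2MiB superpage. With 1GiB alignment, pages >= 512 * 512 and
 * iommu_superpage >= 2, it returns level 3 (1GiB). Any set bit in the low
 * 9 bits of either PFN keeps the mapping at level 1 (4KiB pages).
 */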
2160
2161 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2162                             struct scatterlist *sg, unsigned long phys_pfn,
2163                             unsigned long nr_pages, int prot)
2164 {
2165         struct dma_pte *first_pte = NULL, *pte = NULL;
2166         phys_addr_t uninitialized_var(pteval);
2167         unsigned long sg_res = 0;
2168         unsigned int largepage_lvl = 0;
2169         unsigned long lvl_pages = 0;
2170
2171         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2172
2173         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2174                 return -EINVAL;
2175
2176         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2177
2178         if (!sg) {
2179                 sg_res = nr_pages;
2180                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2181         }
2182
2183         while (nr_pages > 0) {
2184                 uint64_t tmp;
2185
2186                 if (!sg_res) {
2187                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2188
2189                         sg_res = aligned_nrpages(sg->offset, sg->length);
2190                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2191                         sg->dma_length = sg->length;
2192                         pteval = (sg_phys(sg) - pgoff) | prot;
2193                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2194                 }
2195
2196                 if (!pte) {
2197                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2198
2199                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2200                         if (!pte)
2201                                 return -ENOMEM;
2202                         /* It is a large page */
2203                         if (largepage_lvl > 1) {
2204                                 unsigned long nr_superpages, end_pfn;
2205
2206                                 pteval |= DMA_PTE_LARGE_PAGE;
2207                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2208
2209                                 nr_superpages = sg_res / lvl_pages;
2210                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2211
2212                                 /*
2213                                  * Ensure that old small page tables are
2214                                  * removed to make room for superpage(s).
2215                                  * We're adding new large pages, so make sure
2216                                  * we don't remove their parent tables.
2217                                  */
2218                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2219                                                        largepage_lvl + 1);
2220                         } else {
2221                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2222                         }
2223
2224                 }
2225                 /* We don't need a lock here; nobody else
2226                  * touches this iova range
2227                  */
2228                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2229                 if (tmp) {
2230                         static int dumps = 5;
2231                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2232                                 iov_pfn, tmp, (unsigned long long)pteval);
2233                         if (dumps) {
2234                                 dumps--;
2235                                 debug_dma_dump_mappings(NULL);
2236                         }
2237                         WARN_ON(1);
2238                 }
2239
2240                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2241
2242                 BUG_ON(nr_pages < lvl_pages);
2243                 BUG_ON(sg_res < lvl_pages);
2244
2245                 nr_pages -= lvl_pages;
2246                 iov_pfn += lvl_pages;
2247                 phys_pfn += lvl_pages;
2248                 pteval += lvl_pages * VTD_PAGE_SIZE;
2249                 sg_res -= lvl_pages;
2250
2251                 /* If the next PTE would be the first in a new page, then we
2252                    need to flush the cache on the entries we've just written.
2253                    And then we'll need to recalculate 'pte', so clear it and
2254                    let it get set again in the if (!pte) block above.
2255
2256                    If we're done (!nr_pages) we need to flush the cache too.
2257
2258                    Also if we've been setting superpages, we may need to
2259                    recalculate 'pte' and switch back to smaller pages for the
2260                    end of the mapping, if the trailing size is not enough to
2261                    use another superpage (i.e. sg_res < lvl_pages). */
2262                 pte++;
2263                 if (!nr_pages || first_pte_in_page(pte) ||
2264                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2265                         domain_flush_cache(domain, first_pte,
2266                                            (void *)pte - (void *)first_pte);
2267                         pte = NULL;
2268                 }
2269
2270                 if (!sg_res && nr_pages)
2271                         sg = sg_next(sg);
2272         }
2273         return 0;
2274 }
2275
2276 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2277                           struct scatterlist *sg, unsigned long phys_pfn,
2278                           unsigned long nr_pages, int prot)
2279 {
2280         int iommu_id, ret;
2281         struct intel_iommu *iommu;
2282
2283         /* Do the real mapping first */
2284         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2285         if (ret)
2286                 return ret;
2287
2288         for_each_domain_iommu(iommu_id, domain) {
2289                 iommu = g_iommus[iommu_id];
2290                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2291         }
2292
2293         return 0;
2294 }
2295
2296 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2297                                     struct scatterlist *sg, unsigned long nr_pages,
2298                                     int prot)
2299 {
2300         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2301 }
2302
2303 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2304                                      unsigned long phys_pfn, unsigned long nr_pages,
2305                                      int prot)
2306 {
2307         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2308 }
2309
2310 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2311 {
2312         unsigned long flags;
2313         struct context_entry *context;
2314         u16 did_old;
2315
2316         if (!iommu)
2317                 return;
2318
2319         spin_lock_irqsave(&iommu->lock, flags);
2320         context = iommu_context_addr(iommu, bus, devfn, 0);
2321         if (!context) {
2322                 spin_unlock_irqrestore(&iommu->lock, flags);
2323                 return;
2324         }
2325         did_old = context_domain_id(context);
2326         context_clear_entry(context);
2327         __iommu_flush_cache(iommu, context, sizeof(*context));
2328         spin_unlock_irqrestore(&iommu->lock, flags);
2329         iommu->flush.flush_context(iommu,
2330                                    did_old,
2331                                    (((u16)bus) << 8) | devfn,
2332                                    DMA_CCMD_MASK_NOBIT,
2333                                    DMA_CCMD_DEVICE_INVL);
2334         iommu->flush.flush_iotlb(iommu,
2335                                  did_old,
2336                                  0,
2337                                  0,
2338                                  DMA_TLB_DSI_FLUSH);
2339 }
2340
2341 static inline void unlink_domain_info(struct device_domain_info *info)
2342 {
2343         assert_spin_locked(&device_domain_lock);
2344         list_del(&info->link);
2345         list_del(&info->global);
2346         if (info->dev)
2347                 info->dev->archdata.iommu = NULL;
2348 }
2349
2350 static void domain_remove_dev_info(struct dmar_domain *domain)
2351 {
2352         struct device_domain_info *info, *tmp;
2353         unsigned long flags;
2354
2355         spin_lock_irqsave(&device_domain_lock, flags);
2356         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2357                 __dmar_remove_one_dev_info(info);
2358         spin_unlock_irqrestore(&device_domain_lock, flags);
2359 }
2360
2361 /*
2362  * find_domain
2363  * Note: we use struct device->archdata.iommu to store the info
2364  */
2365 static struct dmar_domain *find_domain(struct device *dev)
2366 {
2367         struct device_domain_info *info;
2368
2369         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2370                 struct iommu_domain *domain;
2371
2372                 dev->archdata.iommu = NULL;
2373                 domain = iommu_get_domain_for_dev(dev);
2374                 if (domain)
2375                         intel_iommu_attach_device(domain, dev);
2376         }
2377
2378         /* No lock here, assumes no domain exit in normal case */
2379         info = dev->archdata.iommu;
2380
2381         if (likely(info))
2382                 return info->domain;
2383         return NULL;
2384 }
2385
2386 static inline struct device_domain_info *
2387 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2388 {
2389         struct device_domain_info *info;
2390
2391         list_for_each_entry(info, &device_domain_list, global)
2392                 if (info->iommu->segment == segment && info->bus == bus &&
2393                     info->devfn == devfn)
2394                         return info;
2395
2396         return NULL;
2397 }
2398
2399 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2400                                                     int bus, int devfn,
2401                                                     struct device *dev,
2402                                                     struct dmar_domain *domain)
2403 {
2404         struct dmar_domain *found = NULL;
2405         struct device_domain_info *info;
2406         unsigned long flags;
2407         int ret;
2408
2409         info = alloc_devinfo_mem();
2410         if (!info)
2411                 return NULL;
2412
2413         info->bus = bus;
2414         info->devfn = devfn;
2415         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2416         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2417         info->ats_qdep = 0;
2418         info->dev = dev;
2419         info->domain = domain;
2420         info->iommu = iommu;
2421         info->pasid_table = NULL;
2422         info->auxd_enabled = 0;
2423         INIT_LIST_HEAD(&info->auxiliary_domains);
2424
2425         if (dev && dev_is_pci(dev)) {
2426                 struct pci_dev *pdev = to_pci_dev(info->dev);
2427
2428                 if (!pdev->untrusted &&
2429                     !pci_ats_disabled() &&
2430                     ecap_dev_iotlb_support(iommu->ecap) &&
2431                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2432                     dmar_find_matched_atsr_unit(pdev))
2433                         info->ats_supported = 1;
2434
2435                 if (sm_supported(iommu)) {
2436                         if (pasid_supported(iommu)) {
2437                                 int features = pci_pasid_features(pdev);
2438                                 if (features >= 0)
2439                                         info->pasid_supported = features | 1;
2440                         }
2441
2442                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2443                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2444                                 info->pri_supported = 1;
2445                 }
2446         }
2447
2448         spin_lock_irqsave(&device_domain_lock, flags);
2449         if (dev)
2450                 found = find_domain(dev);
2451
2452         if (!found) {
2453                 struct device_domain_info *info2;
2454                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2455                 if (info2) {
2456                         found      = info2->domain;
2457                         info2->dev = dev;
2458                 }
2459         }
2460
2461         if (found) {
2462                 spin_unlock_irqrestore(&device_domain_lock, flags);
2463                 free_devinfo_mem(info);
2464                 /* Caller must free the original domain */
2465                 return found;
2466         }
2467
2468         spin_lock(&iommu->lock);
2469         ret = domain_attach_iommu(domain, iommu);
2470         spin_unlock(&iommu->lock);
2471
2472         if (ret) {
2473                 spin_unlock_irqrestore(&device_domain_lock, flags);
2474                 free_devinfo_mem(info);
2475                 return NULL;
2476         }
2477
2478         list_add(&info->link, &domain->devices);
2479         list_add(&info->global, &device_domain_list);
2480         if (dev)
2481                 dev->archdata.iommu = info;
2482         spin_unlock_irqrestore(&device_domain_lock, flags);
2483
2484         /* PASID table is mandatory for a PCI device in scalable mode. */
2485         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2486                 ret = intel_pasid_alloc_table(dev);
2487                 if (ret) {
2488                         dev_err(dev, "PASID table allocation failed\n");
2489                         dmar_remove_one_dev_info(dev);
2490                         return NULL;
2491                 }
2492
2493                 /* Setup the PASID entry for requests without PASID: */
2494                 spin_lock(&iommu->lock);
2495                 if (hw_pass_through && domain_type_is_si(domain))
2496                         ret = intel_pasid_setup_pass_through(iommu, domain,
2497                                         dev, PASID_RID2PASID);
2498                 else
2499                         ret = intel_pasid_setup_second_level(iommu, domain,
2500                                         dev, PASID_RID2PASID);
2501                 spin_unlock(&iommu->lock);
2502                 if (ret) {
2503                         dev_err(dev, "Setup RID2PASID failed\n");
2504                         dmar_remove_one_dev_info(dev);
2505                         return NULL;
2506                 }
2507         }
2508
2509         if (dev && domain_context_mapping(domain, dev)) {
2510                 dev_err(dev, "Domain context map failed\n");
2511                 dmar_remove_one_dev_info(dev);
2512                 return NULL;
2513         }
2514
2515         return domain;
2516 }
2517
2518 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2519 {
2520         *(u16 *)opaque = alias;
2521         return 0;
2522 }
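/*
 * Usage sketch (as in find_or_alloc_domain() below): walking the DMA
 * aliases with this callback leaves the last alias visited, typically the
 * one generated closest to the root bus, in the u16 passed via opaque:
 *
 *      u16 dma_alias;
 *      pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
 */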
2523
2524 static int domain_init(struct dmar_domain *domain, int guest_width)
2525 {
2526         int adjust_width;
2527
2528         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
2529         domain_reserve_special_ranges(domain);
2530
2531         /* calculate AGAW */
2532         domain->gaw = guest_width;
2533         adjust_width = guestwidth_to_adjustwidth(guest_width);
2534         domain->agaw = width_to_agaw(adjust_width);
2535
2536         domain->iommu_coherency = 0;
2537         domain->iommu_snooping = 0;
2538         domain->iommu_superpage = 0;
2539         domain->max_addr = 0;
2540
2541         /* always allocate the top pgd */
2542         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
2543         if (!domain->pgd)
2544                 return -ENOMEM;
2545         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2546         return 0;
2547 }
2548
2549 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2550 {
2551         struct device_domain_info *info;
2552         struct dmar_domain *domain = NULL;
2553         struct intel_iommu *iommu;
2554         u16 dma_alias;
2555         unsigned long flags;
2556         u8 bus, devfn;
2557
2558         iommu = device_to_iommu(dev, &bus, &devfn);
2559         if (!iommu)
2560                 return NULL;
2561
2562         if (dev_is_pci(dev)) {
2563                 struct pci_dev *pdev = to_pci_dev(dev);
2564
2565                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2566
2567                 spin_lock_irqsave(&device_domain_lock, flags);
2568                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2569                                                       PCI_BUS_NUM(dma_alias),
2570                                                       dma_alias & 0xff);
2571                 if (info) {
2572                         iommu = info->iommu;
2573                         domain = info->domain;
2574                 }
2575                 spin_unlock_irqrestore(&device_domain_lock, flags);
2576
2577                 /* DMA alias already has a domain, use it */
2578                 if (info)
2579                         goto out;
2580         }
2581
2582         /* Allocate and initialize new domain for the device */
2583         domain = alloc_domain(0);
2584         if (!domain)
2585                 return NULL;
2586
2587         if (domain_init(domain, gaw)) {
2588                 domain_exit(domain);
2589                 return NULL;
2590         }
2591
2592         if (init_iova_flush_queue(&domain->iovad,
2593                                   iommu_flush_iova,
2594                                   iova_entry_free)) {
2595                 pr_warn("iova flush queue initialization failed\n");
2596                 intel_iommu_strict = 1;
2597         }
2598
2599 out:
2600         return domain;
2601 }
2602
2603 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2604                                               struct dmar_domain *domain)
2605 {
2606         struct intel_iommu *iommu;
2607         struct dmar_domain *tmp;
2608         u16 req_id, dma_alias;
2609         u8 bus, devfn;
2610
2611         iommu = device_to_iommu(dev, &bus, &devfn);
2612         if (!iommu)
2613                 return NULL;
2614
2615         req_id = ((u16)bus << 8) | devfn;
2616
2617         if (dev_is_pci(dev)) {
2618                 struct pci_dev *pdev = to_pci_dev(dev);
2619
2620                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2621
2622                 /* register PCI DMA alias device */
2623                 if (req_id != dma_alias) {
2624                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2625                                         dma_alias & 0xff, NULL, domain);
2626
2627                         if (!tmp || tmp != domain)
2628                                 return tmp;
2629                 }
2630         }
2631
2632         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2633         if (!tmp || tmp != domain)
2634                 return tmp;
2635
2636         return domain;
2637 }
2638
2639 static int iommu_domain_identity_map(struct dmar_domain *domain,
2640                                      unsigned long long start,
2641                                      unsigned long long end)
2642 {
2643         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2644         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2645
2646         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2647                           dma_to_mm_pfn(last_vpfn))) {
2648                 pr_err("Reserving iova failed\n");
2649                 return -ENOMEM;
2650         }
2651
2652         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2653         /*
2654          * RMRR range might have overlap with physical memory range,
2655          * clear it first
2656          */
2657         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2658
2659         return __domain_mapping(domain, first_vpfn, NULL,
2660                                 first_vpfn, last_vpfn - first_vpfn + 1,
2661                                 DMA_PTE_READ|DMA_PTE_WRITE);
2662 }
2663
2664 static int domain_prepare_identity_map(struct device *dev,
2665                                        struct dmar_domain *domain,
2666                                        unsigned long long start,
2667                                        unsigned long long end)
2668 {
2669         /* For _hardware_ passthrough, don't bother. But for software
2670            passthrough, we do it anyway -- it may indicate a memory
2671            range which is reserved in E820 and therefore didn't get set
2672            up in si_domain to start with */
2673         if (domain == si_domain && hw_pass_through) {
2674                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2675                          start, end);
2676                 return 0;
2677         }
2678
2679         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2680
2681         if (end < start) {
2682                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2683                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2684                         dmi_get_system_info(DMI_BIOS_VENDOR),
2685                         dmi_get_system_info(DMI_BIOS_VERSION),
2686                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2687                 return -EIO;
2688         }
2689
2690         if (end >> agaw_to_width(domain->agaw)) {
2691                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2692                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2693                      agaw_to_width(domain->agaw),
2694                      dmi_get_system_info(DMI_BIOS_VENDOR),
2695                      dmi_get_system_info(DMI_BIOS_VERSION),
2696                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2697                 return -EIO;
2698         }
2699
2700         return iommu_domain_identity_map(domain, start, end);
2701 }
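/*
 * The width check above rejects RMRRs beyond the domain's address width.
 * For example, with a 39-bit AGAW the domain can map up to 2^39 - 1
 * (0x7fffffffff); an RMRR ending at 0x8000000000 gives end >> 39 == 1 and
 * is refused with -EIO rather than being silently truncated.
 */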
2702
2703 static int __init si_domain_init(int hw)
2704 {
2705         struct dmar_rmrr_unit *rmrr;
2706         struct device *dev;
2707         int i, nid, ret;
2708
2709         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2710         if (!si_domain)
2711                 return -EFAULT;
2712
2713         if (domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2714                 domain_exit(si_domain);
2715                 return -EFAULT;
2716         }
2717
2718         if (hw)
2719                 return 0;
2720
2721         for_each_online_node(nid) {
2722                 unsigned long start_pfn, end_pfn;
2723                 int i;
2724
2725                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2726                         ret = iommu_domain_identity_map(si_domain,
2727                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2728                         if (ret)
2729                                 return ret;
2730                 }
2731         }
2732
2733         /*
2734          * Normally we use DMA domains for devices which have RMRRs. But we
2735          * relax this requirement for graphics and USB devices. Identity map
2736          * the RMRRs for graphics and USB devices so that they can use the
2737          * si_domain.
2738          */
2739         for_each_rmrr_units(rmrr) {
2740                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2741                                           i, dev) {
2742                         unsigned long long start = rmrr->base_address;
2743                         unsigned long long end = rmrr->end_address;
2744
2745                         if (device_is_rmrr_locked(dev))
2746                                 continue;
2747
2748                         if (WARN_ON(end < start ||
2749                                     end >> agaw_to_width(si_domain->agaw)))
2750                                 continue;
2751
2752                         ret = iommu_domain_identity_map(si_domain, start, end);
2753                         if (ret)
2754                                 return ret;
2755                 }
2756         }
2757
2758         return 0;
2759 }
2760
2761 static int identity_mapping(struct device *dev)
2762 {
2763         struct device_domain_info *info;
2764
2765         info = dev->archdata.iommu;
2766         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2767                 return (info->domain == si_domain);
2768
2769         return 0;
2770 }
2771
2772 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2773 {
2774         struct dmar_domain *ndomain;
2775         struct intel_iommu *iommu;
2776         u8 bus, devfn;
2777
2778         iommu = device_to_iommu(dev, &bus, &devfn);
2779         if (!iommu)
2780                 return -ENODEV;
2781
2782         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2783         if (ndomain != domain)
2784                 return -EBUSY;
2785
2786         return 0;
2787 }
2788
2789 static bool device_has_rmrr(struct device *dev)
2790 {
2791         struct dmar_rmrr_unit *rmrr;
2792         struct device *tmp;
2793         int i;
2794
2795         rcu_read_lock();
2796         for_each_rmrr_units(rmrr) {
2797                 /*
2798                  * Return TRUE if this RMRR contains the device that
2799                  * is passed in.
2800                  */
2801                 for_each_active_dev_scope(rmrr->devices,
2802                                           rmrr->devices_cnt, i, tmp)
2803                         if (tmp == dev ||
2804                             is_downstream_to_pci_bridge(dev, tmp)) {
2805                                 rcu_read_unlock();
2806                                 return true;
2807                         }
2808         }
2809         rcu_read_unlock();
2810         return false;
2811 }
2812
2813 /**
2814  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2815  * is relaxable (i.e. is allowed to go unenforced under some conditions)
2816  * @dev: device handle
2817  *
2818  * We assume that PCI USB devices with RMRRs have them largely
2819  * for historical reasons and that the RMRR space is not actively used post
2820  * boot.  This exclusion may change if vendors begin to abuse it.
2821  *
2822  * The same exception is made for graphics devices, with the requirement that
2823  * any use of the RMRR regions will be torn down before assigning the device
2824  * to a guest.
2825  *
2826  * Return: true if the RMRR is relaxable, false otherwise
2827  */
2828 static bool device_rmrr_is_relaxable(struct device *dev)
2829 {
2830         struct pci_dev *pdev;
2831
2832         if (!dev_is_pci(dev))
2833                 return false;
2834
2835         pdev = to_pci_dev(dev);
2836         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2837                 return true;
2838         else
2839                 return false;
2840 }
2841
2842 /*
2843  * There are a couple of cases where we need to restrict the functionality of
2844  * devices associated with RMRRs.  The first is when evaluating a device for
2845  * identity mapping because problems exist when devices are moved in and out
2846  * of domains and their respective RMRR information is lost.  This means that
2847  * a device with associated RMRRs will never be in a "passthrough" domain.
2848  * The second is use of the device through the IOMMU API.  This interface
2849  * expects to have full control of the IOVA space for the device.  We cannot
2850  * satisfy both the requirement that RMRR access is maintained and have an
2851  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2852  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2853  * We therefore prevent devices associated with an RMRR from participating in
2854  * the IOMMU API, which eliminates them from device assignment.
2855  *
2856  * In both cases, devices which have relaxable RMRRs are not concerned by this
2857  * restriction. See device_rmrr_is_relaxable comment.
2858  */
2859 static bool device_is_rmrr_locked(struct device *dev)
2860 {
2861         if (!device_has_rmrr(dev))
2862                 return false;
2863
2864         if (device_rmrr_is_relaxable(dev))
2865                 return false;
2866
2867         return true;
2868 }
2869
2870 /*
2871  * Return the required default domain type for a specific device.
2872  *
2873  * @dev: the device being queried
2874  *
2875  * Returns:
2876  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2877  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping
2878  *    domain
2879  *  - 0: both identity and dynamic domains work for this device
2880  */
2881 static int device_def_domain_type(struct device *dev)
2882 {
2883         if (dev_is_pci(dev)) {
2884                 struct pci_dev *pdev = to_pci_dev(dev);
2885
2886                 if (device_is_rmrr_locked(dev))
2887                         return IOMMU_DOMAIN_DMA;
2888
2889                 /*
2890                  * Prevent any device marked as untrusted from getting
2891                  * placed into the static identity mapping domain.
2892                  */
2893                 if (pdev->untrusted)
2894                         return IOMMU_DOMAIN_DMA;
2895
2896                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2897                         return IOMMU_DOMAIN_IDENTITY;
2898
2899                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2900                         return IOMMU_DOMAIN_IDENTITY;
2901
2902                 /*
2903                  * We want to start off with all devices in the 1:1 domain, and
2904                  * take them out later if we find they can't access all of memory.
2905                  *
2906                  * However, we can't do this for PCI devices behind bridges,
2907                  * because all PCI devices behind the same bridge will end up
2908                  * with the same source-id on their transactions.
2909                  *
2910                  * Practically speaking, we can't change things around for these
2911                  * devices at run-time, because we can't be sure there'll be no
2912                  * DMA transactions in flight for any of their siblings.
2913                  *
2914                  * So PCI devices (unless they're on the root bus) as well as
2915                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2916                  * the 1:1 domain, just in _case_ one of their siblings turns out
2917                  * not to be able to map all of memory.
2918                  */
2919                 if (!pci_is_pcie(pdev)) {
2920                         if (!pci_is_root_bus(pdev->bus))
2921                                 return IOMMU_DOMAIN_DMA;
2922                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2923                                 return IOMMU_DOMAIN_DMA;
2924                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2925                         return IOMMU_DOMAIN_DMA;
2926         } else {
2927                 if (device_has_rmrr(dev))
2928                         return IOMMU_DOMAIN_DMA;
2929         }
2930
2931         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2932                         IOMMU_DOMAIN_IDENTITY : 0;
2933 }
2934
2935 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2936 {
2937         /*
2938          * Start from a sane IOMMU hardware state.
2939          * If queued invalidation was already initialized by us
2940          * (for example, while enabling interrupt remapping) then
2941          * things are already rolling from a sane state.
2942          */
2943         if (!iommu->qi) {
2944                 /*
2945                  * Clear any previous faults.
2946                  */
2947                 dmar_fault(-1, iommu);
2948                 /*
2949                  * Disable queued invalidation if supported and already enabled
2950                  * before OS handover.
2951                  */
2952                 dmar_disable_qi(iommu);
2953         }
2954
2955         if (dmar_enable_qi(iommu)) {
2956                 /*
2957                  * Queued Invalidate not enabled, use Register Based Invalidate
2958                  */
2959                 iommu->flush.flush_context = __iommu_flush_context;
2960                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2961                 pr_info("%s: Using Register based invalidation\n",
2962                         iommu->name);
2963         } else {
2964                 iommu->flush.flush_context = qi_flush_context;
2965                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2966                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2967         }
2968 }
2969
2970 static int copy_context_table(struct intel_iommu *iommu,
2971                               struct root_entry *old_re,
2972                               struct context_entry **tbl,
2973                               int bus, bool ext)
2974 {
2975         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2976         struct context_entry *new_ce = NULL, ce;
2977         struct context_entry *old_ce = NULL;
2978         struct root_entry re;
2979         phys_addr_t old_ce_phys;
2980
2981         tbl_idx = ext ? bus * 2 : bus;
2982         memcpy(&re, old_re, sizeof(re));
2983
2984         for (devfn = 0; devfn < 256; devfn++) {
2985                 /* First calculate the correct index */
2986                 idx = (ext ? devfn * 2 : devfn) % 256;
2987
2988                 if (idx == 0) {
2989                         /* First save what we may have and clean up */
2990                         if (new_ce) {
2991                                 tbl[tbl_idx] = new_ce;
2992                                 __iommu_flush_cache(iommu, new_ce,
2993                                                     VTD_PAGE_SIZE);
2994                                 pos = 1;
2995                         }
2996
2997                         if (old_ce)
2998                                 memunmap(old_ce);
2999
3000                         ret = 0;
3001                         if (devfn < 0x80)
3002                                 old_ce_phys = root_entry_lctp(&re);
3003                         else
3004                                 old_ce_phys = root_entry_uctp(&re);
3005
3006                         if (!old_ce_phys) {
3007                                 if (ext && devfn == 0) {
3008                                         /* No LCTP, try UCTP */
3009                                         devfn = 0x7f;
3010                                         continue;
3011                                 } else {
3012                                         goto out;
3013                                 }
3014                         }
3015
3016                         ret = -ENOMEM;
3017                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3018                                         MEMREMAP_WB);
3019                         if (!old_ce)
3020                                 goto out;
3021
3022                         new_ce = alloc_pgtable_page(iommu->node);
3023                         if (!new_ce)
3024                                 goto out_unmap;
3025
3026                         ret = 0;
3027                 }
3028
3029                 /* Now copy the context entry */
3030                 memcpy(&ce, old_ce + idx, sizeof(ce));
3031
3032                 if (!__context_present(&ce))
3033                         continue;
3034
3035                 did = context_domain_id(&ce);
3036                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3037                         set_bit(did, iommu->domain_ids);
3038
3039                 /*
3040                  * We need a marker for copied context entries. This
3041                  * marker needs to work for the old format as well as
3042                  * for extended context entries.
3043                  *
3044                  * Bit 67 of the context entry is used. In the old
3045                  * format this bit is available to software, in the
3046                  * extended format it is the PGE bit, but PGE is ignored
3047                  * by HW if PASIDs are disabled (and thus still
3048                  * available).
3049                  *
3050                  * So disable PASIDs first and then mark the entry
3051                  * copied. This means that we don't copy PASID
3052                  * translations from the old kernel, but this is fine as
3053                  * faults there are not fatal.
3054                  */
3055                 context_clear_pasid_enable(&ce);
3056                 context_set_copied(&ce);
3057
3058                 new_ce[idx] = ce;
3059         }
3060
3061         tbl[tbl_idx + pos] = new_ce;
3062
3063         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3064
3065 out_unmap:
3066         memunmap(old_ce);
3067
3068 out:
3069         return ret;
3070 }
3071
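/*
 * Editor's note: illustrative sketch only, not driver code.  The "copied"
 * marker described in copy_context_table() above is bit 67 of the 128-bit
 * context entry, i.e. bit (67 - 64) = 3 of the upper 64-bit word.  A
 * minimal model of setting and testing such a marker:
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

struct demo_context_entry {
	uint64_t lo;
	uint64_t hi;
};

static void demo_set_copied(struct demo_context_entry *ce)
{
	ce->hi |= (1ULL << (67 - 64));		/* mark the entry as copied */
}

static bool demo_is_copied(const struct demo_context_entry *ce)
{
	return ce->hi & (1ULL << (67 - 64));
}
#endif
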
3072 static int copy_translation_tables(struct intel_iommu *iommu)
3073 {
3074         struct context_entry **ctxt_tbls;
3075         struct root_entry *old_rt;
3076         phys_addr_t old_rt_phys;
3077         int ctxt_table_entries;
3078         unsigned long flags;
3079         u64 rtaddr_reg;
3080         int bus, ret;
3081         bool new_ext, ext;
3082
3083         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3084         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3085         new_ext    = !!ecap_ecs(iommu->ecap);
3086
3087         /*
3088          * The RTT bit can only be changed when translation is disabled,
3089          * but disabling translation would open a window for data
3090          * corruption. So bail out and don't copy anything if we would
3091          * have to change the bit.
3092          */
3093         if (new_ext != ext)
3094                 return -EINVAL;
3095
3096         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3097         if (!old_rt_phys)
3098                 return -EINVAL;
3099
3100         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3101         if (!old_rt)
3102                 return -ENOMEM;
3103
3104         /* This is too big for the stack - allocate it from slab */
3105         ctxt_table_entries = ext ? 512 : 256;
3106         ret = -ENOMEM;
3107         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3108         if (!ctxt_tbls)
3109                 goto out_unmap;
3110
3111         for (bus = 0; bus < 256; bus++) {
3112                 ret = copy_context_table(iommu, &old_rt[bus],
3113                                          ctxt_tbls, bus, ext);
3114                 if (ret) {
3115                         pr_err("%s: Failed to copy context table for bus %d\n",
3116                                 iommu->name, bus);
3117                         continue;
3118                 }
3119         }
3120
3121         spin_lock_irqsave(&iommu->lock, flags);
3122
3123         /* Context tables are copied, now write them to the root_entry table */
3124         for (bus = 0; bus < 256; bus++) {
3125                 int idx = ext ? bus * 2 : bus;
3126                 u64 val;
3127
3128                 if (ctxt_tbls[idx]) {
3129                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3130                         iommu->root_entry[bus].lo = val;
3131                 }
3132
3133                 if (!ext || !ctxt_tbls[idx + 1])
3134                         continue;
3135
3136                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3137                 iommu->root_entry[bus].hi = val;
3138         }
3139
3140         spin_unlock_irqrestore(&iommu->lock, flags);
3141
3142         kfree(ctxt_tbls);
3143
3144         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3145
3146         ret = 0;
3147
3148 out_unmap:
3149         memunmap(old_rt);
3150
3151         return ret;
3152 }
3153
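/*
 * Editor's note: illustrative sketch only, not driver code.  The loop
 * above programs each root entry as the physical address of a page-aligned
 * context table with bit 0 set as the present flag.  Because the table is
 * 4 KiB aligned, the low 12 bits are free for flags and the address can be
 * recovered by masking them off again:
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_MASK	(~0xfffULL)

static void demo_root_entry_pack(void)
{
	uint64_t ctx_table_phys = 0x12345000ULL;	/* hypothetical, page aligned */
	uint64_t root_lo = ctx_table_phys | 1;		/* address | present bit */

	printf("present : %llu\n", (unsigned long long)(root_lo & 1));
	printf("address : %#llx\n",
	       (unsigned long long)(root_lo & DEMO_PAGE_MASK));
}
#endif
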
3154 static int __init init_dmars(void)
3155 {
3156         struct dmar_drhd_unit *drhd;
3157         struct intel_iommu *iommu;
3158         int ret;
3159
3160         /*
3161          * for each drhd
3162          *    allocate root
3163          *    initialize and program root entry to not present
3164          * endfor
3165          */
3166         for_each_drhd_unit(drhd) {
3167                 /*
3168                  * No lock needed: this is only incremented in the single-
3169                  * threaded kernel __init code path; all other accesses are
3170                  * read-only.
3171                  */
3172                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3173                         g_num_of_iommus++;
3174                         continue;
3175                 }
3176                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3177         }
3178
3179         /* Preallocate enough resources for IOMMU hot-addition */
3180         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3181                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3182
3183         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3184                         GFP_KERNEL);
3185         if (!g_iommus) {
3186                 pr_err("Allocating global iommu array failed\n");
3187                 ret = -ENOMEM;
3188                 goto error;
3189         }
3190
3191         for_each_iommu(iommu, drhd) {
3192                 if (drhd->ignored) {
3193                         iommu_disable_translation(iommu);
3194                         continue;
3195                 }
3196
3197                 /*
3198                  * Find the maximum PASID size supported by each IOMMU in the
3199                  * system; the system-wide PASID table must be no bigger than
3200                  * the smallest of these.
3201                  */
3202                 if (pasid_supported(iommu)) {
3203                         u32 temp = 2 << ecap_pss(iommu->ecap);
3204
3205                         intel_pasid_max_id = min_t(u32, temp,
3206                                                    intel_pasid_max_id);
3207                 }
3208
3209                 g_iommus[iommu->seq_id] = iommu;
3210
3211                 intel_iommu_init_qi(iommu);
3212
3213                 ret = iommu_init_domains(iommu);
3214                 if (ret)
3215                         goto free_iommu;
3216
3217                 init_translation_status(iommu);
3218
3219                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3220                         iommu_disable_translation(iommu);
3221                         clear_translation_pre_enabled(iommu);
3222                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3223                                 iommu->name);
3224                 }
3225
3226                 /*
3227                  * TBD:
3228                  * we could share the same root & context tables
3229                  * among all IOMMUs. Need to split this out later.
3230                  */
3231                 ret = iommu_alloc_root_entry(iommu);
3232                 if (ret)
3233                         goto free_iommu;
3234
3235                 if (translation_pre_enabled(iommu)) {
3236                         pr_info("Translation already enabled - trying to copy translation structures\n");
3237
3238                         ret = copy_translation_tables(iommu);
3239                         if (ret) {
3240                                 /*
3241                                  * We found the IOMMU with translation
3242                                  * enabled - but failed to copy over the
3243                                  * old root-entry table. Try to proceed
3244                                  * by disabling translation now and
3245                                  * allocating a clean root-entry table.
3246                                  * This might cause DMAR faults, but
3247                                  * probably the dump will still succeed.
3248                                  */
3249                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3250                                        iommu->name);
3251                                 iommu_disable_translation(iommu);
3252                                 clear_translation_pre_enabled(iommu);
3253                         } else {
3254                                 pr_info("Copied translation tables from previous kernel for %s\n",
3255                                         iommu->name);
3256                         }
3257                 }
3258
3259                 if (!ecap_pass_through(iommu->ecap))
3260                         hw_pass_through = 0;
3261 #ifdef CONFIG_INTEL_IOMMU_SVM
3262                 if (pasid_supported(iommu))
3263                         intel_svm_init(iommu);
3264 #endif
3265         }
3266
3267         /*
3268          * Now that qi is enabled on all iommus, set the root entry and flush
3269          * caches. This is required on some Intel X58 chipsets, otherwise the
3270          * flush_context function will loop forever and the boot hangs.
3271          */
3272         for_each_active_iommu(iommu, drhd) {
3273                 iommu_flush_write_buffer(iommu);
3274                 iommu_set_root_entry(iommu);
3275                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3276                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3277         }
3278
3279         if (iommu_pass_through)
3280                 iommu_identity_mapping |= IDENTMAP_ALL;
3281
3282 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3283         dmar_map_gfx = 0;
3284 #endif
3285
3286         if (!dmar_map_gfx)
3287                 iommu_identity_mapping |= IDENTMAP_GFX;
3288
3289         check_tylersburg_isoch();
3290
3291         ret = si_domain_init(hw_pass_through);
3292         if (ret)
3293                 goto free_iommu;
3294
3295         /*
3296          * for each drhd
3297          *   enable fault log
3298          *   global invalidate context cache
3299          *   global invalidate iotlb
3300          *   enable translation
3301          */
3302         for_each_iommu(iommu, drhd) {
3303                 if (drhd->ignored) {
3304                         /*
3305                          * we always have to disable PMRs or DMA may fail on
3306                          * this device
3307                          */
3308                         if (force_on)
3309                                 iommu_disable_protect_mem_regions(iommu);
3310                         continue;
3311                 }
3312
3313                 iommu_flush_write_buffer(iommu);
3314
3315 #ifdef CONFIG_INTEL_IOMMU_SVM
3316                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3317                         /*
3318                          * Calling dmar_alloc_hwirq() with dmar_global_lock held
3319                          * could cause a lock race, so drop the lock around it.
3320                          */
3321                         up_write(&dmar_global_lock);
3322                         ret = intel_svm_enable_prq(iommu);
3323                         down_write(&dmar_global_lock);
3324                         if (ret)
3325                                 goto free_iommu;
3326                 }
3327 #endif
3328                 ret = dmar_set_interrupt(iommu);
3329                 if (ret)
3330                         goto free_iommu;
3331         }
3332
3333         return 0;
3334
3335 free_iommu:
3336         for_each_active_iommu(iommu, drhd) {
3337                 disable_dmar_iommu(iommu);
3338                 free_dmar_iommu(iommu);
3339         }
3340
3341         kfree(g_iommus);
3342
3343 error:
3344         return ret;
3345 }
3346
3347 /* This takes a number of _MM_ pages, not VTD pages */
3348 static unsigned long intel_alloc_iova(struct device *dev,
3349                                      struct dmar_domain *domain,
3350                                      unsigned long nrpages, uint64_t dma_mask)
3351 {
3352         unsigned long iova_pfn;
3353
3354         /* Restrict dma_mask to the width that the iommu can handle */
3355         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3356         /* Ensure we reserve the whole size-aligned region */
3357         nrpages = __roundup_pow_of_two(nrpages);
3358
3359         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3360                 /*
3361                  * First try to allocate an IO virtual address within
3362                  * DMA_BIT_MASK(32); if that fails, then try allocating
3363                  * from the higher range.
3364                  */
3365                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3366                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3367                 if (iova_pfn)
3368                         return iova_pfn;
3369         }
3370         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3371                                    IOVA_PFN(dma_mask), true);
3372         if (unlikely(!iova_pfn)) {
3373                 dev_err(dev, "Allocating %ld-page iova failed\n", nrpages);
3374                 return 0;
3375         }
3376
3377         return iova_pfn;
3378 }
3379
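/*
 * Editor's note: illustrative sketch only, not driver code, assuming
 * 4 KiB pages.  intel_alloc_iova() above rounds the request up to a power
 * of two and, unless forcedac is set, first tries to place it below the
 * 4 GiB boundary (PFN limit DMA_BIT_MASK(32) >> PAGE_SHIFT).
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static unsigned long demo_roundup_pow_of_two(unsigned long n)
{
	unsigned long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

static void demo_iova_request(void)
{
	unsigned long nrpages = 13;			/* hypothetical request */
	uint64_t limit_4g_pfn = 0xffffffffULL >> 12;	/* DMA_BIT_MASK(32) >> PAGE_SHIFT */

	printf("rounded size : %lu pages\n", demo_roundup_pow_of_two(nrpages));	/* 16 */
	printf("4 GiB pfn cap: %#llx\n", (unsigned long long)limit_4g_pfn);	/* 0xfffff */
}
#endif
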
3380 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3381 {
3382         struct dmar_domain *domain, *tmp;
3383         struct dmar_rmrr_unit *rmrr;
3384         struct device *i_dev;
3385         int i, ret;
3386
3387         /* The device should not already be attached to any domain. */
3388         domain = find_domain(dev);
3389         if (domain)
3390                 return NULL;
3391
3392         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3393         if (!domain)
3394                 goto out;
3395
3396         /* We have a new domain - set up possible RMRRs for the device */
3397         rcu_read_lock();
3398         for_each_rmrr_units(rmrr) {
3399                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3400                                           i, i_dev) {
3401                         if (i_dev != dev)
3402                                 continue;
3403
3404                         ret = domain_prepare_identity_map(dev, domain,
3405                                                           rmrr->base_address,
3406                                                           rmrr->end_address);
3407                         if (ret)
3408                                 dev_err(dev, "Mapping reserved region failed\n");
3409                 }
3410         }
3411         rcu_read_unlock();
3412
3413         tmp = set_domain_for_dev(dev, domain);
3414         if (!tmp || domain != tmp) {
3415                 domain_exit(domain);
3416                 domain = tmp;
3417         }
3418
3419 out:
3420         if (!domain)
3421                 dev_err(dev, "Allocating domain failed\n");
3422         else
3423                 domain->domain.type = IOMMU_DOMAIN_DMA;
3424
3425         return domain;
3426 }
3427
3428 /* Check if the dev needs to go through the non-identity map and unmap process. */
3429 static bool iommu_need_mapping(struct device *dev)
3430 {
3431         int ret;
3432
3433         if (iommu_dummy(dev))
3434                 return false;
3435
3436         ret = identity_mapping(dev);
3437         if (ret) {
3438                 u64 dma_mask = *dev->dma_mask;
3439
3440                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3441                         dma_mask = dev->coherent_dma_mask;
3442
3443                 if (dma_mask >= dma_get_required_mask(dev))
3444                         return false;
3445
3446                 /*
3447                  * 32-bit DMA devices are removed from si_domain and fall
3448                  * back to non-identity mapping.
3449                  */
3450                 dmar_remove_one_dev_info(dev);
3451                 ret = iommu_request_dma_domain_for_dev(dev);
3452                 if (ret) {
3453                         struct iommu_domain *domain;
3454                         struct dmar_domain *dmar_domain;
3455
3456                         domain = iommu_get_domain_for_dev(dev);
3457                         if (domain) {
3458                                 dmar_domain = to_dmar_domain(domain);
3459                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3460                         }
3461                         get_private_domain_for_dev(dev);
3462                 }
3463
3464                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3465         }
3466
3467         return true;
3468 }
3469
3470 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3471                                      size_t size, int dir, u64 dma_mask)
3472 {
3473         struct dmar_domain *domain;
3474         phys_addr_t start_paddr;
3475         unsigned long iova_pfn;
3476         int prot = 0;
3477         int ret;
3478         struct intel_iommu *iommu;
3479         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3480
3481         BUG_ON(dir == DMA_NONE);
3482
3483         domain = find_domain(dev);
3484         if (!domain)
3485                 return DMA_MAPPING_ERROR;
3486
3487         iommu = domain_get_iommu(domain);
3488         size = aligned_nrpages(paddr, size);
3489
3490         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3491         if (!iova_pfn)
3492                 goto error;
3493
3494         /*
3495          * Check if DMAR supports zero-length reads on write-only
3496          * mappings.
3497          */
3498         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3499                         !cap_zlr(iommu->cap))
3500                 prot |= DMA_PTE_READ;
3501         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3502                 prot |= DMA_PTE_WRITE;
3503         /*
3504          * paddr to (paddr + size) might cover only part of a page; we should
3505          * map the whole page.  Note: if two parts of one page are mapped
3506          * separately, we might have two guest addresses mapping to the same
3507          * host paddr, but this is not a big problem.
3508          */
3509         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3510                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3511         if (ret)
3512                 goto error;
3513
3514         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3515         start_paddr += paddr & ~PAGE_MASK;
3516         return start_paddr;
3517
3518 error:
3519         if (iova_pfn)
3520                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3521         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3522                 size, (unsigned long long)paddr, dir);
3523         return DMA_MAPPING_ERROR;
3524 }
3525
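/*
 * Editor's note: illustrative sketch only, not driver code, assuming
 * 4 KiB pages.  The DMA handle returned by __intel_map_single() above is
 * the allocated IOVA page frame shifted up by PAGE_SHIFT plus the offset
 * of paddr within its page.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void demo_dma_handle(void)
{
	uint64_t paddr    = 0x1234567ULL;	/* hypothetical physical address */
	uint64_t iova_pfn = 0xfe000ULL;		/* hypothetical IOVA page frame */

	/* start of the IOVA region plus the sub-page offset of paddr */
	uint64_t handle = (iova_pfn << 12) + (paddr & 0xfffULL);

	printf("dma handle: %#llx\n", (unsigned long long)handle);	/* 0xfe000567 */
}
#endif
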
3526 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3527                                  unsigned long offset, size_t size,
3528                                  enum dma_data_direction dir,
3529                                  unsigned long attrs)
3530 {
3531         if (iommu_need_mapping(dev))
3532                 return __intel_map_single(dev, page_to_phys(page) + offset,
3533                                 size, dir, *dev->dma_mask);
3534         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3535 }
3536
3537 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3538                                      size_t size, enum dma_data_direction dir,
3539                                      unsigned long attrs)
3540 {
3541         if (iommu_need_mapping(dev))
3542                 return __intel_map_single(dev, phys_addr, size, dir,
3543                                 *dev->dma_mask);
3544         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3545 }
3546
3547 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3548 {
3549         struct dmar_domain *domain;
3550         unsigned long start_pfn, last_pfn;
3551         unsigned long nrpages;
3552         unsigned long iova_pfn;
3553         struct intel_iommu *iommu;
3554         struct page *freelist;
3555         struct pci_dev *pdev = NULL;
3556
3557         domain = find_domain(dev);
3558         BUG_ON(!domain);
3559
3560         iommu = domain_get_iommu(domain);
3561
3562         iova_pfn = IOVA_PFN(dev_addr);
3563
3564         nrpages = aligned_nrpages(dev_addr, size);
3565         start_pfn = mm_to_dma_pfn(iova_pfn);
3566         last_pfn = start_pfn + nrpages - 1;
3567
3568         if (dev_is_pci(dev))
3569                 pdev = to_pci_dev(dev);
3570
3571         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3572
3573         freelist = domain_unmap(domain, start_pfn, last_pfn);
3574
3575         if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3576                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3577                                       nrpages, !freelist, 0);
3578                 /* free iova */
3579                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3580                 dma_free_pagelist(freelist);
3581         } else {
3582                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3583                            (unsigned long)freelist);
3584                 /*
3585                  * Queue up the release of the unmap to save roughly 1/6th of
3586                  * the CPU time otherwise used by the IOTLB flush operation.
3587                  */
3588         }
3589 }
3590
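/*
 * Editor's note: illustrative sketch only, not driver code.  A plausible
 * model (assuming 4 KiB pages) of the page-count computation used by
 * intel_unmap() above: the number of pages needed to cover
 * [addr, addr + size) once the sub-page offset of addr is included.
 */
#if 0
#include <stddef.h>
#include <stdio.h>

static unsigned long demo_nrpages(unsigned long addr, size_t size)
{
	unsigned long offset = addr & 0xfffUL;		/* offset within first page */

	return (offset + size + 0xfffUL) >> 12;		/* round up to whole pages */
}

static void demo_unmap_range(void)
{
	/* 8 KiB starting 0x100 bytes into a page spans three pages */
	printf("pages: %lu\n", demo_nrpages(0xfe000100UL, 8192));	/* 3 */
}
#endif
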
3591 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3592                              size_t size, enum dma_data_direction dir,
3593                              unsigned long attrs)
3594 {
3595         if (iommu_need_mapping(dev))
3596                 intel_unmap(dev, dev_addr, size);
3597         else
3598                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3599 }
3600
3601 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3602                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3603 {
3604         if (iommu_need_mapping(dev))
3605                 intel_unmap(dev, dev_addr, size);
3606 }
3607
3608 static void *intel_alloc_coherent(struct device *dev, size_t size,
3609                                   dma_addr_t *dma_handle, gfp_t flags,
3610                                   unsigned long attrs)
3611 {
3612         struct page *page = NULL;
3613         int order;
3614
3615         if (!iommu_need_mapping(dev))
3616                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3617
3618         size = PAGE_ALIGN(size);
3619         order = get_order(size);
3620
3621         if (gfpflags_allow_blocking(flags)) {
3622                 unsigned int count = size >> PAGE_SHIFT;
3623
3624                 page = dma_alloc_from_contiguous(dev, count, order,
3625                                                  flags & __GFP_NOWARN);
3626         }
3627
3628         if (!page)
3629                 page = alloc_pages(flags, order);
3630         if (!page)
3631                 return NULL;
3632         memset(page_address(page), 0, size);
3633
3634         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3635                                          DMA_BIDIRECTIONAL,
3636                                          dev->coherent_dma_mask);
3637         if (*dma_handle != DMA_MAPPING_ERROR)
3638                 return page_address(page);
3639         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3640                 __free_pages(page, order);
3641
3642         return NULL;
3643 }
3644
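/*
 * Editor's note: illustrative sketch only, not driver code, assuming
 * 4 KiB pages.  intel_alloc_coherent() above page-aligns the request and
 * converts it to an allocation order (ceil(log2(pages))), so a 10000 byte
 * request spans 3 pages and is served from an order-2 (4 page) allocation.
 */
#if 0
#include <stddef.h>
#include <stdio.h>

static unsigned int demo_get_order(size_t size)
{
	size_t pages = (size + 0xfffUL) >> 12;	/* round up to whole pages */
	unsigned int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

static void demo_coherent_sizing(void)
{
	printf("order for 10000 bytes: %u\n", demo_get_order(10000));	/* 2 */
}
#endif
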
3645 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3646                                 dma_addr_t dma_handle, unsigned long attrs)
3647 {
3648         int order;
3649         struct page *page = virt_to_page(vaddr);
3650
3651         if (!iommu_need_mapping(dev))
3652                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3653
3654         size = PAGE_ALIGN(size);
3655         order = get_order(size);
3656
3657         intel_unmap(dev, dma_handle, size);
3658         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3659                 __free_pages(page, order);
3660 }
3661
3662 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3663                            int nelems, enum dma_data_direction dir,
3664                            unsigned long attrs)
3665 {
3666         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3667         unsigned long nrpages = 0;
3668         struct scatterlist *sg;
3669         int i;
3670
3671         if (!iommu_need_mapping(dev))
3672                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3673
3674         for_each_sg(sglist, sg, nelems, i) {
3675                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3676         }
3677
3678         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3679 }
3680
3681 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3682                         enum dma_data_direction dir, unsigned long attrs)
3683 {
3684         int i;
3685         struct dmar_domain *domain;
3686         size_t size = 0;
3687         int prot = 0;
3688         unsigned long iova_pfn;
3689         int ret;
3690         struct scatterlist *sg;
3691         unsigned long start_vpfn;
3692         struct intel_iommu *iommu;
3693
3694         BUG_ON(dir == DMA_NONE);
3695         if (!iommu_need_mapping(dev))
3696                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3697
3698         domain = find_domain(dev);
3699         if (!domain)
3700                 return 0;
3701
3702         iommu = domain_get_iommu(domain);
3703
3704         for_each_sg(sglist, sg, nelems, i)
3705                 size += aligned_nrpages(sg->offset, sg->length);
3706
3707         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3708                                 *dev->dma_mask);
3709         if (!iova_pfn) {
3710                 sglist->dma_length = 0;
3711                 return 0;
3712         }
3713
3714         /*
3715          * Check if DMAR supports zero-length reads on write-only
3716          * mappings.
3717          */
3718         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3719                         !cap_zlr(iommu->cap))
3720                 prot |= DMA_PTE_READ;
3721         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3722                 prot |= DMA_PTE_WRITE;
3723
3724         start_vpfn = mm_to_dma_pfn(iova_pfn);
3725
3726         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3727         if (unlikely(ret)) {
3728                 dma_pte_free_pagetable(domain, start_vpfn,
3729                                        start_vpfn + size - 1,
3730                                        agaw_to_level(domain->agaw) + 1);
3731                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3732                 return 0;
3733         }
3734
3735         return nelems;
3736 }
3737
3738 static const struct dma_map_ops intel_dma_ops = {
3739         .alloc = intel_alloc_coherent,
3740         .free = intel_free_coherent,
3741         .map_sg = intel_map_sg,
3742         .unmap_sg = intel_unmap_sg,
3743         .map_page = intel_map_page,
3744         .unmap_page = intel_unmap_page,
3745         .map_resource = intel_map_resource,
3746         .unmap_resource = intel_unmap_resource,
3747         .dma_supported = dma_direct_supported,
3748 };
3749
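/*
 * Editor's note: illustrative sketch only, not part of this driver.
 * intel_dma_ops is not called directly; a device driver reaches it through
 * the generic DMA mapping API.  A hypothetical consumer might look roughly
 * like this (all names below are made up for illustration):
 */
#if 0
#include <linux/dma-mapping.h>
#include <linux/errno.h>

static int demo_send_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t handle;

	/* ends up in intel_map_page() when this device uses intel_dma_ops */
	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
	if (dma_mapping_error(dev, handle))
		return -ENOMEM;

	/* ... program the hypothetical device with 'handle' here ... */

	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
	return 0;
}
#endif
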
3750 static inline int iommu_domain_cache_init(void)
3751 {
3752         int ret = 0;
3753
3754         iommu_domain_cache = kmem_cache_create("iommu_domain",
3755                                          sizeof(struct dmar_domain),
3756                                          0,
3757                                          SLAB_HWCACHE_ALIGN,
3759                                          NULL);
3760         if (!iommu_domain_cache) {
3761                 pr_err("Couldn't create iommu_domain cache\n");
3762                 ret = -ENOMEM;
3763         }
3764
3765         return ret;
3766 }
3767
3768 static inline int iommu_devinfo_cache_init(void)
3769 {
3770         int ret = 0;
3771
3772         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3773                                          sizeof(struct device_domain_info),
3774                                          0,
3775                                          SLAB_HWCACHE_ALIGN,
3776                                          NULL);
3777         if (!iommu_devinfo_cache) {
3778                 pr_err("Couldn't create devinfo cache\n");
3779                 ret = -ENOMEM;
3780         }
3781
3782         return ret;
3783 }
3784
3785 static int __init iommu_init_mempool(void)
3786 {
3787         int ret;
3788         ret = iova_cache_get();
3789         if (ret)
3790                 return ret;
3791
3792         ret = iommu_domain_cache_init();
3793         if (ret)
3794                 goto domain_error;
3795
3796         ret = iommu_devinfo_cache_init();
3797         if (!ret)
3798                 return ret;
3799
3800         kmem_cache_destroy(iommu_domain_cache);
3801 domain_error:
3802         iova_cache_put();
3803
3804         return -ENOMEM;
3805 }
3806
3807 static void __init iommu_exit_mempool(void)
3808 {
3809         kmem_cache_destroy(iommu_devinfo_cache);
3810         kmem_cache_destroy(iommu_domain_cache);
3811         iova_cache_put();
3812 }
3813
3814 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3815 {
3816         struct dmar_drhd_unit *drhd;
3817         u32 vtbar;
3818         int rc;
3819
3820         /* We know that this device on this chipset has its own IOMMU.
3821          * If we find it under a different IOMMU, then the BIOS is lying
3822          * to us. Hope that the IOMMU for this device is actually
3823          * disabled, and it needs no translation...
3824          */
3825         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3826         if (rc) {
3827                 /* "can't" happen */
3828                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3829                 return;
3830         }
3831         vtbar &= 0xffff0000;
3832
3833         /* we know that this iommu should be at offset 0xa000 from vtbar */
3834         drhd = dmar_find_matched_drhd_unit(pdev);
3835         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3836                             TAINT_FIRMWARE_WORKAROUND,
3837                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3838                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3839 }
3840 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3841
3842 static void __init init_no_remapping_devices(void)
3843 {
3844         struct dmar_drhd_unit *drhd;
3845         struct device *dev;
3846         int i;
3847
3848         for_each_drhd_unit(drhd) {
3849                 if (!drhd->include_all) {
3850                         for_each_active_dev_scope(drhd->devices,
3851                                                   drhd->devices_cnt, i, dev)
3852                                 break;
3853                         /* ignore DMAR unit if no devices exist */
3854                         if (i == drhd->devices_cnt)
3855                                 drhd->ignored = 1;
3856                 }
3857         }
3858
3859         for_each_active_drhd_unit(drhd) {
3860                 if (drhd->include_all)
3861                         continue;
3862
3863                 for_each_active_dev_scope(drhd->devices,
3864                                           drhd->devices_cnt, i, dev)
3865                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3866                                 break;
3867                 if (i < drhd->devices_cnt)
3868                         continue;
3869
3870                 /* This IOMMU has *only* gfx devices. Either bypass it or
3871                    set the gfx_mapped flag, as appropriate */
3872                 if (!dmar_map_gfx) {
3873                         drhd->ignored = 1;
3874                         for_each_active_dev_scope(drhd->devices,
3875                                                   drhd->devices_cnt, i, dev)
3876                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3877                 }
3878         }
3879 }
3880
3881 #ifdef CONFIG_SUSPEND
3882 static int init_iommu_hw(void)
3883 {
3884         struct dmar_drhd_unit *drhd;
3885         struct intel_iommu *iommu = NULL;
3886
3887         for_each_active_iommu(iommu, drhd)
3888                 if (iommu->qi)
3889                         dmar_reenable_qi(iommu);
3890
3891         for_each_iommu(iommu, drhd) {
3892                 if (drhd->ignored) {
3893                         /*
3894                          * we always have to disable PMRs or DMA may fail on
3895                          * this device
3896                          */
3897                         if (force_on)
3898                                 iommu_disable_protect_mem_regions(iommu);
3899                         continue;
3900                 }
3901
3902                 iommu_flush_write_buffer(iommu);
3903
3904                 iommu_set_root_entry(iommu);
3905
3906                 iommu->flush.flush_context(iommu, 0, 0, 0,
3907                                            DMA_CCMD_GLOBAL_INVL);
3908                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3909                 iommu_enable_translation(iommu);
3910                 iommu_disable_protect_mem_regions(iommu);
3911         }
3912
3913         return 0;
3914 }
3915
3916 static void iommu_flush_all(void)
3917 {
3918         struct dmar_drhd_unit *drhd;
3919         struct intel_iommu *iommu;
3920
3921         for_each_active_iommu(iommu, drhd) {
3922                 iommu->flush.flush_context(iommu, 0, 0, 0,
3923                                            DMA_CCMD_GLOBAL_INVL);
3924                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3925                                          DMA_TLB_GLOBAL_FLUSH);
3926         }
3927 }
3928
3929 static int iommu_suspend(void)
3930 {
3931         struct dmar_drhd_unit *drhd;
3932         struct intel_iommu *iommu = NULL;
3933         unsigned long flag;
3934
3935         for_each_active_iommu(iommu, drhd) {
3936                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3937                                                  GFP_ATOMIC);
3938                 if (!iommu->iommu_state)
3939                         goto nomem;
3940         }
3941
3942         iommu_flush_all();
3943
3944         for_each_active_iommu(iommu, drhd) {
3945                 iommu_disable_translation(iommu);
3946
3947                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3948
3949                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3950                         readl(iommu->reg + DMAR_FECTL_REG);
3951                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3952                         readl(iommu->reg + DMAR_FEDATA_REG);
3953                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3954                         readl(iommu->reg + DMAR_FEADDR_REG);
3955                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3956                         readl(iommu->reg + DMAR_FEUADDR_REG);
3957
3958                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3959         }
3960         return 0;
3961
3962 nomem:
3963         for_each_active_iommu(iommu, drhd)
3964                 kfree(iommu->iommu_state);
3965
3966         return -ENOMEM;
3967 }
3968
3969 static void iommu_resume(void)
3970 {
3971         struct dmar_drhd_unit *drhd;
3972         struct intel_iommu *iommu = NULL;
3973         unsigned long flag;
3974
3975         if (init_iommu_hw()) {
3976                 if (force_on)
3977                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3978                 else
3979                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3980                 return;
3981         }
3982
3983         for_each_active_iommu(iommu, drhd) {
3984
3985                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3986
3987                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3988                         iommu->reg + DMAR_FECTL_REG);
3989                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3990                         iommu->reg + DMAR_FEDATA_REG);
3991                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3992                         iommu->reg + DMAR_FEADDR_REG);
3993                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3994                         iommu->reg + DMAR_FEUADDR_REG);
3995
3996                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3997         }
3998
3999         for_each_active_iommu(iommu, drhd)
4000                 kfree(iommu->iommu_state);
4001 }
4002
4003 static struct syscore_ops iommu_syscore_ops = {
4004         .resume         = iommu_resume,
4005         .suspend        = iommu_suspend,
4006 };
4007
4008 static void __init init_iommu_pm_ops(void)
4009 {
4010         register_syscore_ops(&iommu_syscore_ops);
4011 }
4012
4013 #else
4014 static inline void init_iommu_pm_ops(void) {}
4015 #endif  /* CONFIG_SUSPEND */
4016
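/*
 * Editor's note: illustrative sketch only, not driver code.  The
 * suspend/resume pair above saves and restores the four fault-event
 * registers one readl()/writel() at a time; the same pattern can be
 * expressed table-driven.  The offsets below are hypothetical and do not
 * reflect the real DMAR register map.
 */
#if 0
#include <stdint.h>

static const unsigned int demo_reg_offsets[] = { 0x00, 0x04, 0x08, 0x0c };

static void demo_save_regs(const volatile uint32_t *mmio, uint32_t *state)
{
	for (unsigned int i = 0; i < 4; i++)
		state[i] = mmio[demo_reg_offsets[i] / 4];	/* like readl() */
}

static void demo_restore_regs(volatile uint32_t *mmio, const uint32_t *state)
{
	for (unsigned int i = 0; i < 4; i++)
		mmio[demo_reg_offsets[i] / 4] = state[i];	/* like writel() */
}
#endif
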
4017 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4018 {
4019         struct acpi_dmar_reserved_memory *rmrr;
4020         struct dmar_rmrr_unit *rmrru;
4021
4022         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4023         if (!rmrru)
4024                 goto out;
4025
4026         rmrru->hdr = header;
4027         rmrr = (struct acpi_dmar_reserved_memory *)header;
4028         rmrru->base_address = rmrr->base_address;
4029         rmrru->end_address = rmrr->end_address;
4030
4031         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4032                                 ((void *)rmrr) + rmrr->header.length,
4033                                 &rmrru->devices_cnt);
4034         if (rmrru->devices_cnt && rmrru->devices == NULL)
4035                 goto free_rmrru;
4036
4037         list_add(&rmrru->list, &dmar_rmrr_units);
4038
4039         return 0;
4040 free_rmrru:
4041         kfree(rmrru);
4042 out:
4043         return -ENOMEM;
4044 }
4045
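/*
 * Editor's note: illustrative sketch only, not driver code.  The RMRR
 * parsing above treats everything between the end of the fixed reserved-
 * memory header and header.length as a variable number of device-scope
 * entries.  A minimal model of computing that byte range from a
 * hypothetical, simplified header:
 */
#if 0
#include <stddef.h>
#include <stdint.h>

struct demo_rmrr_header {
	uint16_t type;
	uint16_t length;	/* total length: header + trailing entries */
	uint64_t base_address;
	uint64_t end_address;
	/* variable-length device scope entries follow */
};

static void demo_rmrr_scope_range(const struct demo_rmrr_header *rmrr,
				  const void **start, const void **end)
{
	*start = rmrr + 1;				/* first byte after the header */
	*end   = (const char *)rmrr + rmrr->length;	/* one past the last entry */
}
#endif
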
4046 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4047 {
4048         struct dmar_atsr_unit *atsru;
4049         struct acpi_dmar_atsr *tmp;
4050
4051         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4052                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4053                 if (atsr->segment != tmp->segment)
4054                         continue;
4055                 if (atsr->header.length != tmp->header.length)
4056                         continue;
4057                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4058                         return atsru;
4059         }
4060
4061         return NULL;
4062 }
4063
4064 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4065 {
4066         struct acpi_dmar_atsr *atsr;
4067         struct dmar_atsr_unit *atsru;
4068
4069         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4070                 return 0;
4071
4072         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4073         atsru = dmar_find_atsr(atsr);
4074         if (atsru)
4075                 return 0;
4076
4077         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4078         if (!atsru)
4079                 return -ENOMEM;
4080
4081         /*
4082          * If memory is allocated from slab by ACPI _DSM method, we need to
4083          * copy the memory content because the memory buffer will be freed
4084          * on return.
4085          */
4086         atsru->hdr = (void *)(atsru + 1);
4087         memcpy(atsru->hdr, hdr, hdr->length);
4088         atsru->include_all = atsr->flags & 0x1;
4089         if (!atsru->include_all) {
4090                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4091                                 (void *)atsr + atsr->header.length,
4092                                 &atsru->devices_cnt);
4093                 if (atsru->devices_cnt && atsru->devices == NULL) {
4094                         kfree(atsru);
4095                         return -ENOMEM;
4096                 }
4097         }
4098
4099         list_add_rcu(&atsru->list, &dmar_atsr_units);
4100
4101         return 0;
4102 }
4103
4104 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4105 {
4106         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4107         kfree(atsru);
4108 }
4109
4110 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4111 {
4112         struct acpi_dmar_atsr *atsr;
4113         struct dmar_atsr_unit *atsru;
4114
4115         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4116         atsru = dmar_find_atsr(atsr);
4117         if (atsru) {
4118                 list_del_rcu(&atsru->list);
4119                 synchronize_rcu();
4120                 intel_iommu_free_atsr(atsru);
4121         }
4122
4123         return 0;
4124 }
4125
4126 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4127 {
4128         int i;
4129         struct device *dev;
4130         struct acpi_dmar_atsr *atsr;
4131         struct dmar_atsr_unit *atsru;
4132
4133         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4134         atsru = dmar_find_atsr(atsr);
4135         if (!atsru)
4136                 return 0;
4137
4138         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4139                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4140                                           i, dev)
4141                         return -EBUSY;
4142         }
4143
4144         return 0;
4145 }
4146
4147 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4148 {
4149         int sp, ret;
4150         struct intel_iommu *iommu = dmaru->iommu;
4151
4152         if (g_iommus[iommu->seq_id])
4153                 return 0;
4154
4155         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4156                 pr_warn("%s: Doesn't support hardware pass through.\n",
4157                         iommu->name);
4158                 return -ENXIO;
4159         }
4160         if (!ecap_sc_support(iommu->ecap) &&
4161             domain_update_iommu_snooping(iommu)) {
4162                 pr_warn("%s: Doesn't support snooping.\n",
4163                         iommu->name);
4164                 return -ENXIO;
4165         }
4166         sp = domain_update_iommu_superpage(iommu) - 1;
4167         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4168                 pr_warn("%s: Doesn't support large page.\n",
4169                         iommu->name);
4170                 return -ENXIO;
4171         }
4172
4173         /*
4174          * Disable translation if already enabled prior to OS handover.
4175          */
4176         if (iommu->gcmd & DMA_GCMD_TE)
4177                 iommu_disable_translation(iommu);
4178
4179         g_iommus[iommu->seq_id] = iommu;
4180         ret = iommu_init_domains(iommu);
4181         if (ret == 0)
4182                 ret = iommu_alloc_root_entry(iommu);
4183         if (ret)
4184                 goto out;
4185
4186 #ifdef CONFIG_INTEL_IOMMU_SVM
4187         if (pasid_supported(iommu))
4188                 intel_svm_init(iommu);
4189 #endif
4190
4191         if (dmaru->ignored) {
4192                 /*
4193                  * we always have to disable PMRs or DMA may fail on this device
4194                  */
4195                 if (force_on)
4196                         iommu_disable_protect_mem_regions(iommu);
4197                 return 0;
4198         }
4199
4200         intel_iommu_init_qi(iommu);
4201         iommu_flush_write_buffer(iommu);
4202
4203 #ifdef CONFIG_INTEL_IOMMU_SVM
4204         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4205                 ret = intel_svm_enable_prq(iommu);
4206                 if (ret)
4207                         goto disable_iommu;
4208         }
4209 #endif
4210         ret = dmar_set_interrupt(iommu);
4211         if (ret)
4212                 goto disable_iommu;
4213
4214         iommu_set_root_entry(iommu);
4215         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4216         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4217         iommu_enable_translation(iommu);
4218
4219         iommu_disable_protect_mem_regions(iommu);
4220         return 0;
4221
4222 disable_iommu:
4223         disable_dmar_iommu(iommu);
4224 out:
4225         free_dmar_iommu(iommu);
4226         return ret;
4227 }
4228
4229 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4230 {
4231         int ret = 0;
4232         struct intel_iommu *iommu = dmaru->iommu;
4233
4234         if (!intel_iommu_enabled)
4235                 return 0;
4236         if (iommu == NULL)
4237                 return -EINVAL;
4238
4239         if (insert) {
4240                 ret = intel_iommu_add(dmaru);
4241         } else {
4242                 disable_dmar_iommu(iommu);
4243                 free_dmar_iommu(iommu);
4244         }
4245
4246         return ret;
4247 }
4248
4249 static void intel_iommu_free_dmars(void)
4250 {
4251         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4252         struct dmar_atsr_unit *atsru, *atsr_n;
4253
4254         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4255                 list_del(&rmrru->list);
4256                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4257                 kfree(rmrru);
4258         }
4259
4260         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4261                 list_del(&atsru->list);
4262                 intel_iommu_free_atsr(atsru);
4263         }
4264 }
4265
4266 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4267 {
4268         int i, ret = 1;
4269         struct pci_bus *bus;
4270         struct pci_dev *bridge = NULL;
4271         struct device *tmp;
4272         struct acpi_dmar_atsr *atsr;
4273         struct dmar_atsr_unit *atsru;
4274
4275         dev = pci_physfn(dev);
4276         for (bus = dev->bus; bus; bus = bus->parent) {
4277                 bridge = bus->self;
4278                 /* If it's an integrated device, allow ATS */
4279                 if (!bridge)
4280                         return 1;
4281                 /* Connected via non-PCIe: no ATS */
4282                 if (!pci_is_pcie(bridge) ||
4283                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4284                         return 0;
4285                 /* If we found the root port, look it up in the ATSR */
4286                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4287                         break;
4288         }
4289
4290         rcu_read_lock();
4291         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4292                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4293                 if (atsr->segment != pci_domain_nr(dev->bus))
4294                         continue;
4295
4296                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4297                         if (tmp == &bridge->dev)
4298                                 goto out;
4299
4300                 if (atsru->include_all)
4301                         goto out;
4302         }
4303         ret = 0;
4304 out:
4305         rcu_read_unlock();
4306
4307         return ret;
4308 }
4309
4310 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4311 {
4312         int ret;
4313         struct dmar_rmrr_unit *rmrru;
4314         struct dmar_atsr_unit *atsru;
4315         struct acpi_dmar_atsr *atsr;
4316         struct acpi_dmar_reserved_memory *rmrr;
4317
4318         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4319                 return 0;
4320
4321         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4322                 rmrr = container_of(rmrru->hdr,
4323                                     struct acpi_dmar_reserved_memory, header);
4324                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4325                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4326                                 ((void *)rmrr) + rmrr->header.length,
4327                                 rmrr->segment, rmrru->devices,
4328                                 rmrru->devices_cnt);
4329                         if (ret < 0)
4330                                 return ret;
4331                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4332                         dmar_remove_dev_scope(info, rmrr->segment,
4333                                 rmrru->devices, rmrru->devices_cnt);
4334                 }
4335         }
4336
4337         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4338                 if (atsru->include_all)
4339                         continue;
4340
4341                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4342                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4343                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4344                                         (void *)atsr + atsr->header.length,
4345                                         atsr->segment, atsru->devices,
4346                                         atsru->devices_cnt);
4347                         if (ret > 0)
4348                                 break;
4349                         else if (ret < 0)
4350                                 return ret;
4351                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4352                         if (dmar_remove_dev_scope(info, atsr->segment,
4353                                         atsru->devices, atsru->devices_cnt))
4354                                 break;
4355                 }
4356         }
4357
4358         return 0;
4359 }
4360
4361 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4362                                        unsigned long val, void *v)
4363 {
4364         struct memory_notify *mhp = v;
4365         unsigned long long start, end;
4366         unsigned long start_vpfn, last_vpfn;
4367
4368         switch (val) {
4369         case MEM_GOING_ONLINE:
4370                 start = mhp->start_pfn << PAGE_SHIFT;
4371                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4372                 if (iommu_domain_identity_map(si_domain, start, end)) {
4373                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4374                                 start, end);
4375                         return NOTIFY_BAD;
4376                 }
4377                 break;
4378
4379         case MEM_OFFLINE:
4380         case MEM_CANCEL_ONLINE:
4381                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4382                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4383                 while (start_vpfn <= last_vpfn) {
4384                         struct iova *iova;
4385                         struct dmar_drhd_unit *drhd;
4386                         struct intel_iommu *iommu;
4387                         struct page *freelist;
4388
4389                         iova = find_iova(&si_domain->iovad, start_vpfn);
4390                         if (iova == NULL) {
4391                                 pr_debug("Failed to get IOVA for PFN %lx\n",
4392                                          start_vpfn);
4393                                 break;
4394                         }
4395
4396                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4397                                                      start_vpfn, last_vpfn);
4398                         if (iova == NULL) {
4399                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4400                                         start_vpfn, last_vpfn);
4401                                 return NOTIFY_BAD;
4402                         }
4403
4404                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4405                                                iova->pfn_hi);
4406
4407                         rcu_read_lock();
4408                         for_each_active_iommu(iommu, drhd)
4409                                 iommu_flush_iotlb_psi(iommu, si_domain,
4410                                         iova->pfn_lo, iova_size(iova),
4411                                         !freelist, 0);
4412                         rcu_read_unlock();
4413                         dma_free_pagelist(freelist);
4414
4415                         start_vpfn = iova->pfn_hi + 1;
4416                         free_iova_mem(iova);
4417                 }
4418                 break;
4419         }
4420
4421         return NOTIFY_OK;
4422 }
4423
4424 static struct notifier_block intel_iommu_memory_nb = {
4425         .notifier_call = intel_iommu_memory_notifier,
4426         .priority = 0
4427 };
4428
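     /*
      * Drop the per-CPU IOVA caches of every domain on every IOMMU for
      * @cpu; used by the CPU hotplug "dead" callback registered below.
      */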
4429 static void free_all_cpu_cached_iovas(unsigned int cpu)
4430 {
4431         int i;
4432
4433         for (i = 0; i < g_num_of_iommus; i++) {
4434                 struct intel_iommu *iommu = g_iommus[i];
4435                 struct dmar_domain *domain;
4436                 int did;
4437
4438                 if (!iommu)
4439                         continue;
4440
4441                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4442                         domain = get_iommu_domain(iommu, (u16)did);
4443
4444                         if (!domain)
4445                                 continue;
4446                         free_cpu_cached_iovas(cpu, &domain->iovad);
4447                 }
4448         }
4449 }
4450
4451 static int intel_iommu_cpu_dead(unsigned int cpu)
4452 {
4453         free_all_cpu_cached_iovas(cpu);
4454         return 0;
4455 }
4456
4457 static void intel_disable_iommus(void)
4458 {
4459         struct intel_iommu *iommu = NULL;
4460         struct dmar_drhd_unit *drhd;
4461
4462         for_each_iommu(iommu, drhd)
4463                 iommu_disable_translation(iommu);
4464 }
4465
4466 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4467 {
4468         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4469
4470         return container_of(iommu_dev, struct intel_iommu, iommu);
4471 }
4472
4473 static ssize_t intel_iommu_show_version(struct device *dev,
4474                                         struct device_attribute *attr,
4475                                         char *buf)
4476 {
4477         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4478         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4479         return sprintf(buf, "%d:%d\n",
4480                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4481 }
4482 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4483
4484 static ssize_t intel_iommu_show_address(struct device *dev,
4485                                         struct device_attribute *attr,
4486                                         char *buf)
4487 {
4488         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4489         return sprintf(buf, "%llx\n", iommu->reg_phys);
4490 }
4491 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4492
4493 static ssize_t intel_iommu_show_cap(struct device *dev,
4494                                     struct device_attribute *attr,
4495                                     char *buf)
4496 {
4497         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4498         return sprintf(buf, "%llx\n", iommu->cap);
4499 }
4500 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4501
4502 static ssize_t intel_iommu_show_ecap(struct device *dev,
4503                                     struct device_attribute *attr,
4504                                     char *buf)
4505 {
4506         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4507         return sprintf(buf, "%llx\n", iommu->ecap);
4508 }
4509 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4510
4511 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4512                                       struct device_attribute *attr,
4513                                       char *buf)
4514 {
4515         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4516         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4517 }
4518 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4519
4520 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4521                                            struct device_attribute *attr,
4522                                            char *buf)
4523 {
4524         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4525         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4526                                                   cap_ndoms(iommu->cap)));
4527 }
4528 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4529
4530 static struct attribute *intel_iommu_attrs[] = {
4531         &dev_attr_version.attr,
4532         &dev_attr_address.attr,
4533         &dev_attr_cap.attr,
4534         &dev_attr_ecap.attr,
4535         &dev_attr_domains_supported.attr,
4536         &dev_attr_domains_used.attr,
4537         NULL,
4538 };
4539
4540 static struct attribute_group intel_iommu_group = {
4541         .name = "intel-iommu",
4542         .attrs = intel_iommu_attrs,
4543 };
4544
4545 const struct attribute_group *intel_iommu_groups[] = {
4546         &intel_iommu_group,
4547         NULL,
4548 };
4549
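     /*
      * If the platform firmware opted in to DMA protection and at least one
      * untrusted PCI device is present, force the IOMMU on even if it was
      * disabled on the command line (unless the user explicitly overrode the
      * opt-in).  Returns 1 when the IOMMU was force enabled, 0 otherwise.
      */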
4550 static int __init platform_optin_force_iommu(void)
4551 {
4552         struct pci_dev *pdev = NULL;
4553         bool has_untrusted_dev = false;
4554
4555         if (!dmar_platform_optin() || no_platform_optin)
4556                 return 0;
4557
4558         for_each_pci_dev(pdev) {
4559                 if (pdev->untrusted) {
4560                         has_untrusted_dev = true;
4561                         break;
4562                 }
4563         }
4564
4565         if (!has_untrusted_dev)
4566                 return 0;
4567
4568         if (no_iommu || dmar_disabled)
4569                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4570
4571         /*
4572          * If Intel-IOMMU is disabled by default, we will apply identity
4573          * map for all devices except those marked as being untrusted.
4574          */
4575         if (dmar_disabled)
4576                 iommu_identity_mapping |= IDENTMAP_ALL;
4577
4578         dmar_disabled = 0;
4579 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4580         swiotlb = 0;
4581 #endif
4582         no_iommu = 0;
4583
4584         return 1;
4585 }
4586
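     /*
      * Walk the ACPI namespace devices in each DRHD device scope and probe
      * any physical node that is not already a member of an IOMMU group.
      */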
4587 static int __init probe_acpi_namespace_devices(void)
4588 {
4589         struct dmar_drhd_unit *drhd;
4590         /* To avoid a -Wunused-but-set-variable warning. */
4591         struct intel_iommu *iommu __maybe_unused;
4592         struct device *dev;
4593         int i, ret = 0;
4594
4595         for_each_active_iommu(iommu, drhd) {
4596                 for_each_active_dev_scope(drhd->devices,
4597                                           drhd->devices_cnt, i, dev) {
4598                         struct acpi_device_physical_node *pn;
4599                         struct iommu_group *group;
4600                         struct acpi_device *adev;
4601
4602                         if (dev->bus != &acpi_bus_type)
4603                                 continue;
4604
4605                         adev = to_acpi_device(dev);
4606                         mutex_lock(&adev->physical_node_lock);
4607                         list_for_each_entry(pn,
4608                                             &adev->physical_node_list, node) {
4609                                 group = iommu_group_get(pn->dev);
4610                                 if (group) {
4611                                         iommu_group_put(group);
4612                                         continue;
4613                                 }
4614
4615                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4616                                 ret = iommu_probe_device(pn->dev);
4617                                 if (ret)
4618                                         break;
4619                         }
4620                         mutex_unlock(&adev->physical_node_lock);
4621
4622                         if (ret)
4623                                 return ret;
4624                 }
4625         }
4626
4627         return 0;
4628 }
4629
4630 int __init intel_iommu_init(void)
4631 {
4632         int ret = -ENODEV;
4633         struct dmar_drhd_unit *drhd;
4634         struct intel_iommu *iommu;
4635
4636         /*
4637          * Intel IOMMU is required for a TXT/tboot launch or platform
4638          * opt in, so enforce that.
4639          */
4640         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4641
4642         if (iommu_init_mempool()) {
4643                 if (force_on)
4644                         panic("tboot: Failed to initialize iommu memory\n");
4645                 return -ENOMEM;
4646         }
4647
4648         down_write(&dmar_global_lock);
4649         if (dmar_table_init()) {
4650                 if (force_on)
4651                         panic("tboot: Failed to initialize DMAR table\n");
4652                 goto out_free_dmar;
4653         }
4654
4655         if (dmar_dev_scope_init() < 0) {
4656                 if (force_on)
4657                         panic("tboot: Failed to initialize DMAR device scope\n");
4658                 goto out_free_dmar;
4659         }
4660
4661         up_write(&dmar_global_lock);
4662
4663         /*
4664          * The bus notifier takes the dmar_global_lock, so lockdep would
4665          * complain later if we registered it while holding the lock.
4666          */
4667         dmar_register_bus_notifier();
4668
4669         down_write(&dmar_global_lock);
4670
4671         if (no_iommu || dmar_disabled) {
4672                 /*
4673                  * We exit the function here so that the IOMMU's remapping and
4674                  * mempool are never set up, which means the IOMMU's PMRs won't
4675                  * be disabled via the call to init_dmars(). So disable them
4676                  * explicitly here. The PMRs were set up by tboot prior to
4677                  * calling SENTER, but the kernel is expected to reset/tear
4678                  * down the PMRs.
4679                  */
4680                 if (intel_iommu_tboot_noforce) {
4681                         for_each_iommu(iommu, drhd)
4682                                 iommu_disable_protect_mem_regions(iommu);
4683                 }
4684
4685                 /*
4686                  * Make sure the IOMMUs are switched off, even when we
4687                  * boot into a kexec kernel and the previous kernel left
4688                  * them enabled.
4689                  */
4690                 intel_disable_iommus();
4691                 goto out_free_dmar;
4692         }
4693
4694         if (list_empty(&dmar_rmrr_units))
4695                 pr_info("No RMRR found\n");
4696
4697         if (list_empty(&dmar_atsr_units))
4698                 pr_info("No ATSR found\n");
4699
4700         if (dmar_init_reserved_ranges()) {
4701                 if (force_on)
4702                         panic("tboot: Failed to reserve iommu ranges\n");
4703                 goto out_free_reserved_range;
4704         }
4705
4706         if (dmar_map_gfx)
4707                 intel_iommu_gfx_mapped = 1;
4708
4709         init_no_remapping_devices();
4710
4711         ret = init_dmars();
4712         if (ret) {
4713                 if (force_on)
4714                         panic("tboot: Failed to initialize DMARs\n");
4715                 pr_err("Initialization failed\n");
4716                 goto out_free_reserved_range;
4717         }
4718         up_write(&dmar_global_lock);
4719
4720 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4721         swiotlb = 0;
4722 #endif
4723         dma_ops = &intel_dma_ops;
4724
4725         init_iommu_pm_ops();
4726
4727         for_each_active_iommu(iommu, drhd) {
4728                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4729                                        intel_iommu_groups,
4730                                        "%s", iommu->name);
4731                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4732                 iommu_device_register(&iommu->iommu);
4733         }
4734
4735         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4736         if (si_domain && !hw_pass_through)
4737                 register_memory_notifier(&intel_iommu_memory_nb);
4738         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4739                           intel_iommu_cpu_dead);
4740
4741         down_read(&dmar_global_lock);
4742         if (probe_acpi_namespace_devices())
4743                 pr_warn("ACPI name space devices didn't probe correctly\n");
4744         up_read(&dmar_global_lock);
4745
4746         /* Finally, we enable the DMA remapping hardware. */
4747         for_each_iommu(iommu, drhd) {
4748                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4749                         iommu_enable_translation(iommu);
4750
4751                 iommu_disable_protect_mem_regions(iommu);
4752         }
4753         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4754
4755         intel_iommu_enabled = 1;
4756         intel_iommu_debugfs_init();
4757
4758         return 0;
4759
4760 out_free_reserved_range:
4761         put_iova_domain(&reserved_iova_list);
4762 out_free_dmar:
4763         intel_iommu_free_dmars();
4764         up_write(&dmar_global_lock);
4765         iommu_exit_mempool();
4766         return ret;
4767 }
4768
4769 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4770 {
4771         struct intel_iommu *iommu = opaque;
4772
4773         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4774         return 0;
4775 }
4776
4777 /*
4778  * NB - intel-iommu lacks any sort of reference counting for the users of
4779  * dependent devices.  If multiple endpoints have intersecting dependent
4780  * devices, unbinding the driver from any one of them will possibly leave
4781  * the others unable to operate.
4782  */
4783 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4784 {
4785         if (!iommu || !dev || !dev_is_pci(dev))
4786                 return;
4787
4788         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4789 }
4790
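     /*
      * Unbind the device described by @info from its domain: tear down the
      * RID2PASID entry in scalable mode, disable the device IOTLB, clear the
      * context entries and detach the domain from the IOMMU, releasing the
      * private domain where applicable.  Caller must hold device_domain_lock.
      */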
4791 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4792 {
4793         struct dmar_domain *domain;
4794         struct intel_iommu *iommu;
4795         unsigned long flags;
4796
4797         assert_spin_locked(&device_domain_lock);
4798
4799         if (WARN_ON(!info))
4800                 return;
4801
4802         iommu = info->iommu;
4803         domain = info->domain;
4804
4805         if (info->dev) {
4806                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4807                         intel_pasid_tear_down_entry(iommu, info->dev,
4808                                         PASID_RID2PASID);
4809
4810                 iommu_disable_dev_iotlb(info);
4811                 domain_context_clear(iommu, info->dev);
4812                 intel_pasid_free_table(info->dev);
4813         }
4814
4815         unlink_domain_info(info);
4816
4817         spin_lock_irqsave(&iommu->lock, flags);
4818         domain_detach_iommu(domain, iommu);
4819         spin_unlock_irqrestore(&iommu->lock, flags);
4820
4821         /* free the private domain */
4822         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4823             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4824                 domain_exit(info->domain);
4825
4826         free_devinfo_mem(info);
4827 }
4828
4829 static void dmar_remove_one_dev_info(struct device *dev)
4830 {
4831         struct device_domain_info *info;
4832         unsigned long flags;
4833
4834         spin_lock_irqsave(&device_domain_lock, flags);
4835         info = dev->archdata.iommu;
4836         __dmar_remove_one_dev_info(info);
4837         spin_unlock_irqrestore(&device_domain_lock, flags);
4838 }
4839
4840 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4841 {
4842         struct dmar_domain *dmar_domain;
4843         struct iommu_domain *domain;
4844
4845         switch (type) {
4846         case IOMMU_DOMAIN_DMA:
4847         /* fallthrough */
4848         case IOMMU_DOMAIN_UNMANAGED:
4849                 dmar_domain = alloc_domain(0);
4850                 if (!dmar_domain) {
4851                         pr_err("Can't allocate dmar_domain\n");
4852                         return NULL;
4853                 }
4854                 if (domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4855                         pr_err("Domain initialization failed\n");
4856                         domain_exit(dmar_domain);
4857                         return NULL;
4858                 }
4859
4860                 if (type == IOMMU_DOMAIN_DMA &&
4861                     init_iova_flush_queue(&dmar_domain->iovad,
4862                                           iommu_flush_iova, iova_entry_free)) {
4863                         pr_warn("iova flush queue initialization failed\n");
4864                         intel_iommu_strict = 1;
4865                 }
4866
4867                 domain_update_iommu_cap(dmar_domain);
4868
4869                 domain = &dmar_domain->domain;
4870                 domain->geometry.aperture_start = 0;
4871                 domain->geometry.aperture_end   =
4872                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4873                 domain->geometry.force_aperture = true;
4874
4875                 return domain;
4876         case IOMMU_DOMAIN_IDENTITY:
4877                 return &si_domain->domain;
4878         default:
4879                 return NULL;
4880         }
4881
4882         return NULL;
4883 }
4884
4885 static void intel_iommu_domain_free(struct iommu_domain *domain)
4886 {
4887         if (domain != &si_domain->domain)
4888                 domain_exit(to_dmar_domain(domain));
4889 }
4890
4891 /*
4892  * Check whether a @domain could be attached to the @dev through the
4893  * aux-domain attach/detach APIs.
4894  */
4895 static inline bool
4896 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4897 {
4898         struct device_domain_info *info = dev->archdata.iommu;
4899
4900         return info && info->auxd_enabled &&
4901                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4902 }
4903
4904 static void auxiliary_link_device(struct dmar_domain *domain,
4905                                   struct device *dev)
4906 {
4907         struct device_domain_info *info = dev->archdata.iommu;
4908
4909         assert_spin_locked(&device_domain_lock);
4910         if (WARN_ON(!info))
4911                 return;
4912
4913         domain->auxd_refcnt++;
4914         list_add(&domain->auxd, &info->auxiliary_domains);
4915 }
4916
4917 static void auxiliary_unlink_device(struct dmar_domain *domain,
4918                                     struct device *dev)
4919 {
4920         struct device_domain_info *info = dev->archdata.iommu;
4921
4922         assert_spin_locked(&device_domain_lock);
4923         if (WARN_ON(!info))
4924                 return;
4925
4926         list_del(&domain->auxd);
4927         domain->auxd_refcnt--;
4928
4929         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4930                 intel_pasid_free_id(domain->default_pasid);
4931 }
4932
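     /*
      * Attach @domain to @dev as an auxiliary domain: allocate a default
      * PASID for the domain if it does not have one yet, attach the domain
      * to the device's IOMMU and install a second-level PASID table entry
      * for it.
      */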
4933 static int aux_domain_add_dev(struct dmar_domain *domain,
4934                               struct device *dev)
4935 {
4936         int ret;
4937         u8 bus, devfn;
4938         unsigned long flags;
4939         struct intel_iommu *iommu;
4940
4941         iommu = device_to_iommu(dev, &bus, &devfn);
4942         if (!iommu)
4943                 return -ENODEV;
4944
4945         if (domain->default_pasid <= 0) {
4946                 int pasid;
4947
4948                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4949                                              pci_max_pasids(to_pci_dev(dev)),
4950                                              GFP_KERNEL);
4951                 if (pasid <= 0) {
4952                         pr_err("Can't allocate default pasid\n");
4953                         return -ENODEV;
4954                 }
4955                 domain->default_pasid = pasid;
4956         }
4957
4958         spin_lock_irqsave(&device_domain_lock, flags);
4959         /*
4960          * iommu->lock must be held to attach domain to iommu and setup the
4961          * pasid entry for second level translation.
4962          */
4963         spin_lock(&iommu->lock);
4964         ret = domain_attach_iommu(domain, iommu);
4965         if (ret)
4966                 goto attach_failed;
4967
4968         /* Set up the PASID entry for mediated devices: */
4969         ret = intel_pasid_setup_second_level(iommu, domain, dev,
4970                                              domain->default_pasid);
4971         if (ret)
4972                 goto table_failed;
4973         spin_unlock(&iommu->lock);
4974
4975         auxiliary_link_device(domain, dev);
4976
4977         spin_unlock_irqrestore(&device_domain_lock, flags);
4978
4979         return 0;
4980
4981 table_failed:
4982         domain_detach_iommu(domain, iommu);
4983 attach_failed:
4984         spin_unlock(&iommu->lock);
4985         spin_unlock_irqrestore(&device_domain_lock, flags);
4986         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4987                 intel_pasid_free_id(domain->default_pasid);
4988
4989         return ret;
4990 }
4991
4992 static void aux_domain_remove_dev(struct dmar_domain *domain,
4993                                   struct device *dev)
4994 {
4995         struct device_domain_info *info;
4996         struct intel_iommu *iommu;
4997         unsigned long flags;
4998
4999         if (!is_aux_domain(dev, &domain->domain))
5000                 return;
5001
5002         spin_lock_irqsave(&device_domain_lock, flags);
5003         info = dev->archdata.iommu;
5004         iommu = info->iommu;
5005
5006         auxiliary_unlink_device(domain, dev);
5007
5008         spin_lock(&iommu->lock);
5009         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5010         domain_detach_iommu(domain, iommu);
5011         spin_unlock(&iommu->lock);
5012
5013         spin_unlock_irqrestore(&device_domain_lock, flags);
5014 }
5015
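     /*
      * Before attaching, make sure this IOMMU can address everything already
      * mapped in the domain and trim the domain's page-table depth (agaw)
      * down to what the IOMMU supports.
      */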
5016 static int prepare_domain_attach_device(struct iommu_domain *domain,
5017                                         struct device *dev)
5018 {
5019         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5020         struct intel_iommu *iommu;
5021         int addr_width;
5022         u8 bus, devfn;
5023
5024         iommu = device_to_iommu(dev, &bus, &devfn);
5025         if (!iommu)
5026                 return -ENODEV;
5027
5028         /* check if this iommu agaw is sufficient for max mapped address */
5029         addr_width = agaw_to_width(iommu->agaw);
5030         if (addr_width > cap_mgaw(iommu->cap))
5031                 addr_width = cap_mgaw(iommu->cap);
5032
5033         if (dmar_domain->max_addr > (1LL << addr_width)) {
5034                 dev_err(dev, "%s: iommu width (%d) is not "
5035                         "sufficient for the mapped address (%llx)\n",
5036                         __func__, addr_width, dmar_domain->max_addr);
5037                 return -EFAULT;
5038         }
5039         dmar_domain->gaw = addr_width;
5040
5041         /*
5042          * Knock out extra levels of page tables if necessary
5043          */
5044         while (iommu->agaw < dmar_domain->agaw) {
5045                 struct dma_pte *pte;
5046
5047                 pte = dmar_domain->pgd;
5048                 if (dma_pte_present(pte)) {
5049                         dmar_domain->pgd = (struct dma_pte *)
5050                                 phys_to_virt(dma_pte_addr(pte));
5051                         free_pgtable_page(pte);
5052                 }
5053                 dmar_domain->agaw--;
5054         }
5055
5056         return 0;
5057 }
5058
5059 static int intel_iommu_attach_device(struct iommu_domain *domain,
5060                                      struct device *dev)
5061 {
5062         int ret;
5063
5064         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5065             device_is_rmrr_locked(dev)) {
5066                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5067                 return -EPERM;
5068         }
5069
5070         if (is_aux_domain(dev, domain))
5071                 return -EPERM;
5072
5073         /* normally dev is not mapped */
5074         if (unlikely(domain_context_mapped(dev))) {
5075                 struct dmar_domain *old_domain;
5076
5077                 old_domain = find_domain(dev);
5078                 if (old_domain)
5079                         dmar_remove_one_dev_info(dev);
5080         }
5081
5082         ret = prepare_domain_attach_device(domain, dev);
5083         if (ret)
5084                 return ret;
5085
5086         return domain_add_dev_info(to_dmar_domain(domain), dev);
5087 }
5088
5089 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5090                                          struct device *dev)
5091 {
5092         int ret;
5093
5094         if (!is_aux_domain(dev, domain))
5095                 return -EPERM;
5096
5097         ret = prepare_domain_attach_device(domain, dev);
5098         if (ret)
5099                 return ret;
5100
5101         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5102 }
5103
5104 static void intel_iommu_detach_device(struct iommu_domain *domain,
5105                                       struct device *dev)
5106 {
5107         dmar_remove_one_dev_info(dev);
5108 }
5109
5110 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5111                                           struct device *dev)
5112 {
5113         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5114 }
5115
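     /*
      * iommu_ops map callback: translate the IOMMU_READ/WRITE/CACHE flags
      * into DMA PTE bits, grow the domain's max_addr if the new mapping
      * extends it (bounded by the domain's address width) and install the
      * page-table entries.
      */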
5116 static int intel_iommu_map(struct iommu_domain *domain,
5117                            unsigned long iova, phys_addr_t hpa,
5118                            size_t size, int iommu_prot)
5119 {
5120         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5121         u64 max_addr;
5122         int prot = 0;
5123         int ret;
5124
5125         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5126                 return -EINVAL;
5127
5128         if (iommu_prot & IOMMU_READ)
5129                 prot |= DMA_PTE_READ;
5130         if (iommu_prot & IOMMU_WRITE)
5131                 prot |= DMA_PTE_WRITE;
5132         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5133                 prot |= DMA_PTE_SNP;
5134
5135         max_addr = iova + size;
5136         if (dmar_domain->max_addr < max_addr) {
5137                 u64 end;
5138
5139                 /* check if minimum agaw is sufficient for mapped address */
5140                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5141                 if (end < max_addr) {
5142                         pr_err("%s: iommu width (%d) is not "
5143                                "sufficient for the mapped address (%llx)\n",
5144                                __func__, dmar_domain->gaw, max_addr);
5145                         return -EFAULT;
5146                 }
5147                 dmar_domain->max_addr = max_addr;
5148         }
5149         /* Round up size to the next multiple of VTD_PAGE_SIZE, if it and
5150            the low bits of hpa would take us onto the next page */
5151         size = aligned_nrpages(hpa, size);
5152         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5153                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5154         return ret;
5155 }
5156
5157 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5158                                 unsigned long iova, size_t size)
5159 {
5160         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5161         struct page *freelist = NULL;
5162         unsigned long start_pfn, last_pfn;
5163         unsigned int npages;
5164         int iommu_id, level = 0;
5165
5166         /* Cope with the horrid API, which requires us to unmap more than the
5167            size argument if it happens to be a large-page mapping. */
5168         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5169         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5170                 return 0;
5171
5172         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5173                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5174
5175         start_pfn = iova >> VTD_PAGE_SHIFT;
5176         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5177
5178         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5179
5180         npages = last_pfn - start_pfn + 1;
5181
5182         for_each_domain_iommu(iommu_id, dmar_domain)
5183                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5184                                       start_pfn, npages, !freelist, 0);
5185
5186         dma_free_pagelist(freelist);
5187
5188         if (dmar_domain->max_addr == iova + size)
5189                 dmar_domain->max_addr = iova;
5190
5191         return size;
5192 }
5193
5194 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5195                                             dma_addr_t iova)
5196 {
5197         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5198         struct dma_pte *pte;
5199         int level = 0;
5200         u64 phys = 0;
5201
5202         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5203                 return 0;
5204
5205         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5206         if (pte)
5207                 phys = dma_pte_addr(pte);
5208
5209         return phys;
5210 }
5211
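     /*
      * The two helpers below report whether every active IOMMU supports
      * scalable mode and PASIDs respectively; the aux-domain feature is
      * only exposed when both are true system-wide.
      */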
5212 static inline bool scalable_mode_support(void)
5213 {
5214         struct dmar_drhd_unit *drhd;
5215         struct intel_iommu *iommu;
5216         bool ret = true;
5217
5218         rcu_read_lock();
5219         for_each_active_iommu(iommu, drhd) {
5220                 if (!sm_supported(iommu)) {
5221                         ret = false;
5222                         break;
5223                 }
5224         }
5225         rcu_read_unlock();
5226
5227         return ret;
5228 }
5229
5230 static inline bool iommu_pasid_support(void)
5231 {
5232         struct dmar_drhd_unit *drhd;
5233         struct intel_iommu *iommu;
5234         bool ret = true;
5235
5236         rcu_read_lock();
5237         for_each_active_iommu(iommu, drhd) {
5238                 if (!pasid_supported(iommu)) {
5239                         ret = false;
5240                         break;
5241                 }
5242         }
5243         rcu_read_unlock();
5244
5245         return ret;
5246 }
5247
5248 static bool intel_iommu_capable(enum iommu_cap cap)
5249 {
5250         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5251                 return domain_update_iommu_snooping(NULL) == 1;
5252         if (cap == IOMMU_CAP_INTR_REMAP)
5253                 return irq_remapping_enabled == 1;
5254
5255         return false;
5256 }
5257
5258 static int intel_iommu_add_device(struct device *dev)
5259 {
5260         struct dmar_domain *dmar_domain;
5261         struct iommu_domain *domain;
5262         struct intel_iommu *iommu;
5263         struct iommu_group *group;
5264         u8 bus, devfn;
5265         int ret;
5266
5267         iommu = device_to_iommu(dev, &bus, &devfn);
5268         if (!iommu)
5269                 return -ENODEV;
5270
5271         iommu_device_link(&iommu->iommu, dev);
5272
5273         if (translation_pre_enabled(iommu))
5274                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5275
5276         group = iommu_group_get_for_dev(dev);
5277
5278         if (IS_ERR(group))
5279                 return PTR_ERR(group);
5280
5281         iommu_group_put(group);
5282
5283         domain = iommu_get_domain_for_dev(dev);
5284         dmar_domain = to_dmar_domain(domain);
5285         if (domain->type == IOMMU_DOMAIN_DMA) {
5286                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5287                         ret = iommu_request_dm_for_dev(dev);
5288                         if (ret) {
5289                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5290                                 domain_add_dev_info(si_domain, dev);
5291                                 dev_info(dev,
5292                                          "Device uses a private identity domain.\n");
5293                         }
5294                 }
5295         } else {
5296                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5297                         ret = iommu_request_dma_domain_for_dev(dev);
5298                         if (ret) {
5299                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5300                                 if (!get_private_domain_for_dev(dev)) {
5301                                         dev_warn(dev,
5302                                                  "Failed to get a private domain.\n");
5303                                         return -ENOMEM;
5304                                 }
5305
5306                                 dev_info(dev,
5307                                          "Device uses a private dma domain.\n");
5308                         }
5309                 }
5310         }
5311
5312         return 0;
5313 }
5314
5315 static void intel_iommu_remove_device(struct device *dev)
5316 {
5317         struct intel_iommu *iommu;
5318         u8 bus, devfn;
5319
5320         iommu = device_to_iommu(dev, &bus, &devfn);
5321         if (!iommu)
5322                 return;
5323
5324         iommu_group_remove_device(dev);
5325
5326         iommu_device_unlink(&iommu->iommu, dev);
5327 }
5328
5329 static void intel_iommu_get_resv_regions(struct device *device,
5330                                          struct list_head *head)
5331 {
5332         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5333         struct iommu_resv_region *reg;
5334         struct dmar_rmrr_unit *rmrr;
5335         struct device *i_dev;
5336         int i;
5337
5338         down_read(&dmar_global_lock);
5339         for_each_rmrr_units(rmrr) {
5340                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5341                                           i, i_dev) {
5342                         struct iommu_resv_region *resv;
5343                         enum iommu_resv_type type;
5344                         size_t length;
5345
5346                         if (i_dev != device &&
5347                             !is_downstream_to_pci_bridge(device, i_dev))
5348                                 continue;
5349
5350                         length = rmrr->end_address - rmrr->base_address + 1;
5351
5352                         type = device_rmrr_is_relaxable(device) ?
5353                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5354
5355                         resv = iommu_alloc_resv_region(rmrr->base_address,
5356                                                        length, prot, type);
5357                         if (!resv)
5358                                 break;
5359
5360                         list_add_tail(&resv->list, head);
5361                 }
5362         }
5363         up_read(&dmar_global_lock);
5364
5365 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5366         if (dev_is_pci(device)) {
5367                 struct pci_dev *pdev = to_pci_dev(device);
5368
5369                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5370                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5371                                                       IOMMU_RESV_DIRECT);
5372                         if (reg)
5373                                 list_add_tail(&reg->list, head);
5374                 }
5375         }
5376 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5377
5378         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5379                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5380                                       0, IOMMU_RESV_MSI);
5381         if (!reg)
5382                 return;
5383         list_add_tail(&reg->list, head);
5384 }
5385
5386 static void intel_iommu_put_resv_regions(struct device *dev,
5387                                          struct list_head *head)
5388 {
5389         struct iommu_resv_region *entry, *next;
5390
5391         list_for_each_entry_safe(entry, next, head, list)
5392                 kfree(entry);
5393 }
5394
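     /*
      * Enable PASID support for @dev: set the PASID-enable bit in its
      * context entry (flushing the context cache when it changes) and
      * enable PASID support in the device itself if it was not already
      * enabled.
      */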
5395 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5396 {
5397         struct device_domain_info *info;
5398         struct context_entry *context;
5399         struct dmar_domain *domain;
5400         unsigned long flags;
5401         u64 ctx_lo;
5402         int ret;
5403
5404         domain = find_domain(dev);
5405         if (!domain)
5406                 return -EINVAL;
5407
5408         spin_lock_irqsave(&device_domain_lock, flags);
5409         spin_lock(&iommu->lock);
5410
5411         ret = -EINVAL;
5412         info = dev->archdata.iommu;
5413         if (!info || !info->pasid_supported)
5414                 goto out;
5415
5416         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5417         if (WARN_ON(!context))
5418                 goto out;
5419
5420         ctx_lo = context[0].lo;
5421
5422         if (!(ctx_lo & CONTEXT_PASIDE)) {
5423                 ctx_lo |= CONTEXT_PASIDE;
5424                 context[0].lo = ctx_lo;
5425                 wmb();
5426                 iommu->flush.flush_context(iommu,
5427                                            domain->iommu_did[iommu->seq_id],
5428                                            PCI_DEVID(info->bus, info->devfn),
5429                                            DMA_CCMD_MASK_NOBIT,
5430                                            DMA_CCMD_DEVICE_INVL);
5431         }
5432
5433         /* Enable PASID support in the device, if it wasn't already */
5434         if (!info->pasid_enabled)
5435                 iommu_enable_dev_iotlb(info);
5436
5437         ret = 0;
5438
5439  out:
5440         spin_unlock(&iommu->lock);
5441         spin_unlock_irqrestore(&device_domain_lock, flags);
5442
5443         return ret;
5444 }
5445
5446 static void intel_iommu_apply_resv_region(struct device *dev,
5447                                           struct iommu_domain *domain,
5448                                           struct iommu_resv_region *region)
5449 {
5450         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5451         unsigned long start, end;
5452
5453         start = IOVA_PFN(region->start);
5454         end   = IOVA_PFN(region->start + region->length - 1);
5455
5456         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5457 }
5458
5459 #ifdef CONFIG_INTEL_IOMMU_SVM
5460 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5461 {
5462         struct intel_iommu *iommu;
5463         u8 bus, devfn;
5464
5465         if (iommu_dummy(dev)) {
5466                 dev_warn(dev,
5467                          "No IOMMU translation for device; cannot enable SVM\n");
5468                 return NULL;
5469         }
5470
5471         iommu = device_to_iommu(dev, &bus, &devfn);
5472         if (!iommu) {
5473                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5474                 return NULL;
5475         }
5476
5477         return iommu;
5478 }
5479 #endif /* CONFIG_INTEL_IOMMU_SVM */
5480
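     /*
      * Enable auxiliary domain support for @dev.  This requires an IOMMU in
      * scalable mode with PASID support; PASID is enabled for the device and
      * the per-device flag consulted by is_aux_domain() is set.
      */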
5481 static int intel_iommu_enable_auxd(struct device *dev)
5482 {
5483         struct device_domain_info *info;
5484         struct intel_iommu *iommu;
5485         unsigned long flags;
5486         u8 bus, devfn;
5487         int ret;
5488
5489         iommu = device_to_iommu(dev, &bus, &devfn);
5490         if (!iommu || dmar_disabled)
5491                 return -EINVAL;
5492
5493         if (!sm_supported(iommu) || !pasid_supported(iommu))
5494                 return -EINVAL;
5495
5496         ret = intel_iommu_enable_pasid(iommu, dev);
5497         if (ret)
5498                 return -ENODEV;
5499
5500         spin_lock_irqsave(&device_domain_lock, flags);
5501         info = dev->archdata.iommu;
5502         info->auxd_enabled = 1;
5503         spin_unlock_irqrestore(&device_domain_lock, flags);
5504
5505         return 0;
5506 }
5507
5508 static int intel_iommu_disable_auxd(struct device *dev)
5509 {
5510         struct device_domain_info *info;
5511         unsigned long flags;
5512
5513         spin_lock_irqsave(&device_domain_lock, flags);
5514         info = dev->archdata.iommu;
5515         if (!WARN_ON(!info))
5516                 info->auxd_enabled = 0;
5517         spin_unlock_irqrestore(&device_domain_lock, flags);
5518
5519         return 0;
5520 }
5521
5522 /*
5523  * A PCI Express Designated Vendor-Specific Extended Capability is defined
5524  * in section 3.7 of the Intel Scalable I/O Virtualization technical spec
5525  * so that system software and tools can detect endpoint devices that
5526  * support Intel Scalable I/O Virtualization without a host driver
5527  * dependency.
5528  *
5529  * Returns the config space position of the matching extended capability
5530  * structure, or 0 if the device does not support it.
5531  */
5532 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5533 {
5534         int pos;
5535         u16 vendor, id;
5536
5537         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5538         while (pos) {
5539                 pci_read_config_word(pdev, pos + 4, &vendor);
5540                 pci_read_config_word(pdev, pos + 8, &id);
5541                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5542                         return pos;
5543
5544                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5545         }
5546
5547         return 0;
5548 }
5549
5550 static bool
5551 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5552 {
5553         if (feat == IOMMU_DEV_FEAT_AUX) {
5554                 int ret;
5555
5556                 if (!dev_is_pci(dev) || dmar_disabled ||
5557                     !scalable_mode_support() || !iommu_pasid_support())
5558                         return false;
5559
5560                 ret = pci_pasid_features(to_pci_dev(dev));
5561                 if (ret < 0)
5562                         return false;
5563
5564                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5565         }
5566
5567         return false;
5568 }
5569
5570 static int
5571 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5572 {
5573         if (feat == IOMMU_DEV_FEAT_AUX)
5574                 return intel_iommu_enable_auxd(dev);
5575
5576         return -ENODEV;
5577 }
5578
5579 static int
5580 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5581 {
5582         if (feat == IOMMU_DEV_FEAT_AUX)
5583                 return intel_iommu_disable_auxd(dev);
5584
5585         return -ENODEV;
5586 }
5587
5588 static bool
5589 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5590 {
5591         struct device_domain_info *info = dev->archdata.iommu;
5592
5593         if (feat == IOMMU_DEV_FEAT_AUX)
5594                 return scalable_mode_support() && info && info->auxd_enabled;
5595
5596         return false;
5597 }
5598
5599 static int
5600 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5601 {
5602         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5603
5604         return dmar_domain->default_pasid > 0 ?
5605                         dmar_domain->default_pasid : -EINVAL;
5606 }
5607
5608 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5609                                            struct device *dev)
5610 {
5611         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5612 }
5613
5614 const struct iommu_ops intel_iommu_ops = {
5615         .capable                = intel_iommu_capable,
5616         .domain_alloc           = intel_iommu_domain_alloc,
5617         .domain_free            = intel_iommu_domain_free,
5618         .attach_dev             = intel_iommu_attach_device,
5619         .detach_dev             = intel_iommu_detach_device,
5620         .aux_attach_dev         = intel_iommu_aux_attach_device,
5621         .aux_detach_dev         = intel_iommu_aux_detach_device,
5622         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5623         .map                    = intel_iommu_map,
5624         .unmap                  = intel_iommu_unmap,
5625         .iova_to_phys           = intel_iommu_iova_to_phys,
5626         .add_device             = intel_iommu_add_device,
5627         .remove_device          = intel_iommu_remove_device,
5628         .get_resv_regions       = intel_iommu_get_resv_regions,
5629         .put_resv_regions       = intel_iommu_put_resv_regions,
5630         .apply_resv_region      = intel_iommu_apply_resv_region,
5631         .device_group           = pci_device_group,
5632         .dev_has_feat           = intel_iommu_dev_has_feat,
5633         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5634         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5635         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5636         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5637         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5638 };
5639
5640 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5641 {
5642         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5643         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5644         dmar_map_gfx = 0;
5645 }
5646
5647 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5648 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5649 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5650 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5651 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5654
5655 static void quirk_iommu_rwbf(struct pci_dev *dev)
5656 {
5657         /*
5658          * Mobile 4 Series Chipset neglects to set RWBF capability,
5659          * but needs it. Same seems to hold for the desktop versions.
5660          */
5661         pci_info(dev, "Forcing write-buffer flush capability\n");
5662         rwbf_quirk = 1;
5663 }
5664
5665 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5666 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5667 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5668 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5669 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5670 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5671 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5672
5673 #define GGC 0x52
5674 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5675 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5676 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5677 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5678 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5679 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5680 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5681 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5682
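     /*
      * Ironlake/Calpella graphics: if the BIOS did not set aside memory for
      * the VT-d shadow GTT (GGC_MEMORY_VT_ENABLED clear), graphics remapping
      * cannot work, so disable it; otherwise fall back to strict (unbatched)
      * IOTLB flushing.
      */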
5683 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5684 {
5685         unsigned short ggc;
5686
5687         if (pci_read_config_word(dev, GGC, &ggc))
5688                 return;
5689
5690         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5691                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5692                 dmar_map_gfx = 0;
5693         } else if (dmar_map_gfx) {
5694                 /* we have to ensure the gfx device is idle before we flush */
5695                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5696                 intel_iommu_strict = 1;
5697         }
5698 }
5699 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5700 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5701 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5702 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5703
5704 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5705    ISOCH DMAR unit for the Azalia sound device, but not give it any
5706    TLB entries, which causes it to deadlock. Check for that.  We do
5707    this in a function called from init_dmars(), instead of in a PCI
5708    quirk, because we don't want to print the obnoxious "BIOS broken"
5709    message if VT-d is actually disabled.
5710 */
5711 static void __init check_tylersburg_isoch(void)
5712 {
5713         struct pci_dev *pdev;
5714         uint32_t vtisochctrl;
5715
5716         /* If there's no Azalia in the system anyway, forget it. */
5717         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5718         if (!pdev)
5719                 return;
5720         pci_dev_put(pdev);
5721
5722         /* System Management Registers. Might be hidden, in which case
5723            we can't do the sanity check. But that's OK, because the
5724            known-broken BIOSes _don't_ actually hide it, so far. */
5725         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5726         if (!pdev)
5727                 return;
5728
5729         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5730                 pci_dev_put(pdev);
5731                 return;
5732         }
5733
5734         pci_dev_put(pdev);
5735
5736         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5737         if (vtisochctrl & 1)
5738                 return;
5739
5740         /* Drop all bits other than the number of TLB entries */
5741         vtisochctrl &= 0x1c;
5742
5743         /* If we have the recommended number of TLB entries (16), fine. */
5744         if (vtisochctrl == 0x10)
5745                 return;
5746
5747         /* Zero TLB entries? The BIOS is clearly broken; warn and work around it. */
5748         if (!vtisochctrl) {
5749                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5750                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5751                      dmi_get_system_info(DMI_BIOS_VENDOR),
5752                      dmi_get_system_info(DMI_BIOS_VERSION),
5753                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5754                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5755                 return;
5756         }
5757
5758         pr_warn("Recommended number of TLB entries for ISOCH unit is 16; your BIOS set %d\n",
5759                vtisochctrl);
5760 }