1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15
16 #include <linux/init.h>
17 #include <linux/bitmap.h>
18 #include <linux/debugfs.h>
19 #include <linux/export.h>
20 #include <linux/slab.h>
21 #include <linux/irq.h>
22 #include <linux/interrupt.h>
23 #include <linux/spinlock.h>
24 #include <linux/pci.h>
25 #include <linux/dmar.h>
26 #include <linux/dma-mapping.h>
27 #include <linux/mempool.h>
28 #include <linux/memory.h>
29 #include <linux/cpu.h>
30 #include <linux/timer.h>
31 #include <linux/io.h>
32 #include <linux/iova.h>
33 #include <linux/iommu.h>
34 #include <linux/intel-iommu.h>
35 #include <linux/syscore_ops.h>
36 #include <linux/tboot.h>
37 #include <linux/dmi.h>
38 #include <linux/pci-ats.h>
39 #include <linux/memblock.h>
40 #include <linux/dma-contiguous.h>
41 #include <linux/dma-direct.h>
42 #include <linux/crash_dump.h>
43 #include <linux/numa.h>
44 #include <asm/irq_remapping.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #include "irq_remapping.h"
49 #include "intel-pasid.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
56 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
57 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
58
59 #define IOAPIC_RANGE_START      (0xfee00000)
60 #define IOAPIC_RANGE_END        (0xfeefffff)
61 #define IOVA_START_ADDR         (0x1000)
62
63 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
64
65 #define MAX_AGAW_WIDTH 64
66 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
67
68 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
69 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
70
71 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
72    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
73 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
74                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
75 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
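/*
 * For instance, with gaw == 48 and 4KiB VT-d pages (VTD_PAGE_SHIFT == 12),
 * __DOMAIN_MAX_PFN(48) is (1ULL << 36) - 1: on a 64-bit build that fits in
 * an unsigned long and is used as-is, while on a 32-bit build the min_t()
 * above clamps DOMAIN_MAX_PFN(48) to ULONG_MAX.
 */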
76
77 /* IO virtual address start page frame number */
78 #define IOVA_START_PFN          (1)
79
80 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is a power-of-two multiple of 4KiB and that the
94  * mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are a power-of-two multiple of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
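/*
 * Illustration: with 4KiB base pages, ~0xFFFUL has every bit from 12 upwards
 * set, i.e. 4KiB, 8KiB, 16KiB, ... are all advertised, so the core may hand
 * us any naturally aligned, power-of-two-sized region of at least 4KiB.
 */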
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
103
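/*
 * The helpers below encode the AGAW convention used throughout this file:
 * each AGAW step adds one page-table level (LEVEL_STRIDE == 9 bits) on top
 * of a 30-bit, 2-level layout at agaw 0.  For instance, agaw 1 is a 39-bit,
 * 3-level table, agaw 2 is 48-bit/4-level and agaw 3 is 57-bit/5-level.
 */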
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
148
149 /* VT-d pages must never be larger than MM pages. Otherwise these
150    conversions are never going to work. */
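/*
 * E.g. with 4KiB MM pages both shifts are 12 and the conversions below are
 * the identity; with (hypothetical) 64KiB MM pages one MM pfn would span
 * sixteen 4KiB VT-d pfns.
 */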
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
176  * Set to 1 to panic the kernel if VT-d can't be enabled successfully
177  * (used when the kernel is launched with TXT).
178  */
179 static int force_on = 0;
180 int intel_iommu_tboot_noforce;
181 static int no_platform_optin;
182
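/*
 * A root entry is two u64s (lo/hi), so a 4KiB root table holds
 * ROOT_ENTRY_NR == 256 entries, one per PCI bus number.
 */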
183 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
184
185 /*
186  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
187  * if marked present.
188  */
189 static phys_addr_t root_entry_lctp(struct root_entry *re)
190 {
191         if (!(re->lo & 1))
192                 return 0;
193
194         return re->lo & VTD_PAGE_MASK;
195 }
196
197 /*
198  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
199  * if marked present.
200  */
201 static phys_addr_t root_entry_uctp(struct root_entry *re)
202 {
203         if (!(re->hi & 1))
204                 return 0;
205
206         return re->hi & VTD_PAGE_MASK;
207 }
208
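/*
 * Context-entry field helpers.  As used by the setters below, the low u64
 * carries the present bit (bit 0), fault-processing disable (bit 1),
 * translation type (bits 3:2) and the address root (bits 12 and up); the
 * high u64 carries the address width (bits 2:0) and the domain id
 * (bits 23:8).
 */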
209 static inline void context_clear_pasid_enable(struct context_entry *context)
210 {
211         context->lo &= ~(1ULL << 11);
212 }
213
214 static inline bool context_pasid_enabled(struct context_entry *context)
215 {
216         return !!(context->lo & (1ULL << 11));
217 }
218
219 static inline void context_set_copied(struct context_entry *context)
220 {
221         context->hi |= (1ull << 3);
222 }
223
224 static inline bool context_copied(struct context_entry *context)
225 {
226         return !!(context->hi & (1ULL << 3));
227 }
228
229 static inline bool __context_present(struct context_entry *context)
230 {
231         return (context->lo & 1);
232 }
233
234 bool context_present(struct context_entry *context)
235 {
236         return context_pasid_enabled(context) ?
237              __context_present(context) :
238              __context_present(context) && !context_copied(context);
239 }
240
241 static inline void context_set_present(struct context_entry *context)
242 {
243         context->lo |= 1;
244 }
245
246 static inline void context_set_fault_enable(struct context_entry *context)
247 {
248         context->lo &= (((u64)-1) << 2) | 1;
249 }
250
251 static inline void context_set_translation_type(struct context_entry *context,
252                                                 unsigned long value)
253 {
254         context->lo &= (((u64)-1) << 4) | 3;
255         context->lo |= (value & 3) << 2;
256 }
257
258 static inline void context_set_address_root(struct context_entry *context,
259                                             unsigned long value)
260 {
261         context->lo &= ~VTD_PAGE_MASK;
262         context->lo |= value & VTD_PAGE_MASK;
263 }
264
265 static inline void context_set_address_width(struct context_entry *context,
266                                              unsigned long value)
267 {
268         context->hi |= value & 7;
269 }
270
271 static inline void context_set_domain_id(struct context_entry *context,
272                                          unsigned long value)
273 {
274         context->hi |= (value & ((1 << 16) - 1)) << 8;
275 }
276
277 static inline int context_domain_id(struct context_entry *c)
278 {
279         return((c->hi >> 8) & 0xffff);
280 }
281
282 static inline void context_clear_entry(struct context_entry *context)
283 {
284         context->lo = 0;
285         context->hi = 0;
286 }
287
288 /*
289  * This domain is a statically identity mapping domain.
290  *      1. This domain creates a static 1:1 mapping of all usable memory.
291  *      2. It maps to each iommu if successful.
292  *      3. Each iommu maps to this domain if successful.
293  */
294 static struct dmar_domain *si_domain;
295 static int hw_pass_through = 1;
296
297 /* si_domain contains multiple devices */
298 #define DOMAIN_FLAG_STATIC_IDENTITY             BIT(0)
299
300 /*
301  * This is a DMA domain allocated through the iommu domain allocation
302  * interface. But one or more devices belonging to this domain have
303  * been chosen to use a private domain. We should avoid using the
304  * map/unmap/iova_to_phys APIs on it.
305  */
306 #define DOMAIN_FLAG_LOSE_CHILDREN               BIT(1)
307
308 #define for_each_domain_iommu(idx, domain)                      \
309         for (idx = 0; idx < g_num_of_iommus; idx++)             \
310                 if (domain->iommu_refcnt[idx])
311
312 struct dmar_rmrr_unit {
313         struct list_head list;          /* list of rmrr units   */
314         struct acpi_dmar_header *hdr;   /* ACPI header          */
315         u64     base_address;           /* reserved base address*/
316         u64     end_address;            /* reserved end address */
317         struct dmar_dev_scope *devices; /* target devices */
318         int     devices_cnt;            /* target device count */
319 };
320
321 struct dmar_atsr_unit {
322         struct list_head list;          /* list of ATSR units */
323         struct acpi_dmar_header *hdr;   /* ACPI header */
324         struct dmar_dev_scope *devices; /* target devices */
325         int devices_cnt;                /* target device count */
326         u8 include_all:1;               /* include all ports */
327 };
328
329 static LIST_HEAD(dmar_atsr_units);
330 static LIST_HEAD(dmar_rmrr_units);
331
332 #define for_each_rmrr_units(rmrr) \
333         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
334
335 /* number of IOMMUs in the system; used to size and index g_iommus */
336 static int g_num_of_iommus;
337
338 static void domain_exit(struct dmar_domain *domain);
339 static void domain_remove_dev_info(struct dmar_domain *domain);
340 static void dmar_remove_one_dev_info(struct device *dev);
341 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
342 static void domain_context_clear(struct intel_iommu *iommu,
343                                  struct device *dev);
344 static int domain_detach_iommu(struct dmar_domain *domain,
345                                struct intel_iommu *iommu);
346 static bool device_is_rmrr_locked(struct device *dev);
347 static int intel_iommu_attach_device(struct iommu_domain *domain,
348                                      struct device *dev);
349
350 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
351 int dmar_disabled = 0;
352 #else
353 int dmar_disabled = 1;
354 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
355
356 int intel_iommu_sm;
357 int intel_iommu_enabled = 0;
358 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
359
360 static int dmar_map_gfx = 1;
361 static int dmar_forcedac;
362 static int intel_iommu_strict;
363 static int intel_iommu_superpage = 1;
364 static int iommu_identity_mapping;
365
366 #define IDENTMAP_ALL            1
367 #define IDENTMAP_GFX            2
368 #define IDENTMAP_AZALIA         4
369
370 int intel_iommu_gfx_mapped;
371 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
372
373 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
374 #define DEFER_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-2))
375 static DEFINE_SPINLOCK(device_domain_lock);
376 static LIST_HEAD(device_domain_list);
377
378 /*
379  * Iterate over elements in device_domain_list and call the specified
380  * callback @fn against each element.
381  */
382 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
383                                      void *data), void *data)
384 {
385         int ret = 0;
386         unsigned long flags;
387         struct device_domain_info *info;
388
389         spin_lock_irqsave(&device_domain_lock, flags);
390         list_for_each_entry(info, &device_domain_list, global) {
391                 ret = fn(info, data);
392                 if (ret) {
393                         spin_unlock_irqrestore(&device_domain_lock, flags);
394                         return ret;
395                 }
396         }
397         spin_unlock_irqrestore(&device_domain_lock, flags);
398
399         return 0;
400 }
401
402 const struct iommu_ops intel_iommu_ops;
403
404 static bool translation_pre_enabled(struct intel_iommu *iommu)
405 {
406         return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
407 }
408
409 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
410 {
411         iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
412 }
413
414 static void init_translation_status(struct intel_iommu *iommu)
415 {
416         u32 gsts;
417
418         gsts = readl(iommu->reg + DMAR_GSTS_REG);
419         if (gsts & DMA_GSTS_TES)
420                 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
421 }
422
423 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
424 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
425 {
426         return container_of(dom, struct dmar_domain, domain);
427 }
428
429 static int __init intel_iommu_setup(char *str)
430 {
431         if (!str)
432                 return -EINVAL;
433         while (*str) {
434                 if (!strncmp(str, "on", 2)) {
435                         dmar_disabled = 0;
436                         pr_info("IOMMU enabled\n");
437                 } else if (!strncmp(str, "off", 3)) {
438                         dmar_disabled = 1;
439                         no_platform_optin = 1;
440                         pr_info("IOMMU disabled\n");
441                 } else if (!strncmp(str, "igfx_off", 8)) {
442                         dmar_map_gfx = 0;
443                         pr_info("Disable GFX device mapping\n");
444                 } else if (!strncmp(str, "forcedac", 8)) {
445                         pr_info("Forcing DAC for PCI devices\n");
446                         dmar_forcedac = 1;
447                 } else if (!strncmp(str, "strict", 6)) {
448                         pr_info("Disable batched IOTLB flush\n");
449                         intel_iommu_strict = 1;
450                 } else if (!strncmp(str, "sp_off", 6)) {
451                         pr_info("Disable supported super page\n");
452                         intel_iommu_superpage = 0;
453                 } else if (!strncmp(str, "sm_on", 5)) {
454                         pr_info("Intel-IOMMU: scalable mode supported\n");
455                         intel_iommu_sm = 1;
456                 } else if (!strncmp(str, "tboot_noforce", 13)) {
457                         printk(KERN_INFO
458                                 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
459                         intel_iommu_tboot_noforce = 1;
460                 }
461
462                 str += strcspn(str, ",");
463                 while (*str == ',')
464                         str++;
465         }
466         return 0;
467 }
468 __setup("intel_iommu=", intel_iommu_setup);
469
470 static struct kmem_cache *iommu_domain_cache;
471 static struct kmem_cache *iommu_devinfo_cache;
472
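/*
 * iommu->domains is a two-level array indexed by domain id: (did >> 8)
 * selects a 256-entry chunk and (did & 0xff) the slot within it, so chunks
 * are only allocated once a domain id in that range is actually used.
 */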
473 static struct dmar_domain *get_iommu_domain(struct intel_iommu *iommu, u16 did)
474 {
475         struct dmar_domain **domains;
476         int idx = did >> 8;
477
478         domains = iommu->domains[idx];
479         if (!domains)
480                 return NULL;
481
482         return domains[did & 0xff];
483 }
484
485 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
486                              struct dmar_domain *domain)
487 {
488         struct dmar_domain **domains;
489         int idx = did >> 8;
490
491         if (!iommu->domains[idx]) {
492                 size_t size = 256 * sizeof(struct dmar_domain *);
493                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
494         }
495
496         domains = iommu->domains[idx];
497         if (WARN_ON(!domains))
498                 return;
499         else
500                 domains[did & 0xff] = domain;
501 }
502
503 void *alloc_pgtable_page(int node)
504 {
505         struct page *page;
506         void *vaddr = NULL;
507
508         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
509         if (page)
510                 vaddr = page_address(page);
511         return vaddr;
512 }
513
514 void free_pgtable_page(void *vaddr)
515 {
516         free_page((unsigned long)vaddr);
517 }
518
519 static inline void *alloc_domain_mem(void)
520 {
521         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
522 }
523
524 static void free_domain_mem(void *vaddr)
525 {
526         kmem_cache_free(iommu_domain_cache, vaddr);
527 }
528
529 static inline void *alloc_devinfo_mem(void)
530 {
531         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
532 }
533
534 static inline void free_devinfo_mem(void *vaddr)
535 {
536         kmem_cache_free(iommu_devinfo_cache, vaddr);
537 }
538
539 static inline int domain_type_is_si(struct dmar_domain *domain)
540 {
541         return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
542 }
543
544 static inline int domain_pfn_supported(struct dmar_domain *domain,
545                                        unsigned long pfn)
546 {
547         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
548
549         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
550 }
551
552 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
553 {
554         unsigned long sagaw;
555         int agaw = -1;
556
557         sagaw = cap_sagaw(iommu->cap);
558         for (agaw = width_to_agaw(max_gaw);
559              agaw >= 0; agaw--) {
560                 if (test_bit(agaw, &sagaw))
561                         break;
562         }
563
564         return agaw;
565 }
566
567 /*
568  * Calculate max SAGAW for each iommu.
569  */
570 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
571 {
572         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
573 }
574
575 /*
576  * Calculate the agaw for each iommu.
577  * "SAGAW" may be different across iommus: use a default agaw, and
578  * fall back to a smaller supported agaw for iommus that don't support the default.
579  */
580 int iommu_calculate_agaw(struct intel_iommu *iommu)
581 {
582         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
583 }
584
585 /* This function only returns a single iommu in a domain */
586 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
587 {
588         int iommu_id;
589
590         /* si_domain and vm domain should not get here. */
591         if (WARN_ON(domain->domain.type != IOMMU_DOMAIN_DMA))
592                 return NULL;
593
594         for_each_domain_iommu(iommu_id, domain)
595                 break;
596
597         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
598                 return NULL;
599
600         return g_iommus[iommu_id];
601 }
602
603 static void domain_update_iommu_coherency(struct dmar_domain *domain)
604 {
605         struct dmar_drhd_unit *drhd;
606         struct intel_iommu *iommu;
607         bool found = false;
608         int i;
609
610         domain->iommu_coherency = 1;
611
612         for_each_domain_iommu(i, domain) {
613                 found = true;
614                 if (!ecap_coherent(g_iommus[i]->ecap)) {
615                         domain->iommu_coherency = 0;
616                         break;
617                 }
618         }
619         if (found)
620                 return;
621
622         /* No hardware attached; use lowest common denominator */
623         rcu_read_lock();
624         for_each_active_iommu(iommu, drhd) {
625                 if (!ecap_coherent(iommu->ecap)) {
626                         domain->iommu_coherency = 0;
627                         break;
628                 }
629         }
630         rcu_read_unlock();
631 }
632
633 static int domain_update_iommu_snooping(struct intel_iommu *skip)
634 {
635         struct dmar_drhd_unit *drhd;
636         struct intel_iommu *iommu;
637         int ret = 1;
638
639         rcu_read_lock();
640         for_each_active_iommu(iommu, drhd) {
641                 if (iommu != skip) {
642                         if (!ecap_sc_support(iommu->ecap)) {
643                                 ret = 0;
644                                 break;
645                         }
646                 }
647         }
648         rcu_read_unlock();
649
650         return ret;
651 }
652
653 static int domain_update_iommu_superpage(struct intel_iommu *skip)
654 {
655         struct dmar_drhd_unit *drhd;
656         struct intel_iommu *iommu;
657         int mask = 0xf;
658
659         if (!intel_iommu_superpage) {
660                 return 0;
661         }
662
663         /* set iommu_superpage to the smallest common denominator */
664         rcu_read_lock();
665         for_each_active_iommu(iommu, drhd) {
666                 if (iommu != skip) {
667                         mask &= cap_super_page_val(iommu->cap);
668                         if (!mask)
669                                 break;
670                 }
671         }
672         rcu_read_unlock();
673
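        /*
         * mask follows the capability register's large-page field (bit 0 =
         * 2MiB, bit 1 = 1GiB): e.g. if every iommu supports both sizes,
         * fls(0x3) == 2 superpage levels; any iommu without superpage
         * support zeroes the mask and we return 0.
         */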
674         return fls(mask);
675 }
676
677 /* Some capabilities may be different across iommus */
678 static void domain_update_iommu_cap(struct dmar_domain *domain)
679 {
680         domain_update_iommu_coherency(domain);
681         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
682         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
683 }
684
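/*
 * Look up (and optionally allocate) the context entry for bus/devfn.  In
 * scalable mode the root entry is split: root->lo covers devfn 0x00-0x7f
 * and root->hi covers devfn 0x80-0xff, and each device occupies two
 * legacy-sized context slots, hence the devfn adjustment below.
 */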
685 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
686                                          u8 devfn, int alloc)
687 {
688         struct root_entry *root = &iommu->root_entry[bus];
689         struct context_entry *context;
690         u64 *entry;
691
692         entry = &root->lo;
693         if (sm_supported(iommu)) {
694                 if (devfn >= 0x80) {
695                         devfn -= 0x80;
696                         entry = &root->hi;
697                 }
698                 devfn *= 2;
699         }
700         if (*entry & 1)
701                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
702         else {
703                 unsigned long phy_addr;
704                 if (!alloc)
705                         return NULL;
706
707                 context = alloc_pgtable_page(iommu->node);
708                 if (!context)
709                         return NULL;
710
711                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
712                 phy_addr = virt_to_phys((void *)context);
713                 *entry = phy_addr | 1;
714                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
715         }
716         return &context[devfn];
717 }
718
719 static int iommu_dummy(struct device *dev)
720 {
721         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
722 }
723
724 /**
725  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
726  *                               sub-hierarchy of a candidate PCI-PCI bridge
727  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
728  * @bridge: the candidate PCI-PCI bridge
729  *
730  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
731  */
732 static bool
733 is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
734 {
735         struct pci_dev *pdev, *pbridge;
736
737         if (!dev_is_pci(dev) || !dev_is_pci(bridge))
738                 return false;
739
740         pdev = to_pci_dev(dev);
741         pbridge = to_pci_dev(bridge);
742
743         if (pbridge->subordinate &&
744             pbridge->subordinate->number <= pdev->bus->number &&
745             pbridge->subordinate->busn_res.end >= pdev->bus->number)
746                 return true;
747
748         return false;
749 }
750
751 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
752 {
753         struct dmar_drhd_unit *drhd = NULL;
754         struct intel_iommu *iommu;
755         struct device *tmp;
756         struct pci_dev *pdev = NULL;
757         u16 segment = 0;
758         int i;
759
760         if (iommu_dummy(dev))
761                 return NULL;
762
763         if (dev_is_pci(dev)) {
764                 struct pci_dev *pf_pdev;
765
766                 pdev = to_pci_dev(dev);
767
768 #ifdef CONFIG_X86
769                 /* VMD child devices currently cannot be handled individually */
770                 if (is_vmd(pdev->bus))
771                         return NULL;
772 #endif
773
774                 /* VFs aren't listed in scope tables; we need to look up
775                  * the PF instead to find the IOMMU. */
776                 pf_pdev = pci_physfn(pdev);
777                 dev = &pf_pdev->dev;
778                 segment = pci_domain_nr(pdev->bus);
779         } else if (has_acpi_companion(dev))
780                 dev = &ACPI_COMPANION(dev)->dev;
781
782         rcu_read_lock();
783         for_each_active_iommu(iommu, drhd) {
784                 if (pdev && segment != drhd->segment)
785                         continue;
786
787                 for_each_active_dev_scope(drhd->devices,
788                                           drhd->devices_cnt, i, tmp) {
789                         if (tmp == dev) {
790                                 /* For a VF use its original BDF# not that of the PF
791                                  * which we used for the IOMMU lookup. Strictly speaking
792                                  * we could do this for all PCI devices; we only need to
793                                  * get the BDF# from the scope table for ACPI matches. */
794                                 if (pdev && pdev->is_virtfn)
795                                         goto got_pdev;
796
797                                 *bus = drhd->devices[i].bus;
798                                 *devfn = drhd->devices[i].devfn;
799                                 goto out;
800                         }
801
802                         if (is_downstream_to_pci_bridge(dev, tmp))
803                                 goto got_pdev;
804                 }
805
806                 if (pdev && drhd->include_all) {
807                 got_pdev:
808                         *bus = pdev->bus->number;
809                         *devfn = pdev->devfn;
810                         goto out;
811                 }
812         }
813         iommu = NULL;
814  out:
815         rcu_read_unlock();
816
817         return iommu;
818 }
819
820 static void domain_flush_cache(struct dmar_domain *domain,
821                                void *addr, int size)
822 {
823         if (!domain->iommu_coherency)
824                 clflush_cache_range(addr, size);
825 }
826
827 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
828 {
829         struct context_entry *context;
830         int ret = 0;
831         unsigned long flags;
832
833         spin_lock_irqsave(&iommu->lock, flags);
834         context = iommu_context_addr(iommu, bus, devfn, 0);
835         if (context)
836                 ret = context_present(context);
837         spin_unlock_irqrestore(&iommu->lock, flags);
838         return ret;
839 }
840
841 static void free_context_table(struct intel_iommu *iommu)
842 {
843         int i;
844         unsigned long flags;
845         struct context_entry *context;
846
847         spin_lock_irqsave(&iommu->lock, flags);
848         if (!iommu->root_entry) {
849                 goto out;
850         }
851         for (i = 0; i < ROOT_ENTRY_NR; i++) {
852                 context = iommu_context_addr(iommu, i, 0, 0);
853                 if (context)
854                         free_pgtable_page(context);
855
856                 if (!sm_supported(iommu))
857                         continue;
858
859                 context = iommu_context_addr(iommu, i, 0x80, 0);
860                 if (context)
861                         free_pgtable_page(context);
862
863         }
864         free_pgtable_page(iommu->root_entry);
865         iommu->root_entry = NULL;
866 out:
867         spin_unlock_irqrestore(&iommu->lock, flags);
868 }
869
870 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
871                                       unsigned long pfn, int *target_level)
872 {
873         struct dma_pte *parent, *pte;
874         int level = agaw_to_level(domain->agaw);
875         int offset;
876
877         BUG_ON(!domain->pgd);
878
879         if (!domain_pfn_supported(domain, pfn))
880                 /* Address beyond IOMMU's addressing capabilities. */
881                 return NULL;
882
883         parent = domain->pgd;
884
885         while (1) {
886                 void *tmp_page;
887
888                 offset = pfn_level_offset(pfn, level);
889                 pte = &parent[offset];
890                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
891                         break;
892                 if (level == *target_level)
893                         break;
894
895                 if (!dma_pte_present(pte)) {
896                         uint64_t pteval;
897
898                         tmp_page = alloc_pgtable_page(domain->nid);
899
900                         if (!tmp_page)
901                                 return NULL;
902
903                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
904                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
905                         if (cmpxchg64(&pte->val, 0ULL, pteval))
906                                 /* Someone else set it while we were thinking; use theirs. */
907                                 free_pgtable_page(tmp_page);
908                         else
909                                 domain_flush_cache(domain, pte, sizeof(*pte));
910                 }
911                 if (level == 1)
912                         break;
913
914                 parent = phys_to_virt(dma_pte_addr(pte));
915                 level--;
916         }
917
918         if (!*target_level)
919                 *target_level = level;
920
921         return pte;
922 }
923
924 /* return address's pte at specific level */
925 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
926                                          unsigned long pfn,
927                                          int level, int *large_page)
928 {
929         struct dma_pte *parent, *pte;
930         int total = agaw_to_level(domain->agaw);
931         int offset;
932
933         parent = domain->pgd;
934         while (level <= total) {
935                 offset = pfn_level_offset(pfn, total);
936                 pte = &parent[offset];
937                 if (level == total)
938                         return pte;
939
940                 if (!dma_pte_present(pte)) {
941                         *large_page = total;
942                         break;
943                 }
944
945                 if (dma_pte_superpage(pte)) {
946                         *large_page = total;
947                         return pte;
948                 }
949
950                 parent = phys_to_virt(dma_pte_addr(pte));
951                 total--;
952         }
953         return NULL;
954 }
955
956 /* clear last level ptes; a tlb flush should follow */
957 static void dma_pte_clear_range(struct dmar_domain *domain,
958                                 unsigned long start_pfn,
959                                 unsigned long last_pfn)
960 {
961         unsigned int large_page;
962         struct dma_pte *first_pte, *pte;
963
964         BUG_ON(!domain_pfn_supported(domain, start_pfn));
965         BUG_ON(!domain_pfn_supported(domain, last_pfn));
966         BUG_ON(start_pfn > last_pfn);
967
968         /* we don't need lock here; nobody else touches the iova range */
969         do {
970                 large_page = 1;
971                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
972                 if (!pte) {
973                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974                         continue;
975                 }
976                 do {
977                         dma_clear_pte(pte);
978                         start_pfn += lvl_to_nr_pages(large_page);
979                         pte++;
980                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
981
982                 domain_flush_cache(domain, first_pte,
983                                    (void *)pte - (void *)first_pte);
984
985         } while (start_pfn && start_pfn <= last_pfn);
986 }
987
988 static void dma_pte_free_level(struct dmar_domain *domain, int level,
989                                int retain_level, struct dma_pte *pte,
990                                unsigned long pfn, unsigned long start_pfn,
991                                unsigned long last_pfn)
992 {
993         pfn = max(start_pfn, pfn);
994         pte = &pte[pfn_level_offset(pfn, level)];
995
996         do {
997                 unsigned long level_pfn;
998                 struct dma_pte *level_pte;
999
1000                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1001                         goto next;
1002
1003                 level_pfn = pfn & level_mask(level);
1004                 level_pte = phys_to_virt(dma_pte_addr(pte));
1005
1006                 if (level > 2) {
1007                         dma_pte_free_level(domain, level - 1, retain_level,
1008                                            level_pte, level_pfn, start_pfn,
1009                                            last_pfn);
1010                 }
1011
1012                 /*
1013                  * Free the page table if we're below the level we want to
1014                  * retain and the range covers the entire table.
1015                  */
1016                 if (level < retain_level && !(start_pfn > level_pfn ||
1017                       last_pfn < level_pfn + level_size(level) - 1)) {
1018                         dma_clear_pte(pte);
1019                         domain_flush_cache(domain, pte, sizeof(*pte));
1020                         free_pgtable_page(level_pte);
1021                 }
1022 next:
1023                 pfn += level_size(level);
1024         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1025 }
1026
1027 /*
1028  * clear last level (leaf) ptes and free page table pages below the
1029  * level we wish to keep intact.
1030  */
1031 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1032                                    unsigned long start_pfn,
1033                                    unsigned long last_pfn,
1034                                    int retain_level)
1035 {
1036         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1037         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1038         BUG_ON(start_pfn > last_pfn);
1039
1040         dma_pte_clear_range(domain, start_pfn, last_pfn);
1041
1042         /* We don't need lock here; nobody else touches the iova range */
1043         dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1044                            domain->pgd, 0, start_pfn, last_pfn);
1045
1046         /* free pgd */
1047         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1048                 free_pgtable_page(domain->pgd);
1049                 domain->pgd = NULL;
1050         }
1051 }
1052
1053 /* When a page at a given level is being unlinked from its parent, we don't
1054    need to *modify* it at all. All we need to do is make a list of all the
1055    pages which can be freed just as soon as we've flushed the IOTLB and we
1056    know the hardware page-walk will no longer touch them.
1057    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1058    be freed. */
1059 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1060                                             int level, struct dma_pte *pte,
1061                                             struct page *freelist)
1062 {
1063         struct page *pg;
1064
1065         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1066         pg->freelist = freelist;
1067         freelist = pg;
1068
1069         if (level == 1)
1070                 return freelist;
1071
1072         pte = page_address(pg);
1073         do {
1074                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1075                         freelist = dma_pte_list_pagetables(domain, level - 1,
1076                                                            pte, freelist);
1077                 pte++;
1078         } while (!first_pte_in_page(pte));
1079
1080         return freelist;
1081 }
1082
1083 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1084                                         struct dma_pte *pte, unsigned long pfn,
1085                                         unsigned long start_pfn,
1086                                         unsigned long last_pfn,
1087                                         struct page *freelist)
1088 {
1089         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1090
1091         pfn = max(start_pfn, pfn);
1092         pte = &pte[pfn_level_offset(pfn, level)];
1093
1094         do {
1095                 unsigned long level_pfn;
1096
1097                 if (!dma_pte_present(pte))
1098                         goto next;
1099
1100                 level_pfn = pfn & level_mask(level);
1101
1102                 /* If range covers entire pagetable, free it */
1103                 if (start_pfn <= level_pfn &&
1104                     last_pfn >= level_pfn + level_size(level) - 1) {
1105                         /* These subordinate page tables are going away entirely. Don't
1106                            bother to clear them; we're just going to *free* them. */
1107                         if (level > 1 && !dma_pte_superpage(pte))
1108                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1109
1110                         dma_clear_pte(pte);
1111                         if (!first_pte)
1112                                 first_pte = pte;
1113                         last_pte = pte;
1114                 } else if (level > 1) {
1115                         /* Recurse down into a level that isn't *entirely* obsolete */
1116                         freelist = dma_pte_clear_level(domain, level - 1,
1117                                                        phys_to_virt(dma_pte_addr(pte)),
1118                                                        level_pfn, start_pfn, last_pfn,
1119                                                        freelist);
1120                 }
1121 next:
1122                 pfn += level_size(level);
1123         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1124
1125         if (first_pte)
1126                 domain_flush_cache(domain, first_pte,
1127                                    (void *)++last_pte - (void *)first_pte);
1128
1129         return freelist;
1130 }
1131
1132 /* We can't just free the pages because the IOMMU may still be walking
1133    the page tables, and may have cached the intermediate levels. The
1134    pages can only be freed after the IOTLB flush has been done. */
1135 static struct page *domain_unmap(struct dmar_domain *domain,
1136                                  unsigned long start_pfn,
1137                                  unsigned long last_pfn)
1138 {
1139         struct page *freelist;
1140
1141         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1142         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1143         BUG_ON(start_pfn > last_pfn);
1144
1145         /* we don't need lock here; nobody else touches the iova range */
1146         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1147                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1148
1149         /* free pgd */
1150         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1151                 struct page *pgd_page = virt_to_page(domain->pgd);
1152                 pgd_page->freelist = freelist;
1153                 freelist = pgd_page;
1154
1155                 domain->pgd = NULL;
1156         }
1157
1158         return freelist;
1159 }
1160
1161 static void dma_free_pagelist(struct page *freelist)
1162 {
1163         struct page *pg;
1164
1165         while ((pg = freelist)) {
1166                 freelist = pg->freelist;
1167                 free_pgtable_page(page_address(pg));
1168         }
1169 }
1170
1171 static void iova_entry_free(unsigned long data)
1172 {
1173         struct page *freelist = (struct page *)data;
1174
1175         dma_free_pagelist(freelist);
1176 }
1177
1178 /* iommu handling */
1179 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1180 {
1181         struct root_entry *root;
1182         unsigned long flags;
1183
1184         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1185         if (!root) {
1186                 pr_err("Allocating root entry for %s failed\n",
1187                         iommu->name);
1188                 return -ENOMEM;
1189         }
1190
1191         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1192
1193         spin_lock_irqsave(&iommu->lock, flags);
1194         iommu->root_entry = root;
1195         spin_unlock_irqrestore(&iommu->lock, flags);
1196
1197         return 0;
1198 }
1199
1200 static void iommu_set_root_entry(struct intel_iommu *iommu)
1201 {
1202         u64 addr;
1203         u32 sts;
1204         unsigned long flag;
1205
1206         addr = virt_to_phys(iommu->root_entry);
1207         if (sm_supported(iommu))
1208                 addr |= DMA_RTADDR_SMT;
1209
1210         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1211         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1212
1213         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1214
1215         /* Make sure hardware completes it */
1216         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1217                       readl, (sts & DMA_GSTS_RTPS), sts);
1218
1219         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1220 }
1221
1222 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 {
1224         u32 val;
1225         unsigned long flag;
1226
1227         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1228                 return;
1229
1230         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1231         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1232
1233         /* Make sure hardware completes it */
1234         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1235                       readl, (!(val & DMA_GSTS_WBFS)), val);
1236
1237         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1238 }
1239
1240 /* return value determines if we need a write buffer flush */
1241 static void __iommu_flush_context(struct intel_iommu *iommu,
1242                                   u16 did, u16 source_id, u8 function_mask,
1243                                   u64 type)
1244 {
1245         u64 val = 0;
1246         unsigned long flag;
1247
1248         switch (type) {
1249         case DMA_CCMD_GLOBAL_INVL:
1250                 val = DMA_CCMD_GLOBAL_INVL;
1251                 break;
1252         case DMA_CCMD_DOMAIN_INVL:
1253                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1254                 break;
1255         case DMA_CCMD_DEVICE_INVL:
1256                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1257                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258                 break;
1259         default:
1260                 BUG();
1261         }
1262         val |= DMA_CCMD_ICC;
1263
1264         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1265         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1266
1267         /* Make sure hardware completes it */
1268         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1269                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1270
1271         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1272 }
1273
1274 /* return value determines if we need a write buffer flush */
1275 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1276                                 u64 addr, unsigned int size_order, u64 type)
1277 {
1278         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1279         u64 val = 0, val_iva = 0;
1280         unsigned long flag;
1281
1282         switch (type) {
1283         case DMA_TLB_GLOBAL_FLUSH:
1284                 /* global flush doesn't need to set IVA_REG */
1285                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1286                 break;
1287         case DMA_TLB_DSI_FLUSH:
1288                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1289                 break;
1290         case DMA_TLB_PSI_FLUSH:
1291                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1292                 /* IH bit is passed in as part of address */
1293                 val_iva = size_order | addr;
1294                 break;
1295         default:
1296                 BUG();
1297         }
1298         /* Note: set drain read/write */
1299 #if 0
1300         /*
1301          * This is probably meant to be extra safe. Looks like we can
1302          * ignore it without any impact.
1303          */
1304         if (cap_read_drain(iommu->cap))
1305                 val |= DMA_TLB_READ_DRAIN;
1306 #endif
1307         if (cap_write_drain(iommu->cap))
1308                 val |= DMA_TLB_WRITE_DRAIN;
1309
1310         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1311         /* Note: Only uses first TLB reg currently */
1312         if (val_iva)
1313                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1314         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1315
1316         /* Make sure hardware completes it */
1317         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1318                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1319
1320         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1321
1322         /* check IOTLB invalidation granularity */
1323         if (DMA_TLB_IAIG(val) == 0)
1324                 pr_err("Flush IOTLB failed\n");
1325         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1326                 pr_debug("TLB flush request %Lx, actual %Lx\n",
1327                         (unsigned long long)DMA_TLB_IIRG(type),
1328                         (unsigned long long)DMA_TLB_IAIG(val));
1329 }
1330
1331 static struct device_domain_info *
1332 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1333                          u8 bus, u8 devfn)
1334 {
1335         struct device_domain_info *info;
1336
1337         assert_spin_locked(&device_domain_lock);
1338
1339         if (!iommu->qi)
1340                 return NULL;
1341
1342         list_for_each_entry(info, &domain->devices, link)
1343                 if (info->iommu == iommu && info->bus == bus &&
1344                     info->devfn == devfn) {
1345                         if (info->ats_supported && info->dev)
1346                                 return info;
1347                         break;
1348                 }
1349
1350         return NULL;
1351 }
1352
1353 static void domain_update_iotlb(struct dmar_domain *domain)
1354 {
1355         struct device_domain_info *info;
1356         bool has_iotlb_device = false;
1357
1358         assert_spin_locked(&device_domain_lock);
1359
1360         list_for_each_entry(info, &domain->devices, link) {
1361                 struct pci_dev *pdev;
1362
1363                 if (!info->dev || !dev_is_pci(info->dev))
1364                         continue;
1365
1366                 pdev = to_pci_dev(info->dev);
1367                 if (pdev->ats_enabled) {
1368                         has_iotlb_device = true;
1369                         break;
1370                 }
1371         }
1372
1373         domain->has_iotlb_device = has_iotlb_device;
1374 }
1375
1376 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1377 {
1378         struct pci_dev *pdev;
1379
1380         assert_spin_locked(&device_domain_lock);
1381
1382         if (!info || !dev_is_pci(info->dev))
1383                 return;
1384
1385         pdev = to_pci_dev(info->dev);
1386         /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1387          * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1388          * queue depth at PF level. If DIT is not set, PFSID will be treated as
1389          * reserved, which should be set to 0.
1390          */
1391         if (!ecap_dit(info->iommu->ecap))
1392                 info->pfsid = 0;
1393         else {
1394                 struct pci_dev *pf_pdev;
1395
1396                 /* pdev will be returned if device is not a vf */
1397                 pf_pdev = pci_physfn(pdev);
1398                 info->pfsid = pci_dev_id(pf_pdev);
1399         }
1400
1401 #ifdef CONFIG_INTEL_IOMMU_SVM
1402         /* The PCIe spec, in its wisdom, declares that the behaviour of
1403            the device if you enable PASID support after ATS support is
1404            undefined. So always enable PASID support on devices which
1405            have it, even if we can't yet know if we're ever going to
1406            use it. */
1407         if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1408                 info->pasid_enabled = 1;
1409
1410         if (info->pri_supported &&
1411             (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1412             !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1413                 info->pri_enabled = 1;
1414 #endif
1415         if (!pdev->untrusted && info->ats_supported &&
1416             pci_ats_page_aligned(pdev) &&
1417             !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1418                 info->ats_enabled = 1;
1419                 domain_update_iotlb(info->domain);
1420                 info->ats_qdep = pci_ats_queue_depth(pdev);
1421         }
1422 }
1423
1424 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1425 {
1426         struct pci_dev *pdev;
1427
1428         assert_spin_locked(&device_domain_lock);
1429
1430         if (!dev_is_pci(info->dev))
1431                 return;
1432
1433         pdev = to_pci_dev(info->dev);
1434
1435         if (info->ats_enabled) {
1436                 pci_disable_ats(pdev);
1437                 info->ats_enabled = 0;
1438                 domain_update_iotlb(info->domain);
1439         }
1440 #ifdef CONFIG_INTEL_IOMMU_SVM
1441         if (info->pri_enabled) {
1442                 pci_disable_pri(pdev);
1443                 info->pri_enabled = 0;
1444         }
1445         if (info->pasid_enabled) {
1446                 pci_disable_pasid(pdev);
1447                 info->pasid_enabled = 0;
1448         }
1449 #endif
1450 }
1451
1452 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1453                                   u64 addr, unsigned mask)
1454 {
1455         u16 sid, qdep;
1456         unsigned long flags;
1457         struct device_domain_info *info;
1458
1459         if (!domain->has_iotlb_device)
1460                 return;
1461
1462         spin_lock_irqsave(&device_domain_lock, flags);
1463         list_for_each_entry(info, &domain->devices, link) {
1464                 if (!info->ats_enabled)
1465                         continue;
1466
1467                 sid = info->bus << 8 | info->devfn;
1468                 qdep = info->ats_qdep;
1469                 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1470                                 qdep, addr, mask);
1471         }
1472         spin_unlock_irqrestore(&device_domain_lock, flags);
1473 }
1474
1475 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1476                                   struct dmar_domain *domain,
1477                                   unsigned long pfn, unsigned int pages,
1478                                   int ih, int map)
1479 {
1480         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1481         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1482         u16 did = domain->iommu_did[iommu->seq_id];
1483
1484         BUG_ON(pages == 0);
1485
1486         if (ih)
1487                 ih = 1 << 6;
1488         /*
1489          * Fall back to domain-selective flush if there is no PSI support or the
1490          * size is too big.
1491          * PSI requires the number of pages to be a power of two, and the base
1492          * address to be naturally aligned to the size.
1493          */
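        /*
         * e.g. pages == 9 rounds up to 16, giving mask == 4, i.e. a
         * naturally aligned 16-page (64KiB) range is invalidated.
         */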
1494         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1495                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1496                                                 DMA_TLB_DSI_FLUSH);
1497         else
1498                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1499                                                 DMA_TLB_PSI_FLUSH);
1500
1501         /*
1502          * In caching mode, changes of pages from non-present to present require
1503          * flush. However, device IOTLB doesn't need to be flushed in this case.
1504          */
1505         if (!cap_caching_mode(iommu->cap) || !map)
1506                 iommu_flush_dev_iotlb(domain, addr, mask);
1507 }
1508
1509 /* Notification for newly created mappings */
1510 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1511                                         struct dmar_domain *domain,
1512                                         unsigned long pfn, unsigned int pages)
1513 {
1514         /* It's a non-present to present mapping. Only flush if caching mode */
1515         if (cap_caching_mode(iommu->cap))
1516                 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1517         else
1518                 iommu_flush_write_buffer(iommu);
1519 }
1520
1521 static void iommu_flush_iova(struct iova_domain *iovad)
1522 {
1523         struct dmar_domain *domain;
1524         int idx;
1525
1526         domain = container_of(iovad, struct dmar_domain, iovad);
1527
1528         for_each_domain_iommu(idx, domain) {
1529                 struct intel_iommu *iommu = g_iommus[idx];
1530                 u16 did = domain->iommu_did[iommu->seq_id];
1531
1532                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1533
1534                 if (!cap_caching_mode(iommu->cap))
1535                         iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1536                                               0, MAX_AGAW_PFN_WIDTH);
1537         }
1538 }
1539
1540 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1541 {
1542         u32 pmen;
1543         unsigned long flags;
1544
1545         if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1546                 return;
1547
1548         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1549         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1550         pmen &= ~DMA_PMEN_EPM;
1551         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1552
1553         /* wait for the protected region status bit to clear */
1554         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1555                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1556
1557         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1558 }
1559
1560 static void iommu_enable_translation(struct intel_iommu *iommu)
1561 {
1562         u32 sts;
1563         unsigned long flags;
1564
1565         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1566         iommu->gcmd |= DMA_GCMD_TE;
1567         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1568
1569         /* Make sure the hardware completes it */
1570         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571                       readl, (sts & DMA_GSTS_TES), sts);
1572
1573         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1574 }
1575
1576 static void iommu_disable_translation(struct intel_iommu *iommu)
1577 {
1578         u32 sts;
1579         unsigned long flag;
1580
1581         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1582         iommu->gcmd &= ~DMA_GCMD_TE;
1583         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1584
1585         /* Make sure the hardware completes it */
1586         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1587                       readl, (!(sts & DMA_GSTS_TES)), sts);
1588
1589         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1590 }
1591
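/*
 * Allocate the per-IOMMU domain-id bitmap and the two-level domains
 * array (chunks of 256 domain pointers, only the first chunk allocated
 * up front), and reserve the domain ids that must never be handed out
 * to ordinary domains.
 */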
1592 static int iommu_init_domains(struct intel_iommu *iommu)
1593 {
1594         u32 ndomains, nlongs;
1595         size_t size;
1596
1597         ndomains = cap_ndoms(iommu->cap);
1598         pr_debug("%s: Number of Domains supported <%d>\n",
1599                  iommu->name, ndomains);
1600         nlongs = BITS_TO_LONGS(ndomains);
1601
1602         spin_lock_init(&iommu->lock);
1603
1604         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1605         if (!iommu->domain_ids) {
1606                 pr_err("%s: Allocating domain id array failed\n",
1607                        iommu->name);
1608                 return -ENOMEM;
1609         }
1610
1611         size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1612         iommu->domains = kzalloc(size, GFP_KERNEL);
1613
1614         if (iommu->domains) {
1615                 size = 256 * sizeof(struct dmar_domain *);
1616                 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1617         }
1618
1619         if (!iommu->domains || !iommu->domains[0]) {
1620                 pr_err("%s: Allocating domain array failed\n",
1621                        iommu->name);
1622                 kfree(iommu->domain_ids);
1623                 kfree(iommu->domains);
1624                 iommu->domain_ids = NULL;
1625                 iommu->domains    = NULL;
1626                 return -ENOMEM;
1627         }
1628
1629         /*
1630          * If Caching mode is set, then invalid translations are tagged
1631          * with domain-id 0, hence we need to pre-allocate it. We also
1632          * use domain-id 0 as a marker for non-allocated domain-id, so
1633          * make sure it is not used for a real domain.
1634          */
1635         set_bit(0, iommu->domain_ids);
1636
1637         /*
1638          * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1639          * entry for first-level or pass-through translation modes be
1640          * programmed with a domain id different from those used for
1641          * second-level or nested translation. We reserve a domain id for
1642          * this purpose.
1643          */
1644         if (sm_supported(iommu))
1645                 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1646
1647         return 0;
1648 }
1649
1650 static void disable_dmar_iommu(struct intel_iommu *iommu)
1651 {
1652         struct device_domain_info *info, *tmp;
1653         unsigned long flags;
1654
1655         if (!iommu->domains || !iommu->domain_ids)
1656                 return;
1657
1658         spin_lock_irqsave(&device_domain_lock, flags);
1659         list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1660                 if (info->iommu != iommu)
1661                         continue;
1662
1663                 if (!info->dev || !info->domain)
1664                         continue;
1665
1666                 __dmar_remove_one_dev_info(info);
1667         }
1668         spin_unlock_irqrestore(&device_domain_lock, flags);
1669
1670         if (iommu->gcmd & DMA_GCMD_TE)
1671                 iommu_disable_translation(iommu);
1672 }
1673
1674 static void free_dmar_iommu(struct intel_iommu *iommu)
1675 {
1676         if ((iommu->domains) && (iommu->domain_ids)) {
1677                 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1678                 int i;
1679
1680                 for (i = 0; i < elems; i++)
1681                         kfree(iommu->domains[i]);
1682                 kfree(iommu->domains);
1683                 kfree(iommu->domain_ids);
1684                 iommu->domains = NULL;
1685                 iommu->domain_ids = NULL;
1686         }
1687
1688         g_iommus[iommu->seq_id] = NULL;
1689
1690         /* free context mapping */
1691         free_context_table(iommu);
1692
1693 #ifdef CONFIG_INTEL_IOMMU_SVM
1694         if (pasid_supported(iommu)) {
1695                 if (ecap_prs(iommu->ecap))
1696                         intel_svm_finish_prq(iommu);
1697         }
1698 #endif
1699 }
1700
1701 static struct dmar_domain *alloc_domain(int flags)
1702 {
1703         struct dmar_domain *domain;
1704
1705         domain = alloc_domain_mem();
1706         if (!domain)
1707                 return NULL;
1708
1709         memset(domain, 0, sizeof(*domain));
1710         domain->nid = NUMA_NO_NODE;
1711         domain->flags = flags;
1712         domain->has_iotlb_device = false;
1713         INIT_LIST_HEAD(&domain->devices);
1714
1715         return domain;
1716 }
1717
1718 /* Must be called with device_domain_lock and iommu->lock held */
1719 static int domain_attach_iommu(struct dmar_domain *domain,
1720                                struct intel_iommu *iommu)
1721 {
1722         unsigned long ndomains;
1723         int num;
1724
1725         assert_spin_locked(&device_domain_lock);
1726         assert_spin_locked(&iommu->lock);
1727
1728         domain->iommu_refcnt[iommu->seq_id] += 1;
1729         domain->iommu_count += 1;
1730         if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1731                 ndomains = cap_ndoms(iommu->cap);
1732                 num      = find_first_zero_bit(iommu->domain_ids, ndomains);
1733
1734                 if (num >= ndomains) {
1735                         pr_err("%s: No free domain ids\n", iommu->name);
1736                         domain->iommu_refcnt[iommu->seq_id] -= 1;
1737                         domain->iommu_count -= 1;
1738                         return -ENOSPC;
1739                 }
1740
1741                 set_bit(num, iommu->domain_ids);
1742                 set_iommu_domain(iommu, num, domain);
1743
1744                 domain->iommu_did[iommu->seq_id] = num;
1745                 domain->nid                      = iommu->node;
1746
1747                 domain_update_iommu_cap(domain);
1748         }
1749
1750         return 0;
1751 }
1752
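/*
 * Must be called with device_domain_lock and iommu->lock held. Drops
 * one reference of @domain on @iommu, releasing the domain id once the
 * last reference on that IOMMU is gone; returns the remaining total
 * reference count.
 */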
1753 static int domain_detach_iommu(struct dmar_domain *domain,
1754                                struct intel_iommu *iommu)
1755 {
1756         int num, count;
1757
1758         assert_spin_locked(&device_domain_lock);
1759         assert_spin_locked(&iommu->lock);
1760
1761         domain->iommu_refcnt[iommu->seq_id] -= 1;
1762         count = --domain->iommu_count;
1763         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1764                 num = domain->iommu_did[iommu->seq_id];
1765                 clear_bit(num, iommu->domain_ids);
1766                 set_iommu_domain(iommu, num, NULL);
1767
1768                 domain_update_iommu_cap(domain);
1769                 domain->iommu_did[iommu->seq_id] = 0;
1770         }
1771
1772         return count;
1773 }
1774
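/*
 * IOVA ranges that must never be handed out to any domain: the IOAPIC
 * MMIO window and all PCI MMIO resources. They are collected once at
 * init time and copied into each new domain's IOVA allocator by
 * domain_reserve_special_ranges().
 */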
1775 static struct iova_domain reserved_iova_list;
1776 static struct lock_class_key reserved_rbtree_key;
1777
1778 static int dmar_init_reserved_ranges(void)
1779 {
1780         struct pci_dev *pdev = NULL;
1781         struct iova *iova;
1782         int i;
1783
1784         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1785
1786         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1787                 &reserved_rbtree_key);
1788
1789         /* IOAPIC ranges shouldn't be accessed by DMA */
1790         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1791                 IOVA_PFN(IOAPIC_RANGE_END));
1792         if (!iova) {
1793                 pr_err("Reserve IOAPIC range failed\n");
1794                 return -ENODEV;
1795         }
1796
1797         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1798         for_each_pci_dev(pdev) {
1799                 struct resource *r;
1800
1801                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1802                         r = &pdev->resource[i];
1803                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1804                                 continue;
1805                         iova = reserve_iova(&reserved_iova_list,
1806                                             IOVA_PFN(r->start),
1807                                             IOVA_PFN(r->end));
1808                         if (!iova) {
1809                                 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1810                                 return -ENODEV;
1811                         }
1812                 }
1813         }
1814         return 0;
1815 }
1816
1817 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1818 {
1819         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1820 }
1821
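/*
 * Round a guest address width up to the nearest width that corresponds
 * to a whole number of page-table levels (12 + 9 * n bits), capped at
 * 64. For example, a 40-bit guest width is adjusted to 48 bits, i.e. a
 * four-level table.
 */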
1822 static inline int guestwidth_to_adjustwidth(int gaw)
1823 {
1824         int agaw;
1825         int r = (gaw - 12) % 9;
1826
1827         if (r == 0)
1828                 agaw = gaw;
1829         else
1830                 agaw = gaw + 9 - r;
1831         if (agaw > 64)
1832                 agaw = 64;
1833         return agaw;
1834 }
1835
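/*
 * Initialize a DMA domain for @iommu: set up its IOVA allocator and
 * deferred flush queue, reserve the special IOVA ranges, derive an
 * adjusted guest address width (AGAW) the hardware supports, and
 * allocate the top-level page directory.
 */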
1836 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1837                        int guest_width)
1838 {
1839         int adjust_width, agaw;
1840         unsigned long sagaw;
1841         int err;
1842
1843         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1844
1845         err = init_iova_flush_queue(&domain->iovad,
1846                                     iommu_flush_iova, iova_entry_free);
1847         if (err)
1848                 return err;
1849
1850         domain_reserve_special_ranges(domain);
1851
1852         /* calculate AGAW */
1853         if (guest_width > cap_mgaw(iommu->cap))
1854                 guest_width = cap_mgaw(iommu->cap);
1855         domain->gaw = guest_width;
1856         adjust_width = guestwidth_to_adjustwidth(guest_width);
1857         agaw = width_to_agaw(adjust_width);
1858         sagaw = cap_sagaw(iommu->cap);
1859         if (!test_bit(agaw, &sagaw)) {
1860                 /* hardware doesn't support it, choose a bigger one */
1861                 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1862                 agaw = find_next_bit(&sagaw, 5, agaw);
1863                 if (agaw >= 5)
1864                         return -ENODEV;
1865         }
1866         domain->agaw = agaw;
1867
1868         if (ecap_coherent(iommu->ecap))
1869                 domain->iommu_coherency = 1;
1870         else
1871                 domain->iommu_coherency = 0;
1872
1873         if (ecap_sc_support(iommu->ecap))
1874                 domain->iommu_snooping = 1;
1875         else
1876                 domain->iommu_snooping = 0;
1877
1878         if (intel_iommu_superpage)
1879                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1880         else
1881                 domain->iommu_superpage = 0;
1882
1883         domain->nid = iommu->node;
1884
1885         /* always allocate the top pgd */
1886         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1887         if (!domain->pgd)
1888                 return -ENOMEM;
1889         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1890         return 0;
1891 }
1892
1893 static void domain_exit(struct dmar_domain *domain)
1894 {
1895         struct page *freelist;
1896
1897         /* Remove associated devices and clear attached or cached domains */
1898         domain_remove_dev_info(domain);
1899
1900         /* destroy iovas */
1901         put_iova_domain(&domain->iovad);
1902
1903         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1904
1905         dma_free_pagelist(freelist);
1906
1907         free_domain_mem(domain);
1908 }
1909
1910 /*
1911  * Get the PASID directory size for a scalable mode context entry.
1912  * A value of X in the PDTS field of a scalable mode context entry
1913  * indicates a PASID directory with 2^(X + 7) entries.
1914  */
1915 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1916 {
1917         int pds, max_pde;
1918
1919         max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1920         pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
1921         if (pds < 7)
1922                 return 0;
1923
1924         return pds - 7;
1925 }
1926
1927 /*
1928  * Set the RID_PASID field of a scalable mode context entry. The
1929  * IOMMU hardware will use the PASID value set in this field when
1930  * translating DMA requests that carry no PASID.
1931  */
1932 static inline void
1933 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1934 {
1935         context->hi |= pasid & ((1 << 20) - 1);
1936         context->hi |= (1 << 20);
1937 }
1938
1939 /*
1940  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1941  * entry.
1942  */
1943 static inline void context_set_sm_dte(struct context_entry *context)
1944 {
1945         context->lo |= (1 << 2);
1946 }
1947
1948 /*
1949  * Set the PRE(Page Request Enable) field of a scalable mode context
1950  * entry.
1951  */
1952 static inline void context_set_sm_pre(struct context_entry *context)
1953 {
1954         context->lo |= (1 << 4);
1955 }
1956
1957 /* Convert value to context PASID directory size field coding. */
1958 #define context_pdts(pds)       (((pds) & 0x7) << 9)
1959
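/*
 * Program the context entry for (@bus, @devfn) on @iommu so that it
 * points at @domain's translation structures: the PASID directory in
 * scalable mode, or the second-level page tables (or pass-through) in
 * legacy mode. The context and IOTLB caches are flushed as required.
 */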
1960 static int domain_context_mapping_one(struct dmar_domain *domain,
1961                                       struct intel_iommu *iommu,
1962                                       struct pasid_table *table,
1963                                       u8 bus, u8 devfn)
1964 {
1965         u16 did = domain->iommu_did[iommu->seq_id];
1966         int translation = CONTEXT_TT_MULTI_LEVEL;
1967         struct device_domain_info *info = NULL;
1968         struct context_entry *context;
1969         unsigned long flags;
1970         int ret;
1971
1972         WARN_ON(did == 0);
1973
1974         if (hw_pass_through && domain_type_is_si(domain))
1975                 translation = CONTEXT_TT_PASS_THROUGH;
1976
1977         pr_debug("Set context mapping for %02x:%02x.%d\n",
1978                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1979
1980         BUG_ON(!domain->pgd);
1981
1982         spin_lock_irqsave(&device_domain_lock, flags);
1983         spin_lock(&iommu->lock);
1984
1985         ret = -ENOMEM;
1986         context = iommu_context_addr(iommu, bus, devfn, 1);
1987         if (!context)
1988                 goto out_unlock;
1989
1990         ret = 0;
1991         if (context_present(context))
1992                 goto out_unlock;
1993
1994         /*
1995          * For kdump cases, old valid entries may be cached due to the
1996          * in-flight DMA and copied pgtable, but there is no unmapping
1997          * behaviour for them, thus we need an explicit cache flush for
1998          * the newly-mapped device. At this point the device is supposed
1999          * to have finished its reset during driver probe, so no in-flight
2000          * DMA will exist, and we don't need to worry about it hereafter.
2002          */
2003         if (context_copied(context)) {
2004                 u16 did_old = context_domain_id(context);
2005
2006                 if (did_old < cap_ndoms(iommu->cap)) {
2007                         iommu->flush.flush_context(iommu, did_old,
2008                                                    (((u16)bus) << 8) | devfn,
2009                                                    DMA_CCMD_MASK_NOBIT,
2010                                                    DMA_CCMD_DEVICE_INVL);
2011                         iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2012                                                  DMA_TLB_DSI_FLUSH);
2013                 }
2014         }
2015
2016         context_clear_entry(context);
2017
2018         if (sm_supported(iommu)) {
2019                 unsigned long pds;
2020
2021                 WARN_ON(!table);
2022
2023                 /* Setup the PASID DIR pointer: */
2024                 pds = context_get_sm_pds(table);
2025                 context->lo = (u64)virt_to_phys(table->table) |
2026                                 context_pdts(pds);
2027
2028                 /* Setup the RID_PASID field: */
2029                 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2030
2031                 /*
2032                  * Setup the Device-TLB enable bit and Page request
2033                  * Enable bit:
2034                  */
2035                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2036                 if (info && info->ats_supported)
2037                         context_set_sm_dte(context);
2038                 if (info && info->pri_supported)
2039                         context_set_sm_pre(context);
2040         } else {
2041                 struct dma_pte *pgd = domain->pgd;
2042                 int agaw;
2043
2044                 context_set_domain_id(context, did);
2045
2046                 if (translation != CONTEXT_TT_PASS_THROUGH) {
2047                         /*
2048                          * Skip the top levels of the page tables for an iommu
2049                          * with a smaller agaw than the default. Unnecessary for PT mode.
2050                          */
2051                         for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2052                                 ret = -ENOMEM;
2053                                 pgd = phys_to_virt(dma_pte_addr(pgd));
2054                                 if (!dma_pte_present(pgd))
2055                                         goto out_unlock;
2056                         }
2057
2058                         info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2059                         if (info && info->ats_supported)
2060                                 translation = CONTEXT_TT_DEV_IOTLB;
2061                         else
2062                                 translation = CONTEXT_TT_MULTI_LEVEL;
2063
2064                         context_set_address_root(context, virt_to_phys(pgd));
2065                         context_set_address_width(context, agaw);
2066                 } else {
2067                         /*
2068                          * In pass-through mode, AW must be programmed to
2069                          * indicate the largest AGAW value supported by the
2070                          * hardware, and ASR is ignored by the hardware.
2071                          */
2072                         context_set_address_width(context, iommu->msagaw);
2073                 }
2074
2075                 context_set_translation_type(context, translation);
2076         }
2077
2078         context_set_fault_enable(context);
2079         context_set_present(context);
2080         domain_flush_cache(domain, context, sizeof(*context));
2081
2082         /*
2083          * It's a non-present to present mapping. If the hardware doesn't cache
2084          * non-present entries we only need to flush the write-buffer. If it
2085          * _does_ cache non-present entries, then it does so in the special
2086          * domain #0, which we have to flush:
2087          */
2088         if (cap_caching_mode(iommu->cap)) {
2089                 iommu->flush.flush_context(iommu, 0,
2090                                            (((u16)bus) << 8) | devfn,
2091                                            DMA_CCMD_MASK_NOBIT,
2092                                            DMA_CCMD_DEVICE_INVL);
2093                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2094         } else {
2095                 iommu_flush_write_buffer(iommu);
2096         }
2097         iommu_enable_dev_iotlb(info);
2098
2099         ret = 0;
2100
2101 out_unlock:
2102         spin_unlock(&iommu->lock);
2103         spin_unlock_irqrestore(&device_domain_lock, flags);
2104
2105         return ret;
2106 }
2107
2108 struct domain_context_mapping_data {
2109         struct dmar_domain *domain;
2110         struct intel_iommu *iommu;
2111         struct pasid_table *table;
2112 };
2113
2114 static int domain_context_mapping_cb(struct pci_dev *pdev,
2115                                      u16 alias, void *opaque)
2116 {
2117         struct domain_context_mapping_data *data = opaque;
2118
2119         return domain_context_mapping_one(data->domain, data->iommu,
2120                                           data->table, PCI_BUS_NUM(alias),
2121                                           alias & 0xff);
2122 }
2123
2124 static int
2125 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2126 {
2127         struct domain_context_mapping_data data;
2128         struct pasid_table *table;
2129         struct intel_iommu *iommu;
2130         u8 bus, devfn;
2131
2132         iommu = device_to_iommu(dev, &bus, &devfn);
2133         if (!iommu)
2134                 return -ENODEV;
2135
2136         table = intel_pasid_get_table(dev);
2137
2138         if (!dev_is_pci(dev))
2139                 return domain_context_mapping_one(domain, iommu, table,
2140                                                   bus, devfn);
2141
2142         data.domain = domain;
2143         data.iommu = iommu;
2144         data.table = table;
2145
2146         return pci_for_each_dma_alias(to_pci_dev(dev),
2147                                       &domain_context_mapping_cb, &data);
2148 }
2149
2150 static int domain_context_mapped_cb(struct pci_dev *pdev,
2151                                     u16 alias, void *opaque)
2152 {
2153         struct intel_iommu *iommu = opaque;
2154
2155         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2156 }
2157
2158 static int domain_context_mapped(struct device *dev)
2159 {
2160         struct intel_iommu *iommu;
2161         u8 bus, devfn;
2162
2163         iommu = device_to_iommu(dev, &bus, &devfn);
2164         if (!iommu)
2165                 return -ENODEV;
2166
2167         if (!dev_is_pci(dev))
2168                 return device_context_mapped(iommu, bus, devfn);
2169
2170         return !pci_for_each_dma_alias(to_pci_dev(dev),
2171                                        domain_context_mapped_cb, iommu);
2172 }
2173
2174 /* Returns the number of VT-d pages, but aligned to the MM page size */
2175 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2176                                             size_t size)
2177 {
2178         host_addr &= ~PAGE_MASK;
2179         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2180 }
2181
2182 /* Return largest possible superpage level for a given mapping */
2183 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2184                                           unsigned long iov_pfn,
2185                                           unsigned long phy_pfn,
2186                                           unsigned long pages)
2187 {
2188         int support, level = 1;
2189         unsigned long pfnmerge;
2190
2191         support = domain->iommu_superpage;
2192
2193         /* To use a large page, the virtual *and* physical addresses
2194            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2195            of them will mean we have to use smaller pages. So just
2196            merge them and check both at once. */
2197         pfnmerge = iov_pfn | phy_pfn;
2198
2199         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2200                 pages >>= VTD_STRIDE_SHIFT;
2201                 if (!pages)
2202                         break;
2203                 pfnmerge >>= VTD_STRIDE_SHIFT;
2204                 level++;
2205                 support--;
2206         }
2207         return level;
2208 }
2209
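/*
 * Fill in the page-table entries for @nr_pages starting at @iov_pfn,
 * taking the physical pages either from @sg or from the contiguous
 * range starting at @phys_pfn. Superpages are used where alignment and
 * hardware support allow, and cache flushes are batched per PTE page.
 */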
2210 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2211                             struct scatterlist *sg, unsigned long phys_pfn,
2212                             unsigned long nr_pages, int prot)
2213 {
2214         struct dma_pte *first_pte = NULL, *pte = NULL;
2215         phys_addr_t uninitialized_var(pteval);
2216         unsigned long sg_res = 0;
2217         unsigned int largepage_lvl = 0;
2218         unsigned long lvl_pages = 0;
2219
2220         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2221
2222         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2223                 return -EINVAL;
2224
2225         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2226
2227         if (!sg) {
2228                 sg_res = nr_pages;
2229                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2230         }
2231
2232         while (nr_pages > 0) {
2233                 uint64_t tmp;
2234
2235                 if (!sg_res) {
2236                         unsigned int pgoff = sg->offset & ~PAGE_MASK;
2237
2238                         sg_res = aligned_nrpages(sg->offset, sg->length);
2239                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2240                         sg->dma_length = sg->length;
2241                         pteval = (sg_phys(sg) - pgoff) | prot;
2242                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2243                 }
2244
2245                 if (!pte) {
2246                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2247
2248                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2249                         if (!pte)
2250                                 return -ENOMEM;
2251                         /* It is a large page */
2252                         if (largepage_lvl > 1) {
2253                                 unsigned long nr_superpages, end_pfn;
2254
2255                                 pteval |= DMA_PTE_LARGE_PAGE;
2256                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2257
2258                                 nr_superpages = sg_res / lvl_pages;
2259                                 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2260
2261                                 /*
2262                                  * Ensure that old small page tables are
2263                                  * removed to make room for superpage(s).
2264                                  * We're adding new large pages, so make sure
2265                                  * we don't remove their parent tables.
2266                                  */
2267                                 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2268                                                        largepage_lvl + 1);
2269                         } else {
2270                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2271                         }
2272
2273                 }
2274                 /* We don't need a lock here; nobody else
2275                  * touches this iova range.
2276                  */
2277                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2278                 if (tmp) {
2279                         static int dumps = 5;
2280                         pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2281                                 iov_pfn, tmp, (unsigned long long)pteval);
2282                         if (dumps) {
2283                                 dumps--;
2284                                 debug_dma_dump_mappings(NULL);
2285                         }
2286                         WARN_ON(1);
2287                 }
2288
2289                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2290
2291                 BUG_ON(nr_pages < lvl_pages);
2292                 BUG_ON(sg_res < lvl_pages);
2293
2294                 nr_pages -= lvl_pages;
2295                 iov_pfn += lvl_pages;
2296                 phys_pfn += lvl_pages;
2297                 pteval += lvl_pages * VTD_PAGE_SIZE;
2298                 sg_res -= lvl_pages;
2299
2300                 /* If the next PTE would be the first in a new page, then we
2301                    need to flush the cache on the entries we've just written.
2302                    And then we'll need to recalculate 'pte', so clear it and
2303                    let it get set again in the if (!pte) block above.
2304
2305                    If we're done (!nr_pages) we need to flush the cache too.
2306
2307                    Also if we've been setting superpages, we may need to
2308                    recalculate 'pte' and switch back to smaller pages for the
2309                    end of the mapping, if the trailing size is not enough to
2310                    use another superpage (i.e. sg_res < lvl_pages). */
2311                 pte++;
2312                 if (!nr_pages || first_pte_in_page(pte) ||
2313                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2314                         domain_flush_cache(domain, first_pte,
2315                                            (void *)pte - (void *)first_pte);
2316                         pte = NULL;
2317                 }
2318
2319                 if (!sg_res && nr_pages)
2320                         sg = sg_next(sg);
2321         }
2322         return 0;
2323 }
2324
2325 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2326                           struct scatterlist *sg, unsigned long phys_pfn,
2327                           unsigned long nr_pages, int prot)
2328 {
2329         int iommu_id, ret;
2330         struct intel_iommu *iommu;
2331
2332         /* Do the real mapping first */
2333         ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2334         if (ret)
2335                 return ret;
2336
2337         for_each_domain_iommu(iommu_id, domain) {
2338                 iommu = g_iommus[iommu_id];
2339                 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2340         }
2341
2342         return 0;
2343 }
2344
2345 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2346                                     struct scatterlist *sg, unsigned long nr_pages,
2347                                     int prot)
2348 {
2349         return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2350 }
2351
2352 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2353                                      unsigned long phys_pfn, unsigned long nr_pages,
2354                                      int prot)
2355 {
2356         return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2357 }
2358
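/*
 * Tear down the context entry for (@bus, @devfn) and invalidate the
 * context-cache and IOTLB entries tagged with its old domain id.
 */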
2359 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2360 {
2361         unsigned long flags;
2362         struct context_entry *context;
2363         u16 did_old;
2364
2365         if (!iommu)
2366                 return;
2367
2368         spin_lock_irqsave(&iommu->lock, flags);
2369         context = iommu_context_addr(iommu, bus, devfn, 0);
2370         if (!context) {
2371                 spin_unlock_irqrestore(&iommu->lock, flags);
2372                 return;
2373         }
2374         did_old = context_domain_id(context);
2375         context_clear_entry(context);
2376         __iommu_flush_cache(iommu, context, sizeof(*context));
2377         spin_unlock_irqrestore(&iommu->lock, flags);
2378         iommu->flush.flush_context(iommu,
2379                                    did_old,
2380                                    (((u16)bus) << 8) | devfn,
2381                                    DMA_CCMD_MASK_NOBIT,
2382                                    DMA_CCMD_DEVICE_INVL);
2383         iommu->flush.flush_iotlb(iommu,
2384                                  did_old,
2385                                  0,
2386                                  0,
2387                                  DMA_TLB_DSI_FLUSH);
2388 }
2389
2390 static inline void unlink_domain_info(struct device_domain_info *info)
2391 {
2392         assert_spin_locked(&device_domain_lock);
2393         list_del(&info->link);
2394         list_del(&info->global);
2395         if (info->dev)
2396                 info->dev->archdata.iommu = NULL;
2397 }
2398
2399 static void domain_remove_dev_info(struct dmar_domain *domain)
2400 {
2401         struct device_domain_info *info, *tmp;
2402         unsigned long flags;
2403
2404         spin_lock_irqsave(&device_domain_lock, flags);
2405         list_for_each_entry_safe(info, tmp, &domain->devices, link)
2406                 __dmar_remove_one_dev_info(info);
2407         spin_unlock_irqrestore(&device_domain_lock, flags);
2408 }
2409
2410 /*
2411  * find_domain
2412  * Note: we use struct device->archdata.iommu to store the info
2413  */
2414 static struct dmar_domain *find_domain(struct device *dev)
2415 {
2416         struct device_domain_info *info;
2417
2418         if (unlikely(dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO)) {
2419                 struct iommu_domain *domain;
2420
2421                 dev->archdata.iommu = NULL;
2422                 domain = iommu_get_domain_for_dev(dev);
2423                 if (domain)
2424                         intel_iommu_attach_device(domain, dev);
2425         }
2426
2427         /* No lock here; we assume no domain exits in the normal case */
2428         info = dev->archdata.iommu;
2429
2430         if (likely(info))
2431                 return info->domain;
2432         return NULL;
2433 }
2434
2435 static inline struct device_domain_info *
2436 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2437 {
2438         struct device_domain_info *info;
2439
2440         list_for_each_entry(info, &device_domain_list, global)
2441                 if (info->iommu->segment == segment && info->bus == bus &&
2442                     info->devfn == devfn)
2443                         return info;
2444
2445         return NULL;
2446 }
2447
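/*
 * Bind a device (or one of its DMA aliases) to @domain: allocate and
 * fill a device_domain_info, attach the domain to @iommu, set up the
 * PASID table and RID2PASID entry in scalable mode, and install the
 * context mapping. Returns the domain actually in use, which may be a
 * pre-existing one found for the device or its alias.
 */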
2448 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2449                                                     int bus, int devfn,
2450                                                     struct device *dev,
2451                                                     struct dmar_domain *domain)
2452 {
2453         struct dmar_domain *found = NULL;
2454         struct device_domain_info *info;
2455         unsigned long flags;
2456         int ret;
2457
2458         info = alloc_devinfo_mem();
2459         if (!info)
2460                 return NULL;
2461
2462         info->bus = bus;
2463         info->devfn = devfn;
2464         info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2465         info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2466         info->ats_qdep = 0;
2467         info->dev = dev;
2468         info->domain = domain;
2469         info->iommu = iommu;
2470         info->pasid_table = NULL;
2471         info->auxd_enabled = 0;
2472         INIT_LIST_HEAD(&info->auxiliary_domains);
2473
2474         if (dev && dev_is_pci(dev)) {
2475                 struct pci_dev *pdev = to_pci_dev(info->dev);
2476
2477                 if (!pdev->untrusted &&
2478                     !pci_ats_disabled() &&
2479                     ecap_dev_iotlb_support(iommu->ecap) &&
2480                     pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2481                     dmar_find_matched_atsr_unit(pdev))
2482                         info->ats_supported = 1;
2483
2484                 if (sm_supported(iommu)) {
2485                         if (pasid_supported(iommu)) {
2486                                 int features = pci_pasid_features(pdev);
2487                                 if (features >= 0)
2488                                         info->pasid_supported = features | 1;
2489                         }
2490
2491                         if (info->ats_supported && ecap_prs(iommu->ecap) &&
2492                             pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2493                                 info->pri_supported = 1;
2494                 }
2495         }
2496
2497         spin_lock_irqsave(&device_domain_lock, flags);
2498         if (dev)
2499                 found = find_domain(dev);
2500
2501         if (!found) {
2502                 struct device_domain_info *info2;
2503                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2504                 if (info2) {
2505                         found      = info2->domain;
2506                         info2->dev = dev;
2507                 }
2508         }
2509
2510         if (found) {
2511                 spin_unlock_irqrestore(&device_domain_lock, flags);
2512                 free_devinfo_mem(info);
2513                 /* Caller must free the original domain */
2514                 return found;
2515         }
2516
2517         spin_lock(&iommu->lock);
2518         ret = domain_attach_iommu(domain, iommu);
2519         spin_unlock(&iommu->lock);
2520
2521         if (ret) {
2522                 spin_unlock_irqrestore(&device_domain_lock, flags);
2523                 free_devinfo_mem(info);
2524                 return NULL;
2525         }
2526
2527         list_add(&info->link, &domain->devices);
2528         list_add(&info->global, &device_domain_list);
2529         if (dev)
2530                 dev->archdata.iommu = info;
2531         spin_unlock_irqrestore(&device_domain_lock, flags);
2532
2533         /* PASID table is mandatory for a PCI device in scalable mode. */
2534         if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2535                 ret = intel_pasid_alloc_table(dev);
2536                 if (ret) {
2537                         dev_err(dev, "PASID table allocation failed\n");
2538                         dmar_remove_one_dev_info(dev);
2539                         return NULL;
2540                 }
2541
2542                 /* Setup the PASID entry for requests without PASID: */
2543                 spin_lock(&iommu->lock);
2544                 if (hw_pass_through && domain_type_is_si(domain))
2545                         ret = intel_pasid_setup_pass_through(iommu, domain,
2546                                         dev, PASID_RID2PASID);
2547                 else
2548                         ret = intel_pasid_setup_second_level(iommu, domain,
2549                                         dev, PASID_RID2PASID);
2550                 spin_unlock(&iommu->lock);
2551                 if (ret) {
2552                         dev_err(dev, "Setup RID2PASID failed\n");
2553                         dmar_remove_one_dev_info(dev);
2554                         return NULL;
2555                 }
2556         }
2557
2558         if (dev && domain_context_mapping(domain, dev)) {
2559                 dev_err(dev, "Domain context map failed\n");
2560                 dmar_remove_one_dev_info(dev);
2561                 return NULL;
2562         }
2563
2564         return domain;
2565 }
2566
2567 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2568 {
2569         *(u16 *)opaque = alias;
2570         return 0;
2571 }
2572
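/*
 * Find the domain already used by the device's DMA alias, if any;
 * otherwise allocate and initialize a new domain with @gaw bits of
 * guest address width.
 */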
2573 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2574 {
2575         struct device_domain_info *info;
2576         struct dmar_domain *domain = NULL;
2577         struct intel_iommu *iommu;
2578         u16 dma_alias;
2579         unsigned long flags;
2580         u8 bus, devfn;
2581
2582         iommu = device_to_iommu(dev, &bus, &devfn);
2583         if (!iommu)
2584                 return NULL;
2585
2586         if (dev_is_pci(dev)) {
2587                 struct pci_dev *pdev = to_pci_dev(dev);
2588
2589                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2590
2591                 spin_lock_irqsave(&device_domain_lock, flags);
2592                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2593                                                       PCI_BUS_NUM(dma_alias),
2594                                                       dma_alias & 0xff);
2595                 if (info) {
2596                         iommu = info->iommu;
2597                         domain = info->domain;
2598                 }
2599                 spin_unlock_irqrestore(&device_domain_lock, flags);
2600
2601                 /* DMA alias already has a domain, use it */
2602                 if (info)
2603                         goto out;
2604         }
2605
2606         /* Allocate and initialize new domain for the device */
2607         domain = alloc_domain(0);
2608         if (!domain)
2609                 return NULL;
2610         if (domain_init(domain, iommu, gaw)) {
2611                 domain_exit(domain);
2612                 return NULL;
2613         }
2614
2615 out:
2616         return domain;
2617 }
2618
2619 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2620                                               struct dmar_domain *domain)
2621 {
2622         struct intel_iommu *iommu;
2623         struct dmar_domain *tmp;
2624         u16 req_id, dma_alias;
2625         u8 bus, devfn;
2626
2627         iommu = device_to_iommu(dev, &bus, &devfn);
2628         if (!iommu)
2629                 return NULL;
2630
2631         req_id = ((u16)bus << 8) | devfn;
2632
2633         if (dev_is_pci(dev)) {
2634                 struct pci_dev *pdev = to_pci_dev(dev);
2635
2636                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2637
2638                 /* register PCI DMA alias device */
2639                 if (req_id != dma_alias) {
2640                         tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2641                                         dma_alias & 0xff, NULL, domain);
2642
2643                         if (!tmp || tmp != domain)
2644                                 return tmp;
2645                 }
2646         }
2647
2648         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2649         if (!tmp || tmp != domain)
2650                 return tmp;
2651
2652         return domain;
2653 }
2654
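/*
 * Identity-map the physical range [@start, @end] into @domain, i.e.
 * install IOVA == physical-address mappings, reserving the IOVA range
 * first so the allocator never hands it out.
 */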
2655 static int iommu_domain_identity_map(struct dmar_domain *domain,
2656                                      unsigned long long start,
2657                                      unsigned long long end)
2658 {
2659         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2660         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2661
2662         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2663                           dma_to_mm_pfn(last_vpfn))) {
2664                 pr_err("Reserving iova failed\n");
2665                 return -ENOMEM;
2666         }
2667
2668         pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2669         /*
2670          * The RMRR range might overlap with a physical memory range,
2671          * so clear it first.
2672          */
2673         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2674
2675         return __domain_mapping(domain, first_vpfn, NULL,
2676                                 first_vpfn, last_vpfn - first_vpfn + 1,
2677                                 DMA_PTE_READ|DMA_PTE_WRITE);
2678 }
2679
2680 static int domain_prepare_identity_map(struct device *dev,
2681                                        struct dmar_domain *domain,
2682                                        unsigned long long start,
2683                                        unsigned long long end)
2684 {
2685         /* For _hardware_ passthrough, don't bother. But for software
2686            passthrough, we do it anyway -- it may indicate a memory
2687            range which is reserved in E820 and so didn't get set
2688            up to start with in si_domain */
2689         if (domain == si_domain && hw_pass_through) {
2690                 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2691                          start, end);
2692                 return 0;
2693         }
2694
2695         dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2696
2697         if (end < start) {
2698                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2699                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2700                         dmi_get_system_info(DMI_BIOS_VENDOR),
2701                         dmi_get_system_info(DMI_BIOS_VERSION),
2702                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2703                 return -EIO;
2704         }
2705
2706         if (end >> agaw_to_width(domain->agaw)) {
2707                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2708                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2709                      agaw_to_width(domain->agaw),
2710                      dmi_get_system_info(DMI_BIOS_VENDOR),
2711                      dmi_get_system_info(DMI_BIOS_VERSION),
2712                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2713                 return -EIO;
2714         }
2715
2716         return iommu_domain_identity_map(domain, start, end);
2717 }
2718
2719 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2720
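/*
 * Build the static identity (si) domain used for pass-through devices.
 * For software pass-through it is populated with 1:1 mappings of all
 * online memory plus the relaxable RMRR ranges; for hardware
 * pass-through no mappings are needed at all.
 */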
2721 static int __init si_domain_init(int hw)
2722 {
2723         struct dmar_rmrr_unit *rmrr;
2724         struct device *dev;
2725         int i, nid, ret;
2726
2727         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2728         if (!si_domain)
2729                 return -EFAULT;
2730
2731         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2732                 domain_exit(si_domain);
2733                 return -EFAULT;
2734         }
2735
2736         if (hw)
2737                 return 0;
2738
2739         for_each_online_node(nid) {
2740                 unsigned long start_pfn, end_pfn;
2741                 int i;
2742
2743                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2744                         ret = iommu_domain_identity_map(si_domain,
2745                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2746                         if (ret)
2747                                 return ret;
2748                 }
2749         }
2750
2751         /*
2752          * Normally we use DMA domains for devices which have RMRRs. But we
2753          * relax this requirement for graphics and USB devices. Identity map
2754          * the RMRRs for graphics and USB devices so that they can use the
2755          * si_domain.
2756          */
2757         for_each_rmrr_units(rmrr) {
2758                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2759                                           i, dev) {
2760                         unsigned long long start = rmrr->base_address;
2761                         unsigned long long end = rmrr->end_address;
2762
2763                         if (device_is_rmrr_locked(dev))
2764                                 continue;
2765
2766                         if (WARN_ON(end < start ||
2767                                     end >> agaw_to_width(si_domain->agaw)))
2768                                 continue;
2769
2770                         ret = iommu_domain_identity_map(si_domain, start, end);
2771                         if (ret)
2772                                 return ret;
2773                 }
2774         }
2775
2776         return 0;
2777 }
2778
2779 static int identity_mapping(struct device *dev)
2780 {
2781         struct device_domain_info *info;
2782
2783         info = dev->archdata.iommu;
2784         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2785                 return (info->domain == si_domain);
2786
2787         return 0;
2788 }
2789
2790 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2791 {
2792         struct dmar_domain *ndomain;
2793         struct intel_iommu *iommu;
2794         u8 bus, devfn;
2795
2796         iommu = device_to_iommu(dev, &bus, &devfn);
2797         if (!iommu)
2798                 return -ENODEV;
2799
2800         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2801         if (ndomain != domain)
2802                 return -EBUSY;
2803
2804         return 0;
2805 }
2806
2807 static bool device_has_rmrr(struct device *dev)
2808 {
2809         struct dmar_rmrr_unit *rmrr;
2810         struct device *tmp;
2811         int i;
2812
2813         rcu_read_lock();
2814         for_each_rmrr_units(rmrr) {
2815                 /*
2816                  * Return TRUE if this RMRR contains the device that
2817                  * is passed in.
2818                  */
2819                 for_each_active_dev_scope(rmrr->devices,
2820                                           rmrr->devices_cnt, i, tmp)
2821                         if (tmp == dev ||
2822                             is_downstream_to_pci_bridge(dev, tmp)) {
2823                                 rcu_read_unlock();
2824                                 return true;
2825                         }
2826         }
2827         rcu_read_unlock();
2828         return false;
2829 }
2830
2831 /**
2832  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2833  * is relaxable (i.e. allowed to not be enforced under some conditions)
2834  * @dev: device handle
2835  *
2836  * We assume that PCI USB devices with RMRRs have them largely
2837  * for historical reasons and that the RMRR space is not actively used post
2838  * boot.  This exclusion may change if vendors begin to abuse it.
2839  *
2840  * The same exception is made for graphics devices, with the requirement that
2841  * any use of the RMRR regions will be torn down before assigning the device
2842  * to a guest.
2843  *
2844  * Return: true if the RMRR is relaxable, false otherwise
2845  */
2846 static bool device_rmrr_is_relaxable(struct device *dev)
2847 {
2848         struct pci_dev *pdev;
2849
2850         if (!dev_is_pci(dev))
2851                 return false;
2852
2853         pdev = to_pci_dev(dev);
2854         if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2855                 return true;
2856         else
2857                 return false;
2858 }
2859
2860 /*
2861  * There are a couple of cases where we need to restrict the functionality of
2862  * devices associated with RMRRs.  The first is when evaluating a device for
2863  * identity mapping because problems exist when devices are moved in and out
2864  * of domains and their respective RMRR information is lost.  This means that
2865  * a device with associated RMRRs will never be in a "passthrough" domain.
2866  * The second is use of the device through the IOMMU API.  This interface
2867  * expects to have full control of the IOVA space for the device.  We cannot
2868  * satisfy both the requirement that RMRR access is maintained and have an
2869  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2870  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2871  * We therefore prevent devices associated with an RMRR from participating in
2872  * the IOMMU API, which eliminates them from device assignment.
2873  *
2874  * In both cases, devices which have relaxable RMRRs are not concerned by this
2875  * restriction. See device_rmrr_is_relaxable comment.
2876  */
2877 static bool device_is_rmrr_locked(struct device *dev)
2878 {
2879         if (!device_has_rmrr(dev))
2880                 return false;
2881
2882         if (device_rmrr_is_relaxable(dev))
2883                 return false;
2884
2885         return true;
2886 }
2887
2888 /*
2889  * Return the required default domain type for a specific device.
2890  *
2891  * @dev: the device in query
2893  *
2894  * Returns:
2895  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2896  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2897  *  - 0: both identity and dynamic domains work for this device
2898  */
2899 static int device_def_domain_type(struct device *dev)
2900 {
2901         if (dev_is_pci(dev)) {
2902                 struct pci_dev *pdev = to_pci_dev(dev);
2903
2904                 if (device_is_rmrr_locked(dev))
2905                         return IOMMU_DOMAIN_DMA;
2906
2907                 /*
2908                  * Prevent any device marked as untrusted from getting
2909                  * placed into the static identity mapping domain.
2910                  */
2911                 if (pdev->untrusted)
2912                         return IOMMU_DOMAIN_DMA;
2913
2914                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2915                         return IOMMU_DOMAIN_IDENTITY;
2916
2917                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2918                         return IOMMU_DOMAIN_IDENTITY;
2919
2920                 /*
2921                  * We want to start off with all devices in the 1:1 domain, and
2922                  * take them out later if we find they can't access all of memory.
2923                  *
2924                  * However, we can't do this for PCI devices behind bridges,
2925                  * because all PCI devices behind the same bridge will end up
2926                  * with the same source-id on their transactions.
2927                  *
2928                  * Practically speaking, we can't change things around for these
2929                  * devices at run-time, because we can't be sure there'll be no
2930                  * DMA transactions in flight for any of their siblings.
2931                  *
2932                  * So PCI devices (unless they're on the root bus) as well as
2933                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2934                  * the 1:1 domain, just in _case_ one of their siblings turns out
2935                  * not to be able to map all of memory.
2936                  */
2937                 if (!pci_is_pcie(pdev)) {
2938                         if (!pci_is_root_bus(pdev->bus))
2939                                 return IOMMU_DOMAIN_DMA;
2940                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2941                                 return IOMMU_DOMAIN_DMA;
2942                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2943                         return IOMMU_DOMAIN_DMA;
2944         } else {
2945                 if (device_has_rmrr(dev))
2946                         return IOMMU_DOMAIN_DMA;
2947         }
2948
2949         return (iommu_identity_mapping & IDENTMAP_ALL) ?
2950                         IOMMU_DOMAIN_IDENTITY : 0;
2951 }
2952
2953 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2954 {
2955         /*
2956          * Start from a sane iommu hardware state.
2957          * If queued invalidation was already initialized by us
2958          * (for example, while enabling interrupt remapping) then
2959          * things are already rolling from a sane state.
2960          */
2961         if (!iommu->qi) {
2962                 /*
2963                  * Clear any previous faults.
2964                  */
2965                 dmar_fault(-1, iommu);
2966                 /*
2967                  * Disable queued invalidation if supported and already enabled
2968                  * before OS handover.
2969                  */
2970                 dmar_disable_qi(iommu);
2971         }
2972
2973         if (dmar_enable_qi(iommu)) {
2974                 /*
2975                  * Queued invalidation is not enabled; use register-based invalidation
2976                  */
2977                 iommu->flush.flush_context = __iommu_flush_context;
2978                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2979                 pr_info("%s: Using Register based invalidation\n",
2980                         iommu->name);
2981         } else {
2982                 iommu->flush.flush_context = qi_flush_context;
2983                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2984                 pr_info("%s: Using Queued invalidation\n", iommu->name);
2985         }
2986 }
2987
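/*
 * Copy the context table for one bus from the old kernel's root entry into
 * a freshly allocated page, recording each copied entry's domain ID in
 * iommu->domain_ids and tagging the entry as copied. With extended root
 * entries the lower and upper halves of the devfn range come from the LCTP
 * and UCTP pointers and end up in two separate tables (tbl_idx, tbl_idx + 1).
 */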
2988 static int copy_context_table(struct intel_iommu *iommu,
2989                               struct root_entry *old_re,
2990                               struct context_entry **tbl,
2991                               int bus, bool ext)
2992 {
2993         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2994         struct context_entry *new_ce = NULL, ce;
2995         struct context_entry *old_ce = NULL;
2996         struct root_entry re;
2997         phys_addr_t old_ce_phys;
2998
2999         tbl_idx = ext ? bus * 2 : bus;
3000         memcpy(&re, old_re, sizeof(re));
3001
3002         for (devfn = 0; devfn < 256; devfn++) {
3003                 /* First calculate the correct index */
3004                 idx = (ext ? devfn * 2 : devfn) % 256;
3005
3006                 if (idx == 0) {
3007                         /* First save what we may have and clean up */
3008                         if (new_ce) {
3009                                 tbl[tbl_idx] = new_ce;
3010                                 __iommu_flush_cache(iommu, new_ce,
3011                                                     VTD_PAGE_SIZE);
3012                                 pos = 1;
3013                         }
3014
3015                         if (old_ce)
3016                                 memunmap(old_ce);
3017
3018                         ret = 0;
3019                         if (devfn < 0x80)
3020                                 old_ce_phys = root_entry_lctp(&re);
3021                         else
3022                                 old_ce_phys = root_entry_uctp(&re);
3023
3024                         if (!old_ce_phys) {
3025                                 if (ext && devfn == 0) {
3026                                         /* No LCTP, try UCTP */
3027                                         devfn = 0x7f;
3028                                         continue;
3029                                 } else {
3030                                         goto out;
3031                                 }
3032                         }
3033
3034                         ret = -ENOMEM;
3035                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
3036                                         MEMREMAP_WB);
3037                         if (!old_ce)
3038                                 goto out;
3039
3040                         new_ce = alloc_pgtable_page(iommu->node);
3041                         if (!new_ce)
3042                                 goto out_unmap;
3043
3044                         ret = 0;
3045                 }
3046
3047                 /* Now copy the context entry */
3048                 memcpy(&ce, old_ce + idx, sizeof(ce));
3049
3050                 if (!__context_present(&ce))
3051                         continue;
3052
3053                 did = context_domain_id(&ce);
3054                 if (did >= 0 && did < cap_ndoms(iommu->cap))
3055                         set_bit(did, iommu->domain_ids);
3056
3057                 /*
3058                  * We need a marker for copied context entries. This
3059                  * marker needs to work for the old format as well as
3060                  * for extended context entries.
3061                  *
3062                  * Bit 67 of the context entry is used. In the old
3063                  * format this bit is available to software, in the
3064                  * extended format it is the PGE bit, but PGE is ignored
3065                  * by HW if PASIDs are disabled (and thus still
3066                  * available).
3067                  *
3068                  * So disable PASIDs first and then mark the entry
3069                  * copied. This means that we don't copy PASID
3070                  * translations from the old kernel, but this is fine as
3071                  * faults there are not fatal.
3072                  */
3073                 context_clear_pasid_enable(&ce);
3074                 context_set_copied(&ce);
3075
3076                 new_ce[idx] = ce;
3077         }
3078
3079         tbl[tbl_idx + pos] = new_ce;
3080
3081         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3082
3083 out_unmap:
3084         memunmap(old_ce);
3085
3086 out:
3087         return ret;
3088 }
3089
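/*
 * Copy the translation tables left behind by the previous kernel: map the
 * old root table referenced by DMAR_RTADDR_REG, copy the per-bus context
 * tables, and hook the copies into the new root_entry table under
 * iommu->lock. Bails out with -EINVAL if the root table format (RTT bit)
 * would have to change, since that would require disabling translation.
 */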
3090 static int copy_translation_tables(struct intel_iommu *iommu)
3091 {
3092         struct context_entry **ctxt_tbls;
3093         struct root_entry *old_rt;
3094         phys_addr_t old_rt_phys;
3095         int ctxt_table_entries;
3096         unsigned long flags;
3097         u64 rtaddr_reg;
3098         int bus, ret;
3099         bool new_ext, ext;
3100
3101         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3102         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3103         new_ext    = !!ecap_ecs(iommu->ecap);
3104
3105         /*
3106          * The RTT bit can only be changed when translation is disabled,
3107          * but disabling translation would open a window for data
3108          * corruption. So bail out and don't copy anything if we would
3109          * have to change the bit.
3110          */
3111         if (new_ext != ext)
3112                 return -EINVAL;
3113
3114         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3115         if (!old_rt_phys)
3116                 return -EINVAL;
3117
3118         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3119         if (!old_rt)
3120                 return -ENOMEM;
3121
3122         /* This is too big for the stack - allocate it from slab */
3123         ctxt_table_entries = ext ? 512 : 256;
3124         ret = -ENOMEM;
3125         ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3126         if (!ctxt_tbls)
3127                 goto out_unmap;
3128
3129         for (bus = 0; bus < 256; bus++) {
3130                 ret = copy_context_table(iommu, &old_rt[bus],
3131                                          ctxt_tbls, bus, ext);
3132                 if (ret) {
3133                         pr_err("%s: Failed to copy context table for bus %d\n",
3134                                 iommu->name, bus);
3135                         continue;
3136                 }
3137         }
3138
3139         spin_lock_irqsave(&iommu->lock, flags);
3140
3141         /* Context tables are copied, now write them to the root_entry table */
3142         for (bus = 0; bus < 256; bus++) {
3143                 int idx = ext ? bus * 2 : bus;
3144                 u64 val;
3145
3146                 if (ctxt_tbls[idx]) {
3147                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3148                         iommu->root_entry[bus].lo = val;
3149                 }
3150
3151                 if (!ext || !ctxt_tbls[idx + 1])
3152                         continue;
3153
3154                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3155                 iommu->root_entry[bus].hi = val;
3156         }
3157
3158         spin_unlock_irqrestore(&iommu->lock, flags);
3159
3160         kfree(ctxt_tbls);
3161
3162         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3163
3164         ret = 0;
3165
3166 out_unmap:
3167         memunmap(old_rt);
3168
3169         return ret;
3170 }
3171
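/*
 * Boot-time DMAR initialization: size and allocate the global g_iommus
 * array, then for each unit set up queued invalidation, domain IDs and a
 * root entry (copying the old tables when translation was left enabled by
 * the previous kernel), program the root entries and flush the caches, and
 * finally enable fault reporting and, where supported, the SVM page
 * request queue.
 */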
3172 static int __init init_dmars(void)
3173 {
3174         struct dmar_drhd_unit *drhd;
3175         struct intel_iommu *iommu;
3176         int ret;
3177
3178         /*
3179          * for each drhd
3180          *    allocate root
3181          *    initialize and program root entry to not present
3182          * endfor
3183          */
3184         for_each_drhd_unit(drhd) {
3185                 /*
3186                  * Lock not needed as this is only incremented in the
3187                  * single-threaded kernel __init code path; all other
3188                  * accesses are read-only.
3189                  */
3190                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3191                         g_num_of_iommus++;
3192                         continue;
3193                 }
3194                 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3195         }
3196
3197         /* Preallocate enough resources for IOMMU hot-addition */
3198         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3199                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3200
3201         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3202                         GFP_KERNEL);
3203         if (!g_iommus) {
3204                 pr_err("Allocating global iommu array failed\n");
3205                 ret = -ENOMEM;
3206                 goto error;
3207         }
3208
3209         for_each_iommu(iommu, drhd) {
3210                 if (drhd->ignored) {
3211                         iommu_disable_translation(iommu);
3212                         continue;
3213                 }
3214
3215                 /*
3216                  * Find the max PASID size of all IOMMUs in the system.
3217                  * We need to ensure the system PASID table is no bigger
3218                  * than the smallest supported.
3219                  */
3220                 if (pasid_supported(iommu)) {
3221                         u32 temp = 2 << ecap_pss(iommu->ecap);
3222
3223                         intel_pasid_max_id = min_t(u32, temp,
3224                                                    intel_pasid_max_id);
3225                 }
3226
3227                 g_iommus[iommu->seq_id] = iommu;
3228
3229                 intel_iommu_init_qi(iommu);
3230
3231                 ret = iommu_init_domains(iommu);
3232                 if (ret)
3233                         goto free_iommu;
3234
3235                 init_translation_status(iommu);
3236
3237                 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3238                         iommu_disable_translation(iommu);
3239                         clear_translation_pre_enabled(iommu);
3240                         pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3241                                 iommu->name);
3242                 }
3243
3244                 /*
3245                  * TBD:
3246                  * We could share the same root & context tables
3247                  * among all IOMMUs. Need to split it later.
3248                  */
3249                 ret = iommu_alloc_root_entry(iommu);
3250                 if (ret)
3251                         goto free_iommu;
3252
3253                 if (translation_pre_enabled(iommu)) {
3254                         pr_info("Translation already enabled - trying to copy translation structures\n");
3255
3256                         ret = copy_translation_tables(iommu);
3257                         if (ret) {
3258                                 /*
3259                                  * We found the IOMMU with translation
3260                                  * enabled - but failed to copy over the
3261                                  * old root-entry table. Try to proceed
3262                                  * by disabling translation now and
3263                                  * allocating a clean root-entry table.
3264                                  * This might cause DMAR faults, but
3265                                  * probably the dump will still succeed.
3266                                  */
3267                                 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3268                                        iommu->name);
3269                                 iommu_disable_translation(iommu);
3270                                 clear_translation_pre_enabled(iommu);
3271                         } else {
3272                                 pr_info("Copied translation tables from previous kernel for %s\n",
3273                                         iommu->name);
3274                         }
3275                 }
3276
3277                 if (!ecap_pass_through(iommu->ecap))
3278                         hw_pass_through = 0;
3279 #ifdef CONFIG_INTEL_IOMMU_SVM
3280                 if (pasid_supported(iommu))
3281                         intel_svm_init(iommu);
3282 #endif
3283         }
3284
3285         /*
3286          * Now that QI is enabled on all IOMMUs, set the root entry and flush
3287          * caches. This is required on some Intel X58 chipsets; otherwise the
3288          * flush_context function will loop forever and the boot hangs.
3289          */
3290         for_each_active_iommu(iommu, drhd) {
3291                 iommu_flush_write_buffer(iommu);
3292                 iommu_set_root_entry(iommu);
3293                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3294                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3295         }
3296
3297         if (iommu_pass_through)
3298                 iommu_identity_mapping |= IDENTMAP_ALL;
3299
3300 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3301         dmar_map_gfx = 0;
3302 #endif
3303
3304         if (!dmar_map_gfx)
3305                 iommu_identity_mapping |= IDENTMAP_GFX;
3306
3307         check_tylersburg_isoch();
3308
3309         ret = si_domain_init(hw_pass_through);
3310         if (ret)
3311                 goto free_iommu;
3312
3313         /*
3314          * for each drhd
3315          *   enable fault log
3316          *   global invalidate context cache
3317          *   global invalidate iotlb
3318          *   enable translation
3319          */
3320         for_each_iommu(iommu, drhd) {
3321                 if (drhd->ignored) {
3322                         /*
3323                          * we always have to disable PMRs or DMA may fail on
3324                          * this device
3325                          */
3326                         if (force_on)
3327                                 iommu_disable_protect_mem_regions(iommu);
3328                         continue;
3329                 }
3330
3331                 iommu_flush_write_buffer(iommu);
3332
3333 #ifdef CONFIG_INTEL_IOMMU_SVM
3334                 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3335                         /*
3336                          * Calling dmar_alloc_hwirq() with dmar_global_lock
3337                          * held could cause a lock race condition.
3338                          */
3339                         up_write(&dmar_global_lock);
3340                         ret = intel_svm_enable_prq(iommu);
3341                         down_write(&dmar_global_lock);
3342                         if (ret)
3343                                 goto free_iommu;
3344                 }
3345 #endif
3346                 ret = dmar_set_interrupt(iommu);
3347                 if (ret)
3348                         goto free_iommu;
3349         }
3350
3351         return 0;
3352
3353 free_iommu:
3354         for_each_active_iommu(iommu, drhd) {
3355                 disable_dmar_iommu(iommu);
3356                 free_dmar_iommu(iommu);
3357         }
3358
3359         kfree(g_iommus);
3360
3361 error:
3362         return ret;
3363 }
3364
3365 /* This takes a number of _MM_ pages, not VTD pages */
3366 static unsigned long intel_alloc_iova(struct device *dev,
3367                                      struct dmar_domain *domain,
3368                                      unsigned long nrpages, uint64_t dma_mask)
3369 {
3370         unsigned long iova_pfn;
3371
3372         /* Restrict dma_mask to the width that the iommu can handle */
3373         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3374         /* Ensure we reserve the whole size-aligned region */
3375         nrpages = __roundup_pow_of_two(nrpages);
3376
3377         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3378                 /*
3379                  * First try to allocate an IO virtual address in
3380                  * DMA_BIT_MASK(32); if that fails, then try allocating
3381                  * from the higher range.
3382                  */
3383                 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3384                                            IOVA_PFN(DMA_BIT_MASK(32)), false);
3385                 if (iova_pfn)
3386                         return iova_pfn;
3387         }
3388         iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3389                                    IOVA_PFN(dma_mask), true);
3390         if (unlikely(!iova_pfn)) {
3391                 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
3392                 return 0;
3393         }
3394
3395         return iova_pfn;
3396 }
3397
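/*
 * Allocate a private DMA domain for a device that is not already attached
 * to one, pre-mapping any RMRR regions that reference the device. Returns
 * the resulting domain with its type set to IOMMU_DOMAIN_DMA, or NULL if
 * the device already has a domain or the allocation fails.
 */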
3398 static struct dmar_domain *get_private_domain_for_dev(struct device *dev)
3399 {
3400         struct dmar_domain *domain, *tmp;
3401         struct dmar_rmrr_unit *rmrr;
3402         struct device *i_dev;
3403         int i, ret;
3404
3405         /* The device shouldn't be attached to any domain yet. */
3406         domain = find_domain(dev);
3407         if (domain)
3408                 return NULL;
3409
3410         domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3411         if (!domain)
3412                 goto out;
3413
3414         /* We have a new domain - set up possible RMRRs for the device */
3415         rcu_read_lock();
3416         for_each_rmrr_units(rmrr) {
3417                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3418                                           i, i_dev) {
3419                         if (i_dev != dev)
3420                                 continue;
3421
3422                         ret = domain_prepare_identity_map(dev, domain,
3423                                                           rmrr->base_address,
3424                                                           rmrr->end_address);
3425                         if (ret)
3426                                 dev_err(dev, "Mapping reserved region failed\n");
3427                 }
3428         }
3429         rcu_read_unlock();
3430
3431         tmp = set_domain_for_dev(dev, domain);
3432         if (!tmp || domain != tmp) {
3433                 domain_exit(domain);
3434                 domain = tmp;
3435         }
3436
3437 out:
3438         if (!domain)
3439                 dev_err(dev, "Allocating domain failed\n");
3440         else
3441                 domain->domain.type = IOMMU_DOMAIN_DMA;
3442
3443         return domain;
3444 }
3445
3446 /* Check if the device needs to go through the non-identity map/unmap process. */
3447 static bool iommu_need_mapping(struct device *dev)
3448 {
3449         int ret;
3450
3451         if (iommu_dummy(dev))
3452                 return false;
3453
3454         ret = identity_mapping(dev);
3455         if (ret) {
3456                 u64 dma_mask = *dev->dma_mask;
3457
3458                 if (dev->coherent_dma_mask && dev->coherent_dma_mask < dma_mask)
3459                         dma_mask = dev->coherent_dma_mask;
3460
3461                 if (dma_mask >= dma_get_required_mask(dev))
3462                         return false;
3463
3464                 /*
3465                  * The 32-bit DMA device is removed from si_domain and
3466                  * falls back to non-identity mapping.
3467                  */
3468                 dmar_remove_one_dev_info(dev);
3469                 ret = iommu_request_dma_domain_for_dev(dev);
3470                 if (ret) {
3471                         struct iommu_domain *domain;
3472                         struct dmar_domain *dmar_domain;
3473
3474                         domain = iommu_get_domain_for_dev(dev);
3475                         if (domain) {
3476                                 dmar_domain = to_dmar_domain(domain);
3477                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
3478                         }
3479                         get_private_domain_for_dev(dev);
3480                 }
3481
3482                 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3483         }
3484
3485         return true;
3486 }
3487
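/*
 * Map a physically contiguous buffer for DMA: allocate an IOVA range that
 * fits under the device's DMA mask, install PTEs for the page-aligned
 * region with the protection derived from the DMA direction, and return
 * the bus address (including the offset into the first page), or
 * DMA_MAPPING_ERROR on failure.
 */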
3488 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3489                                      size_t size, int dir, u64 dma_mask)
3490 {
3491         struct dmar_domain *domain;
3492         phys_addr_t start_paddr;
3493         unsigned long iova_pfn;
3494         int prot = 0;
3495         int ret;
3496         struct intel_iommu *iommu;
3497         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3498
3499         BUG_ON(dir == DMA_NONE);
3500
3501         domain = find_domain(dev);
3502         if (!domain)
3503                 return DMA_MAPPING_ERROR;
3504
3505         iommu = domain_get_iommu(domain);
3506         size = aligned_nrpages(paddr, size);
3507
3508         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3509         if (!iova_pfn)
3510                 goto error;
3511
3512         /*
3513          * Check if DMAR supports zero-length reads on write-only
3514          * mappings.
3515          */
3516         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3517                         !cap_zlr(iommu->cap))
3518                 prot |= DMA_PTE_READ;
3519         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3520                 prot |= DMA_PTE_WRITE;
3521         /*
3522          * paddr to (paddr + size) might span a partial page; we should map the
3523          * whole page.  Note: if two parts of one page are mapped separately, we
3524          * might have two guest addresses mapping to the same host paddr, but this
3525          * is not a big problem.
3526          */
3527         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3528                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3529         if (ret)
3530                 goto error;
3531
3532         start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3533         start_paddr += paddr & ~PAGE_MASK;
3534         return start_paddr;
3535
3536 error:
3537         if (iova_pfn)
3538                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3539         dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3540                 size, (unsigned long long)paddr, dir);
3541         return DMA_MAPPING_ERROR;
3542 }
3543
3544 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3545                                  unsigned long offset, size_t size,
3546                                  enum dma_data_direction dir,
3547                                  unsigned long attrs)
3548 {
3549         if (iommu_need_mapping(dev))
3550                 return __intel_map_single(dev, page_to_phys(page) + offset,
3551                                 size, dir, *dev->dma_mask);
3552         return dma_direct_map_page(dev, page, offset, size, dir, attrs);
3553 }
3554
3555 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3556                                      size_t size, enum dma_data_direction dir,
3557                                      unsigned long attrs)
3558 {
3559         if (iommu_need_mapping(dev))
3560                 return __intel_map_single(dev, phys_addr, size, dir,
3561                                 *dev->dma_mask);
3562         return dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
3563 }
3564
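/*
 * Tear down a DMA mapping: clear the page-table entries, then either flush
 * the IOTLB and free the IOVA immediately (strict mode or untrusted PCI
 * devices) or defer both by queueing them on the IOVA flush queue.
 */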
3565 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3566 {
3567         struct dmar_domain *domain;
3568         unsigned long start_pfn, last_pfn;
3569         unsigned long nrpages;
3570         unsigned long iova_pfn;
3571         struct intel_iommu *iommu;
3572         struct page *freelist;
3573         struct pci_dev *pdev = NULL;
3574
3575         domain = find_domain(dev);
3576         BUG_ON(!domain);
3577
3578         iommu = domain_get_iommu(domain);
3579
3580         iova_pfn = IOVA_PFN(dev_addr);
3581
3582         nrpages = aligned_nrpages(dev_addr, size);
3583         start_pfn = mm_to_dma_pfn(iova_pfn);
3584         last_pfn = start_pfn + nrpages - 1;
3585
3586         if (dev_is_pci(dev))
3587                 pdev = to_pci_dev(dev);
3588
3589         dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3590
3591         freelist = domain_unmap(domain, start_pfn, last_pfn);
3592
3593         if (intel_iommu_strict || (pdev && pdev->untrusted)) {
3594                 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3595                                       nrpages, !freelist, 0);
3596                 /* free iova */
3597                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3598                 dma_free_pagelist(freelist);
3599         } else {
3600                 queue_iova(&domain->iovad, iova_pfn, nrpages,
3601                            (unsigned long)freelist);
3602                 /*
3603                  * Queue up the release of the unmap to save the ~1/6th of
3604                  * the CPU time used up by the IOTLB flush operation...
3605                  */
3606         }
3607 }
3608
3609 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3610                              size_t size, enum dma_data_direction dir,
3611                              unsigned long attrs)
3612 {
3613         if (iommu_need_mapping(dev))
3614                 intel_unmap(dev, dev_addr, size);
3615         else
3616                 dma_direct_unmap_page(dev, dev_addr, size, dir, attrs);
3617 }
3618
3619 static void intel_unmap_resource(struct device *dev, dma_addr_t dev_addr,
3620                 size_t size, enum dma_data_direction dir, unsigned long attrs)
3621 {
3622         if (iommu_need_mapping(dev))
3623                 intel_unmap(dev, dev_addr, size);
3624 }
3625
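/*
 * Coherent allocation: identity-mapped devices go straight to
 * dma_direct_alloc(); everything else gets (preferably contiguous) zeroed
 * pages that are mapped bidirectionally via __intel_map_single() against
 * the device's coherent DMA mask.
 */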
3626 static void *intel_alloc_coherent(struct device *dev, size_t size,
3627                                   dma_addr_t *dma_handle, gfp_t flags,
3628                                   unsigned long attrs)
3629 {
3630         struct page *page = NULL;
3631         int order;
3632
3633         if (!iommu_need_mapping(dev))
3634                 return dma_direct_alloc(dev, size, dma_handle, flags, attrs);
3635
3636         size = PAGE_ALIGN(size);
3637         order = get_order(size);
3638
3639         if (gfpflags_allow_blocking(flags)) {
3640                 unsigned int count = size >> PAGE_SHIFT;
3641
3642                 page = dma_alloc_from_contiguous(dev, count, order,
3643                                                  flags & __GFP_NOWARN);
3644         }
3645
3646         if (!page)
3647                 page = alloc_pages(flags, order);
3648         if (!page)
3649                 return NULL;
3650         memset(page_address(page), 0, size);
3651
3652         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3653                                          DMA_BIDIRECTIONAL,
3654                                          dev->coherent_dma_mask);
3655         if (*dma_handle != DMA_MAPPING_ERROR)
3656                 return page_address(page);
3657         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3658                 __free_pages(page, order);
3659
3660         return NULL;
3661 }
3662
3663 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3664                                 dma_addr_t dma_handle, unsigned long attrs)
3665 {
3666         int order;
3667         struct page *page = virt_to_page(vaddr);
3668
3669         if (!iommu_need_mapping(dev))
3670                 return dma_direct_free(dev, size, vaddr, dma_handle, attrs);
3671
3672         size = PAGE_ALIGN(size);
3673         order = get_order(size);
3674
3675         intel_unmap(dev, dma_handle, size);
3676         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3677                 __free_pages(page, order);
3678 }
3679
3680 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3681                            int nelems, enum dma_data_direction dir,
3682                            unsigned long attrs)
3683 {
3684         dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3685         unsigned long nrpages = 0;
3686         struct scatterlist *sg;
3687         int i;
3688
3689         if (!iommu_need_mapping(dev))
3690                 return dma_direct_unmap_sg(dev, sglist, nelems, dir, attrs);
3691
3692         for_each_sg(sglist, sg, nelems, i) {
3693                 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3694         }
3695
3696         intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3697 }
3698
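/*
 * Map a scatterlist using a single IOVA allocation sized for the sum of
 * all page-aligned segments and filled in by domain_sg_mapping(). On
 * failure the partially built page tables and the IOVA range are released
 * and 0 is returned.
 */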
3699 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3700                         enum dma_data_direction dir, unsigned long attrs)
3701 {
3702         int i;
3703         struct dmar_domain *domain;
3704         size_t size = 0;
3705         int prot = 0;
3706         unsigned long iova_pfn;
3707         int ret;
3708         struct scatterlist *sg;
3709         unsigned long start_vpfn;
3710         struct intel_iommu *iommu;
3711
3712         BUG_ON(dir == DMA_NONE);
3713         if (!iommu_need_mapping(dev))
3714                 return dma_direct_map_sg(dev, sglist, nelems, dir, attrs);
3715
3716         domain = find_domain(dev);
3717         if (!domain)
3718                 return 0;
3719
3720         iommu = domain_get_iommu(domain);
3721
3722         for_each_sg(sglist, sg, nelems, i)
3723                 size += aligned_nrpages(sg->offset, sg->length);
3724
3725         iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3726                                 *dev->dma_mask);
3727         if (!iova_pfn) {
3728                 sglist->dma_length = 0;
3729                 return 0;
3730         }
3731
3732         /*
3733          * Check if DMAR supports zero-length reads on write-only
3734          * mappings.
3735          */
3736         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3737                         !cap_zlr(iommu->cap))
3738                 prot |= DMA_PTE_READ;
3739         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3740                 prot |= DMA_PTE_WRITE;
3741
3742         start_vpfn = mm_to_dma_pfn(iova_pfn);
3743
3744         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3745         if (unlikely(ret)) {
3746                 dma_pte_free_pagetable(domain, start_vpfn,
3747                                        start_vpfn + size - 1,
3748                                        agaw_to_level(domain->agaw) + 1);
3749                 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3750                 return 0;
3751         }
3752
3753         return nelems;
3754 }
3755
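/*
 * DMA API operations used for devices handled by the Intel IOMMU. The
 * callbacks check iommu_need_mapping() and leave identity-mapped devices
 * to the dma-direct code.
 */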
3756 static const struct dma_map_ops intel_dma_ops = {
3757         .alloc = intel_alloc_coherent,
3758         .free = intel_free_coherent,
3759         .map_sg = intel_map_sg,
3760         .unmap_sg = intel_unmap_sg,
3761         .map_page = intel_map_page,
3762         .unmap_page = intel_unmap_page,
3763         .map_resource = intel_map_resource,
3764         .unmap_resource = intel_unmap_resource,
3765         .dma_supported = dma_direct_supported,
3766 };
3767
3768 static inline int iommu_domain_cache_init(void)
3769 {
3770         int ret = 0;
3771
3772         iommu_domain_cache = kmem_cache_create("iommu_domain",
3773                                          sizeof(struct dmar_domain),
3774                                          0,
3775                                          SLAB_HWCACHE_ALIGN,
3776
3777                                          NULL);
3778         if (!iommu_domain_cache) {
3779                 pr_err("Couldn't create iommu_domain cache\n");
3780                 ret = -ENOMEM;
3781         }
3782
3783         return ret;
3784 }
3785
3786 static inline int iommu_devinfo_cache_init(void)
3787 {
3788         int ret = 0;
3789
3790         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3791                                          sizeof(struct device_domain_info),
3792                                          0,
3793                                          SLAB_HWCACHE_ALIGN,
3794                                          NULL);
3795         if (!iommu_devinfo_cache) {
3796                 pr_err("Couldn't create devinfo cache\n");
3797                 ret = -ENOMEM;
3798         }
3799
3800         return ret;
3801 }
3802
3803 static int __init iommu_init_mempool(void)
3804 {
3805         int ret;
3806         ret = iova_cache_get();
3807         if (ret)
3808                 return ret;
3809
3810         ret = iommu_domain_cache_init();
3811         if (ret)
3812                 goto domain_error;
3813
3814         ret = iommu_devinfo_cache_init();
3815         if (!ret)
3816                 return ret;
3817
3818         kmem_cache_destroy(iommu_domain_cache);
3819 domain_error:
3820         iova_cache_put();
3821
3822         return -ENOMEM;
3823 }
3824
3825 static void __init iommu_exit_mempool(void)
3826 {
3827         kmem_cache_destroy(iommu_devinfo_cache);
3828         kmem_cache_destroy(iommu_domain_cache);
3829         iova_cache_put();
3830 }
3831
3832 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3833 {
3834         struct dmar_drhd_unit *drhd;
3835         u32 vtbar;
3836         int rc;
3837
3838         /* We know that this device on this chipset has its own IOMMU.
3839          * If we find it under a different IOMMU, then the BIOS is lying
3840          * to us. Hope that the IOMMU for this device is actually
3841          * disabled, and it needs no translation...
3842          */
3843         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3844         if (rc) {
3845                 /* "can't" happen */
3846                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3847                 return;
3848         }
3849         vtbar &= 0xffff0000;
3850
3851         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3852         drhd = dmar_find_matched_drhd_unit(pdev);
3853         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3854                             TAINT_FIRMWARE_WORKAROUND,
3855                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3856                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3857 }
3858 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3859
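/*
 * Mark DRHD units that can be skipped: units whose device scope turns out
 * to be empty are ignored outright, and units covering only graphics
 * devices are ignored (with their devices flagged as dummy) when
 * dmar_map_gfx is disabled.
 */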
3860 static void __init init_no_remapping_devices(void)
3861 {
3862         struct dmar_drhd_unit *drhd;
3863         struct device *dev;
3864         int i;
3865
3866         for_each_drhd_unit(drhd) {
3867                 if (!drhd->include_all) {
3868                         for_each_active_dev_scope(drhd->devices,
3869                                                   drhd->devices_cnt, i, dev)
3870                                 break;
3871                         /* ignore DMAR unit if no devices exist */
3872                         if (i == drhd->devices_cnt)
3873                                 drhd->ignored = 1;
3874                 }
3875         }
3876
3877         for_each_active_drhd_unit(drhd) {
3878                 if (drhd->include_all)
3879                         continue;
3880
3881                 for_each_active_dev_scope(drhd->devices,
3882                                           drhd->devices_cnt, i, dev)
3883                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3884                                 break;
3885                 if (i < drhd->devices_cnt)
3886                         continue;
3887
3888                 /* This IOMMU has *only* gfx devices. Either bypass it or
3889                    set the gfx_mapped flag, as appropriate */
3890                 if (!dmar_map_gfx) {
3891                         drhd->ignored = 1;
3892                         for_each_active_dev_scope(drhd->devices,
3893                                                   drhd->devices_cnt, i, dev)
3894                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3895                 }
3896         }
3897 }
3898
3899 #ifdef CONFIG_SUSPEND
3900 static int init_iommu_hw(void)
3901 {
3902         struct dmar_drhd_unit *drhd;
3903         struct intel_iommu *iommu = NULL;
3904
3905         for_each_active_iommu(iommu, drhd)
3906                 if (iommu->qi)
3907                         dmar_reenable_qi(iommu);
3908
3909         for_each_iommu(iommu, drhd) {
3910                 if (drhd->ignored) {
3911                         /*
3912                          * we always have to disable PMRs or DMA may fail on
3913                          * this device
3914                          */
3915                         if (force_on)
3916                                 iommu_disable_protect_mem_regions(iommu);
3917                         continue;
3918                 }
3919
3920                 iommu_flush_write_buffer(iommu);
3921
3922                 iommu_set_root_entry(iommu);
3923
3924                 iommu->flush.flush_context(iommu, 0, 0, 0,
3925                                            DMA_CCMD_GLOBAL_INVL);
3926                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3927                 iommu_enable_translation(iommu);
3928                 iommu_disable_protect_mem_regions(iommu);
3929         }
3930
3931         return 0;
3932 }
3933
3934 static void iommu_flush_all(void)
3935 {
3936         struct dmar_drhd_unit *drhd;
3937         struct intel_iommu *iommu;
3938
3939         for_each_active_iommu(iommu, drhd) {
3940                 iommu->flush.flush_context(iommu, 0, 0, 0,
3941                                            DMA_CCMD_GLOBAL_INVL);
3942                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3943                                          DMA_TLB_GLOBAL_FLUSH);
3944         }
3945 }
3946
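/*
 * System suspend: flush all caches, disable translation and save the
 * fault-event control/data/address registers so that iommu_resume() can
 * restore them after init_iommu_hw() has re-enabled each unit.
 */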
3947 static int iommu_suspend(void)
3948 {
3949         struct dmar_drhd_unit *drhd;
3950         struct intel_iommu *iommu = NULL;
3951         unsigned long flag;
3952
3953         for_each_active_iommu(iommu, drhd) {
3954                 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3955                                                  GFP_ATOMIC);
3956                 if (!iommu->iommu_state)
3957                         goto nomem;
3958         }
3959
3960         iommu_flush_all();
3961
3962         for_each_active_iommu(iommu, drhd) {
3963                 iommu_disable_translation(iommu);
3964
3965                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3966
3967                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3968                         readl(iommu->reg + DMAR_FECTL_REG);
3969                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3970                         readl(iommu->reg + DMAR_FEDATA_REG);
3971                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3972                         readl(iommu->reg + DMAR_FEADDR_REG);
3973                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3974                         readl(iommu->reg + DMAR_FEUADDR_REG);
3975
3976                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3977         }
3978         return 0;
3979
3980 nomem:
3981         for_each_active_iommu(iommu, drhd)
3982                 kfree(iommu->iommu_state);
3983
3984         return -ENOMEM;
3985 }
3986
3987 static void iommu_resume(void)
3988 {
3989         struct dmar_drhd_unit *drhd;
3990         struct intel_iommu *iommu = NULL;
3991         unsigned long flag;
3992
3993         if (init_iommu_hw()) {
3994                 if (force_on)
3995                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3996                 else
3997                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3998                 return;
3999         }
4000
4001         for_each_active_iommu(iommu, drhd) {
4002
4003                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4004
4005                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4006                         iommu->reg + DMAR_FECTL_REG);
4007                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4008                         iommu->reg + DMAR_FEDATA_REG);
4009                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4010                         iommu->reg + DMAR_FEADDR_REG);
4011                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4012                         iommu->reg + DMAR_FEUADDR_REG);
4013
4014                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4015         }
4016
4017         for_each_active_iommu(iommu, drhd)
4018                 kfree(iommu->iommu_state);
4019 }
4020
4021 static struct syscore_ops iommu_syscore_ops = {
4022         .resume         = iommu_resume,
4023         .suspend        = iommu_suspend,
4024 };
4025
4026 static void __init init_iommu_pm_ops(void)
4027 {
4028         register_syscore_ops(&iommu_syscore_ops);
4029 }
4030
4031 #else
4032 static inline void init_iommu_pm_ops(void) {}
4033 #endif  /* CONFIG_SUSPEND */
4034
4035 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4036 {
4037         struct acpi_dmar_reserved_memory *rmrr;
4038         struct dmar_rmrr_unit *rmrru;
4039
4040         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4041         if (!rmrru)
4042                 goto out;
4043
4044         rmrru->hdr = header;
4045         rmrr = (struct acpi_dmar_reserved_memory *)header;
4046         rmrru->base_address = rmrr->base_address;
4047         rmrru->end_address = rmrr->end_address;
4048
4049         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4050                                 ((void *)rmrr) + rmrr->header.length,
4051                                 &rmrru->devices_cnt);
4052         if (rmrru->devices_cnt && rmrru->devices == NULL)
4053                 goto free_rmrru;
4054
4055         list_add(&rmrru->list, &dmar_rmrr_units);
4056
4057         return 0;
4058 free_rmrru:
4059         kfree(rmrru);
4060 out:
4061         return -ENOMEM;
4062 }
4063
4064 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4065 {
4066         struct dmar_atsr_unit *atsru;
4067         struct acpi_dmar_atsr *tmp;
4068
4069         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4070                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4071                 if (atsr->segment != tmp->segment)
4072                         continue;
4073                 if (atsr->header.length != tmp->header.length)
4074                         continue;
4075                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4076                         return atsru;
4077         }
4078
4079         return NULL;
4080 }
4081
4082 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4083 {
4084         struct acpi_dmar_atsr *atsr;
4085         struct dmar_atsr_unit *atsru;
4086
4087         if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4088                 return 0;
4089
4090         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4091         atsru = dmar_find_atsr(atsr);
4092         if (atsru)
4093                 return 0;
4094
4095         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4096         if (!atsru)
4097                 return -ENOMEM;
4098
4099         /*
4100          * If memory is allocated from slab by the ACPI _DSM method, we need to
4101          * copy the memory content because the memory buffer will be freed
4102          * on return.
4103          */
4104         atsru->hdr = (void *)(atsru + 1);
4105         memcpy(atsru->hdr, hdr, hdr->length);
4106         atsru->include_all = atsr->flags & 0x1;
4107         if (!atsru->include_all) {
4108                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4109                                 (void *)atsr + atsr->header.length,
4110                                 &atsru->devices_cnt);
4111                 if (atsru->devices_cnt && atsru->devices == NULL) {
4112                         kfree(atsru);
4113                         return -ENOMEM;
4114                 }
4115         }
4116
4117         list_add_rcu(&atsru->list, &dmar_atsr_units);
4118
4119         return 0;
4120 }
4121
4122 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4123 {
4124         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4125         kfree(atsru);
4126 }
4127
4128 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4129 {
4130         struct acpi_dmar_atsr *atsr;
4131         struct dmar_atsr_unit *atsru;
4132
4133         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4134         atsru = dmar_find_atsr(atsr);
4135         if (atsru) {
4136                 list_del_rcu(&atsru->list);
4137                 synchronize_rcu();
4138                 intel_iommu_free_atsr(atsru);
4139         }
4140
4141         return 0;
4142 }
4143
4144 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4145 {
4146         int i;
4147         struct device *dev;
4148         struct acpi_dmar_atsr *atsr;
4149         struct dmar_atsr_unit *atsru;
4150
4151         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4152         atsru = dmar_find_atsr(atsr);
4153         if (!atsru)
4154                 return 0;
4155
4156         if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4157                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4158                                           i, dev)
4159                         return -EBUSY;
4160         }
4161
4162         return 0;
4163 }
4164
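/*
 * Bring up a hot-added DMAR unit: check its capabilities against the
 * running configuration (pass-through, snooping, superpage sizes),
 * allocate domain IDs and a root entry, and, unless the unit is ignored,
 * enable queued invalidation, fault interrupts and finally translation.
 */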
4165 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4166 {
4167         int sp, ret;
4168         struct intel_iommu *iommu = dmaru->iommu;
4169
4170         if (g_iommus[iommu->seq_id])
4171                 return 0;
4172
4173         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4174                 pr_warn("%s: Doesn't support hardware pass through.\n",
4175                         iommu->name);
4176                 return -ENXIO;
4177         }
4178         if (!ecap_sc_support(iommu->ecap) &&
4179             domain_update_iommu_snooping(iommu)) {
4180                 pr_warn("%s: Doesn't support snooping.\n",
4181                         iommu->name);
4182                 return -ENXIO;
4183         }
4184         sp = domain_update_iommu_superpage(iommu) - 1;
4185         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4186                 pr_warn("%s: Doesn't support large page.\n",
4187                         iommu->name);
4188                 return -ENXIO;
4189         }
4190
4191         /*
4192          * Disable translation if already enabled prior to OS handover.
4193          */
4194         if (iommu->gcmd & DMA_GCMD_TE)
4195                 iommu_disable_translation(iommu);
4196
4197         g_iommus[iommu->seq_id] = iommu;
4198         ret = iommu_init_domains(iommu);
4199         if (ret == 0)
4200                 ret = iommu_alloc_root_entry(iommu);
4201         if (ret)
4202                 goto out;
4203
4204 #ifdef CONFIG_INTEL_IOMMU_SVM
4205         if (pasid_supported(iommu))
4206                 intel_svm_init(iommu);
4207 #endif
4208
4209         if (dmaru->ignored) {
4210                 /*
4211                  * we always have to disable PMRs or DMA may fail on this device
4212                  */
4213                 if (force_on)
4214                         iommu_disable_protect_mem_regions(iommu);
4215                 return 0;
4216         }
4217
4218         intel_iommu_init_qi(iommu);
4219         iommu_flush_write_buffer(iommu);
4220
4221 #ifdef CONFIG_INTEL_IOMMU_SVM
4222         if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4223                 ret = intel_svm_enable_prq(iommu);
4224                 if (ret)
4225                         goto disable_iommu;
4226         }
4227 #endif
4228         ret = dmar_set_interrupt(iommu);
4229         if (ret)
4230                 goto disable_iommu;
4231
4232         iommu_set_root_entry(iommu);
4233         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4234         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4235         iommu_enable_translation(iommu);
4236
4237         iommu_disable_protect_mem_regions(iommu);
4238         return 0;
4239
4240 disable_iommu:
4241         disable_dmar_iommu(iommu);
4242 out:
4243         free_dmar_iommu(iommu);
4244         return ret;
4245 }
4246
4247 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4248 {
4249         int ret = 0;
4250         struct intel_iommu *iommu = dmaru->iommu;
4251
4252         if (!intel_iommu_enabled)
4253                 return 0;
4254         if (iommu == NULL)
4255                 return -EINVAL;
4256
4257         if (insert) {
4258                 ret = intel_iommu_add(dmaru);
4259         } else {
4260                 disable_dmar_iommu(iommu);
4261                 free_dmar_iommu(iommu);
4262         }
4263
4264         return ret;
4265 }
4266
4267 static void intel_iommu_free_dmars(void)
4268 {
4269         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4270         struct dmar_atsr_unit *atsru, *atsr_n;
4271
4272         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4273                 list_del(&rmrru->list);
4274                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4275                 kfree(rmrru);
4276         }
4277
4278         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4279                 list_del(&atsru->list);
4280                 intel_iommu_free_atsr(atsru);
4281         }
4282 }
4283
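/*
 * Decide whether ATS may be used for @dev: integrated devices (no upstream
 * bridge) are always allowed, paths through conventional PCI never are,
 * and for PCIe devices the root port is looked up in the ATSR device
 * scopes. Returns 1 if ATS is allowed, 0 otherwise.
 */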
4284 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4285 {
4286         int i, ret = 1;
4287         struct pci_bus *bus;
4288         struct pci_dev *bridge = NULL;
4289         struct device *tmp;
4290         struct acpi_dmar_atsr *atsr;
4291         struct dmar_atsr_unit *atsru;
4292
4293         dev = pci_physfn(dev);
4294         for (bus = dev->bus; bus; bus = bus->parent) {
4295                 bridge = bus->self;
4296                 /* If it's an integrated device, allow ATS */
4297                 if (!bridge)
4298                         return 1;
4299                 /* Connected via non-PCIe: no ATS */
4300                 if (!pci_is_pcie(bridge) ||
4301                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4302                         return 0;
4303                 /* If we found the root port, look it up in the ATSR */
4304                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4305                         break;
4306         }
4307
4308         rcu_read_lock();
4309         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4310                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4311                 if (atsr->segment != pci_domain_nr(dev->bus))
4312                         continue;
4313
4314                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4315                         if (tmp == &bridge->dev)
4316                                 goto out;
4317
4318                 if (atsru->include_all)
4319                         goto out;
4320         }
4321         ret = 0;
4322 out:
4323         rcu_read_unlock();
4324
4325         return ret;
4326 }
4327
4328 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4329 {
4330         int ret;
4331         struct dmar_rmrr_unit *rmrru;
4332         struct dmar_atsr_unit *atsru;
4333         struct acpi_dmar_atsr *atsr;
4334         struct acpi_dmar_reserved_memory *rmrr;
4335
4336         if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4337                 return 0;
4338
4339         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4340                 rmrr = container_of(rmrru->hdr,
4341                                     struct acpi_dmar_reserved_memory, header);
4342                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4343                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4344                                 ((void *)rmrr) + rmrr->header.length,
4345                                 rmrr->segment, rmrru->devices,
4346                                 rmrru->devices_cnt);
4347                         if (ret < 0)
4348                                 return ret;
4349                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4350                         dmar_remove_dev_scope(info, rmrr->segment,
4351                                 rmrru->devices, rmrru->devices_cnt);
4352                 }
4353         }
4354
4355         list_for_each_entry(atsru, &dmar_atsr_units, list) {
4356                 if (atsru->include_all)
4357                         continue;
4358
4359                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4360                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4361                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4362                                         (void *)atsr + atsr->header.length,
4363                                         atsr->segment, atsru->devices,
4364                                         atsru->devices_cnt);
4365                         if (ret > 0)
4366                                 break;
4367                         else if (ret < 0)
4368                                 return ret;
4369                 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4370                         if (dmar_remove_dev_scope(info, atsr->segment,
4371                                         atsru->devices, atsru->devices_cnt))
4372                                 break;
4373                 }
4374         }
4375
4376         return 0;
4377 }
4378
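/*
 * Memory hotplug notifier for the static identity (si_domain) mapping:
 * extend the 1:1 map when a range goes online, and unmap and flush it
 * again when the range goes offline or the online operation is cancelled.
 */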
4379 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4380                                        unsigned long val, void *v)
4381 {
4382         struct memory_notify *mhp = v;
4383         unsigned long long start, end;
4384         unsigned long start_vpfn, last_vpfn;
4385
4386         switch (val) {
4387         case MEM_GOING_ONLINE:
4388                 start = mhp->start_pfn << PAGE_SHIFT;
4389                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4390                 if (iommu_domain_identity_map(si_domain, start, end)) {
4391                         pr_warn("Failed to build identity map for [%llx-%llx]\n",
4392                                 start, end);
4393                         return NOTIFY_BAD;
4394                 }
4395                 break;
4396
4397         case MEM_OFFLINE:
4398         case MEM_CANCEL_ONLINE:
4399                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4400                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4401                 while (start_vpfn <= last_vpfn) {
4402                         struct iova *iova;
4403                         struct dmar_drhd_unit *drhd;
4404                         struct intel_iommu *iommu;
4405                         struct page *freelist;
4406
4407                         iova = find_iova(&si_domain->iovad, start_vpfn);
4408                         if (iova == NULL) {
4409                                 pr_debug("Failed get IOVA for PFN %lx\n",
4410                                          start_vpfn);
4411                                 break;
4412                         }
4413
4414                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4415                                                      start_vpfn, last_vpfn);
4416                         if (iova == NULL) {
4417                                 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4418                                         start_vpfn, last_vpfn);
4419                                 return NOTIFY_BAD;
4420                         }
4421
4422                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4423                                                iova->pfn_hi);
4424
4425                         rcu_read_lock();
4426                         for_each_active_iommu(iommu, drhd)
4427                                 iommu_flush_iotlb_psi(iommu, si_domain,
4428                                         iova->pfn_lo, iova_size(iova),
4429                                         !freelist, 0);
4430                         rcu_read_unlock();
4431                         dma_free_pagelist(freelist);
4432
4433                         start_vpfn = iova->pfn_hi + 1;
4434                         free_iova_mem(iova);
4435                 }
4436                 break;
4437         }
4438
4439         return NOTIFY_OK;
4440 }
4441
4442 static struct notifier_block intel_iommu_memory_nb = {
4443         .notifier_call = intel_iommu_memory_notifier,
4444         .priority = 0
4445 };
4446
4447 static void free_all_cpu_cached_iovas(unsigned int cpu)
4448 {
4449         int i;
4450
4451         for (i = 0; i < g_num_of_iommus; i++) {
4452                 struct intel_iommu *iommu = g_iommus[i];
4453                 struct dmar_domain *domain;
4454                 int did;
4455
4456                 if (!iommu)
4457                         continue;
4458
4459                 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4460                         domain = get_iommu_domain(iommu, (u16)did);
4461
4462                         if (!domain)
4463                                 continue;
4464                         free_cpu_cached_iovas(cpu, &domain->iovad);
4465                 }
4466         }
4467 }
4468
4469 static int intel_iommu_cpu_dead(unsigned int cpu)
4470 {
4471         free_all_cpu_cached_iovas(cpu);
4472         return 0;
4473 }
4474
4475 static void intel_disable_iommus(void)
4476 {
4477         struct intel_iommu *iommu = NULL;
4478         struct dmar_drhd_unit *drhd;
4479
4480         for_each_iommu(iommu, drhd)
4481                 iommu_disable_translation(iommu);
4482 }
4483
4484 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4485 {
4486         struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4487
4488         return container_of(iommu_dev, struct intel_iommu, iommu);
4489 }
4490
4491 static ssize_t intel_iommu_show_version(struct device *dev,
4492                                         struct device_attribute *attr,
4493                                         char *buf)
4494 {
4495         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4496         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4497         return sprintf(buf, "%d:%d\n",
4498                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4499 }
4500 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4501
4502 static ssize_t intel_iommu_show_address(struct device *dev,
4503                                         struct device_attribute *attr,
4504                                         char *buf)
4505 {
4506         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4507         return sprintf(buf, "%llx\n", iommu->reg_phys);
4508 }
4509 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4510
4511 static ssize_t intel_iommu_show_cap(struct device *dev,
4512                                     struct device_attribute *attr,
4513                                     char *buf)
4514 {
4515         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4516         return sprintf(buf, "%llx\n", iommu->cap);
4517 }
4518 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4519
4520 static ssize_t intel_iommu_show_ecap(struct device *dev,
4521                                     struct device_attribute *attr,
4522                                     char *buf)
4523 {
4524         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4525         return sprintf(buf, "%llx\n", iommu->ecap);
4526 }
4527 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4528
4529 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4530                                       struct device_attribute *attr,
4531                                       char *buf)
4532 {
4533         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4534         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4535 }
4536 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4537
4538 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4539                                            struct device_attribute *attr,
4540                                            char *buf)
4541 {
4542         struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4543         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4544                                                   cap_ndoms(iommu->cap)));
4545 }
4546 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4547
4548 static struct attribute *intel_iommu_attrs[] = {
4549         &dev_attr_version.attr,
4550         &dev_attr_address.attr,
4551         &dev_attr_cap.attr,
4552         &dev_attr_ecap.attr,
4553         &dev_attr_domains_supported.attr,
4554         &dev_attr_domains_used.attr,
4555         NULL,
4556 };
4557
4558 static struct attribute_group intel_iommu_group = {
4559         .name = "intel-iommu",
4560         .attrs = intel_iommu_attrs,
4561 };
4562
4563 const struct attribute_group *intel_iommu_groups[] = {
4564         &intel_iommu_group,
4565         NULL,
4566 };
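/*
 * Illustrative sketch, not driver code: the attribute group above is
 * registered through iommu_device_sysfs_add() in intel_iommu_init(), so the
 * fields are expected to appear under sysfs. The exact path below ("dmar0",
 * /sys/class/iommu/.../intel-iommu/) is an assumption for the example only.
 */
#if 0	/* user-space example */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/class/iommu/dmar0/intel-iommu/version", "r");
	char buf[32];

	if (f && fgets(buf, sizeof(buf), f))
		printf("VT-d register version: %s", buf);	/* e.g. "1:0" */
	if (f)
		fclose(f);
	return 0;
}
#endif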
4567
4568 static int __init platform_optin_force_iommu(void)
4569 {
4570         struct pci_dev *pdev = NULL;
4571         bool has_untrusted_dev = false;
4572
4573         if (!dmar_platform_optin() || no_platform_optin)
4574                 return 0;
4575
4576         for_each_pci_dev(pdev) {
4577                 if (pdev->untrusted) {
4578                         has_untrusted_dev = true;
4579                         break;
4580                 }
4581         }
4582
4583         if (!has_untrusted_dev)
4584                 return 0;
4585
4586         if (no_iommu || dmar_disabled)
4587                 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4588
4589         /*
4590          * If Intel-IOMMU is disabled by default, we will apply the identity
4591          * map to all devices except those marked as untrusted.
4592          */
4593         if (dmar_disabled)
4594                 iommu_identity_mapping |= IDENTMAP_ALL;
4595
4596         dmar_disabled = 0;
4597 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4598         swiotlb = 0;
4599 #endif
4600         no_iommu = 0;
4601
4602         return 1;
4603 }
4604
4605 static int __init probe_acpi_namespace_devices(void)
4606 {
4607         struct dmar_drhd_unit *drhd;
4608         /* To avoid a -Wunused-but-set-variable warning. */
4609         struct intel_iommu *iommu __maybe_unused;
4610         struct device *dev;
4611         int i, ret = 0;
4612
4613         for_each_active_iommu(iommu, drhd) {
4614                 for_each_active_dev_scope(drhd->devices,
4615                                           drhd->devices_cnt, i, dev) {
4616                         struct acpi_device_physical_node *pn;
4617                         struct iommu_group *group;
4618                         struct acpi_device *adev;
4619
4620                         if (dev->bus != &acpi_bus_type)
4621                                 continue;
4622
4623                         adev = to_acpi_device(dev);
4624                         mutex_lock(&adev->physical_node_lock);
4625                         list_for_each_entry(pn,
4626                                             &adev->physical_node_list, node) {
4627                                 group = iommu_group_get(pn->dev);
4628                                 if (group) {
4629                                         iommu_group_put(group);
4630                                         continue;
4631                                 }
4632
4633                                 pn->dev->bus->iommu_ops = &intel_iommu_ops;
4634                                 ret = iommu_probe_device(pn->dev);
4635                                 if (ret)
4636                                         break;
4637                         }
4638                         mutex_unlock(&adev->physical_node_lock);
4639
4640                         if (ret)
4641                                 return ret;
4642                 }
4643         }
4644
4645         return 0;
4646 }
4647
4648 int __init intel_iommu_init(void)
4649 {
4650         int ret = -ENODEV;
4651         struct dmar_drhd_unit *drhd;
4652         struct intel_iommu *iommu;
4653
4654         /*
4655          * Intel IOMMU is required for a TXT/tboot launch or platform
4656          * opt in, so enforce that.
4657          */
4658         force_on = tboot_force_iommu() || platform_optin_force_iommu();
4659
4660         if (iommu_init_mempool()) {
4661                 if (force_on)
4662                         panic("tboot: Failed to initialize iommu memory\n");
4663                 return -ENOMEM;
4664         }
4665
4666         down_write(&dmar_global_lock);
4667         if (dmar_table_init()) {
4668                 if (force_on)
4669                         panic("tboot: Failed to initialize DMAR table\n");
4670                 goto out_free_dmar;
4671         }
4672
4673         if (dmar_dev_scope_init() < 0) {
4674                 if (force_on)
4675                         panic("tboot: Failed to initialize DMAR device scope\n");
4676                 goto out_free_dmar;
4677         }
4678
4679         up_write(&dmar_global_lock);
4680
4681         /*
4682          * The bus notifier takes the dmar_global_lock, so lockdep will
4683          * complain later when we register it under the lock.
4684          */
4685         dmar_register_bus_notifier();
4686
4687         down_write(&dmar_global_lock);
4688
4689         if (no_iommu || dmar_disabled) {
4690                 /*
4691                  * We exit the function here to ensure the IOMMU's remapping and
4692                  * mempool aren't set up, which means that the IOMMU's PMRs
4693                  * won't be disabled via the call to init_dmars(). So disable
4694                  * them explicitly here. The PMRs were set up by tboot prior to
4695                  * calling SENTER, but the kernel is expected to reset/tear
4696                  * down the PMRs.
4697                  */
4698                 if (intel_iommu_tboot_noforce) {
4699                         for_each_iommu(iommu, drhd)
4700                                 iommu_disable_protect_mem_regions(iommu);
4701                 }
4702
4703                 /*
4704                  * Make sure the IOMMUs are switched off, even when we
4705                  * boot into a kexec kernel and the previous kernel left
4706                  * them enabled.
4707                  */
4708                 intel_disable_iommus();
4709                 goto out_free_dmar;
4710         }
4711
4712         if (list_empty(&dmar_rmrr_units))
4713                 pr_info("No RMRR found\n");
4714
4715         if (list_empty(&dmar_atsr_units))
4716                 pr_info("No ATSR found\n");
4717
4718         if (dmar_init_reserved_ranges()) {
4719                 if (force_on)
4720                         panic("tboot: Failed to reserve iommu ranges\n");
4721                 goto out_free_reserved_range;
4722         }
4723
4724         if (dmar_map_gfx)
4725                 intel_iommu_gfx_mapped = 1;
4726
4727         init_no_remapping_devices();
4728
4729         ret = init_dmars();
4730         if (ret) {
4731                 if (force_on)
4732                         panic("tboot: Failed to initialize DMARs\n");
4733                 pr_err("Initialization failed\n");
4734                 goto out_free_reserved_range;
4735         }
4736         up_write(&dmar_global_lock);
4737
4738 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4739         swiotlb = 0;
4740 #endif
4741         dma_ops = &intel_dma_ops;
4742
4743         init_iommu_pm_ops();
4744
4745         for_each_active_iommu(iommu, drhd) {
4746                 iommu_device_sysfs_add(&iommu->iommu, NULL,
4747                                        intel_iommu_groups,
4748                                        "%s", iommu->name);
4749                 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4750                 iommu_device_register(&iommu->iommu);
4751         }
4752
4753         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4754         if (si_domain && !hw_pass_through)
4755                 register_memory_notifier(&intel_iommu_memory_nb);
4756         cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4757                           intel_iommu_cpu_dead);
4758
4759         down_read(&dmar_global_lock);
4760         if (probe_acpi_namespace_devices())
4761                 pr_warn("ACPI name space devices didn't probe correctly\n");
4762         up_read(&dmar_global_lock);
4763
4764         /* Finally, we enable the DMA remapping hardware. */
4765         for_each_iommu(iommu, drhd) {
4766                 if (!drhd->ignored && !translation_pre_enabled(iommu))
4767                         iommu_enable_translation(iommu);
4768
4769                 iommu_disable_protect_mem_regions(iommu);
4770         }
4771         pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4772
4773         intel_iommu_enabled = 1;
4774         intel_iommu_debugfs_init();
4775
4776         return 0;
4777
4778 out_free_reserved_range:
4779         put_iova_domain(&reserved_iova_list);
4780 out_free_dmar:
4781         intel_iommu_free_dmars();
4782         up_write(&dmar_global_lock);
4783         iommu_exit_mempool();
4784         return ret;
4785 }
4786
4787 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4788 {
4789         struct intel_iommu *iommu = opaque;
4790
4791         domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4792         return 0;
4793 }
4794
4795 /*
4796  * NB - intel-iommu lacks any sort of reference counting for the users of
4797  * dependent devices.  If multiple endpoints have intersecting dependent
4798  * devices, unbinding the driver from any one of them may leave
4799  * the others unable to operate.
4800  */
4801 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4802 {
4803         if (!iommu || !dev || !dev_is_pci(dev))
4804                 return;
4805
4806         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4807 }
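/*
 * Illustrative sketch, not driver code: pci_for_each_dma_alias() invokes the
 * callback once per requester ID the device may use, e.g. a device behind a
 * PCIe-to-PCI bridge is also seen with the bridge's RID. A hypothetical
 * logging callback with the same shape as domain_context_clear_one_cb():
 */
#if 0
static int log_dma_alias(struct pci_dev *pdev, u16 alias, void *data)
{
	pci_info(pdev, "DMA alias %02x:%02x.%d\n", PCI_BUS_NUM(alias),
		 PCI_SLOT(alias & 0xff), PCI_FUNC(alias & 0xff));
	return 0;	/* keep walking the alias chain */
}
/* pci_for_each_dma_alias(to_pci_dev(dev), log_dma_alias, NULL); */
#endif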
4808
4809 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4810 {
4811         struct dmar_domain *domain;
4812         struct intel_iommu *iommu;
4813         unsigned long flags;
4814
4815         assert_spin_locked(&device_domain_lock);
4816
4817         if (WARN_ON(!info))
4818                 return;
4819
4820         iommu = info->iommu;
4821         domain = info->domain;
4822
4823         if (info->dev) {
4824                 if (dev_is_pci(info->dev) && sm_supported(iommu))
4825                         intel_pasid_tear_down_entry(iommu, info->dev,
4826                                         PASID_RID2PASID);
4827
4828                 iommu_disable_dev_iotlb(info);
4829                 domain_context_clear(iommu, info->dev);
4830                 intel_pasid_free_table(info->dev);
4831         }
4832
4833         unlink_domain_info(info);
4834
4835         spin_lock_irqsave(&iommu->lock, flags);
4836         domain_detach_iommu(domain, iommu);
4837         spin_unlock_irqrestore(&iommu->lock, flags);
4838
4839         /* free the private domain */
4840         if (domain->flags & DOMAIN_FLAG_LOSE_CHILDREN &&
4841             !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY))
4842                 domain_exit(info->domain);
4843
4844         free_devinfo_mem(info);
4845 }
4846
4847 static void dmar_remove_one_dev_info(struct device *dev)
4848 {
4849         struct device_domain_info *info;
4850         unsigned long flags;
4851
4852         spin_lock_irqsave(&device_domain_lock, flags);
4853         info = dev->archdata.iommu;
4854         __dmar_remove_one_dev_info(info);
4855         spin_unlock_irqrestore(&device_domain_lock, flags);
4856 }
4857
4858 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4859 {
4860         int adjust_width;
4861
4862         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4863         domain_reserve_special_ranges(domain);
4864
4865         /* calculate AGAW */
4866         domain->gaw = guest_width;
4867         adjust_width = guestwidth_to_adjustwidth(guest_width);
4868         domain->agaw = width_to_agaw(adjust_width);
4869
4870         domain->iommu_coherency = 0;
4871         domain->iommu_snooping = 0;
4872         domain->iommu_superpage = 0;
4873         domain->max_addr = 0;
4874
4875         /* always allocate the top pgd */
4876         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4877         if (!domain->pgd)
4878                 return -ENOMEM;
4879         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4880         return 0;
4881 }
4882
4883 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4884 {
4885         struct dmar_domain *dmar_domain;
4886         struct iommu_domain *domain;
4887
4888         switch (type) {
4889         case IOMMU_DOMAIN_DMA:
4890         /* fallthrough */
4891         case IOMMU_DOMAIN_UNMANAGED:
4892                 dmar_domain = alloc_domain(0);
4893                 if (!dmar_domain) {
4894                         pr_err("Can't allocate dmar_domain\n");
4895                         return NULL;
4896                 }
4897                 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4898                         pr_err("Domain initialization failed\n");
4899                         domain_exit(dmar_domain);
4900                         return NULL;
4901                 }
4902
4903                 if (type == IOMMU_DOMAIN_DMA &&
4904                     init_iova_flush_queue(&dmar_domain->iovad,
4905                                           iommu_flush_iova, iova_entry_free)) {
4906                         pr_warn("iova flush queue initialization failed\n");
4907                         intel_iommu_strict = 1;
4908                 }
4909
4910                 domain_update_iommu_cap(dmar_domain);
4911
4912                 domain = &dmar_domain->domain;
4913                 domain->geometry.aperture_start = 0;
4914                 domain->geometry.aperture_end   =
4915                                 __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4916                 domain->geometry.force_aperture = true;
4917
4918                 return domain;
4919         case IOMMU_DOMAIN_IDENTITY:
4920                 return &si_domain->domain;
4921         default:
4922                 return NULL;
4923         }
4924
4925         return NULL;
4926 }
4927
4928 static void intel_iommu_domain_free(struct iommu_domain *domain)
4929 {
4930         if (domain != &si_domain->domain)
4931                 domain_exit(to_dmar_domain(domain));
4932 }
4933
4934 /*
4935  * Check whether a @domain could be attached to the @dev through the
4936  * aux-domain attach/detach APIs.
4937  */
4938 static inline bool
4939 is_aux_domain(struct device *dev, struct iommu_domain *domain)
4940 {
4941         struct device_domain_info *info = dev->archdata.iommu;
4942
4943         return info && info->auxd_enabled &&
4944                         domain->type == IOMMU_DOMAIN_UNMANAGED;
4945 }
4946
4947 static void auxiliary_link_device(struct dmar_domain *domain,
4948                                   struct device *dev)
4949 {
4950         struct device_domain_info *info = dev->archdata.iommu;
4951
4952         assert_spin_locked(&device_domain_lock);
4953         if (WARN_ON(!info))
4954                 return;
4955
4956         domain->auxd_refcnt++;
4957         list_add(&domain->auxd, &info->auxiliary_domains);
4958 }
4959
4960 static void auxiliary_unlink_device(struct dmar_domain *domain,
4961                                     struct device *dev)
4962 {
4963         struct device_domain_info *info = dev->archdata.iommu;
4964
4965         assert_spin_locked(&device_domain_lock);
4966         if (WARN_ON(!info))
4967                 return;
4968
4969         list_del(&domain->auxd);
4970         domain->auxd_refcnt--;
4971
4972         if (!domain->auxd_refcnt && domain->default_pasid > 0)
4973                 intel_pasid_free_id(domain->default_pasid);
4974 }
4975
4976 static int aux_domain_add_dev(struct dmar_domain *domain,
4977                               struct device *dev)
4978 {
4979         int ret;
4980         u8 bus, devfn;
4981         unsigned long flags;
4982         struct intel_iommu *iommu;
4983
4984         iommu = device_to_iommu(dev, &bus, &devfn);
4985         if (!iommu)
4986                 return -ENODEV;
4987
4988         if (domain->default_pasid <= 0) {
4989                 int pasid;
4990
4991                 pasid = intel_pasid_alloc_id(domain, PASID_MIN,
4992                                              pci_max_pasids(to_pci_dev(dev)),
4993                                              GFP_KERNEL);
4994                 if (pasid <= 0) {
4995                         pr_err("Can't allocate default pasid\n");
4996                         return -ENODEV;
4997                 }
4998                 domain->default_pasid = pasid;
4999         }
5000
5001         spin_lock_irqsave(&device_domain_lock, flags);
5002         /*
5003          * iommu->lock must be held to attach domain to iommu and setup the
5004          * pasid entry for second level translation.
5005          */
5006         spin_lock(&iommu->lock);
5007         ret = domain_attach_iommu(domain, iommu);
5008         if (ret)
5009                 goto attach_failed;
5010
5011         /* Set up the PASID entry for mediated devices: */
5012         ret = intel_pasid_setup_second_level(iommu, domain, dev,
5013                                              domain->default_pasid);
5014         if (ret)
5015                 goto table_failed;
5016         spin_unlock(&iommu->lock);
5017
5018         auxiliary_link_device(domain, dev);
5019
5020         spin_unlock_irqrestore(&device_domain_lock, flags);
5021
5022         return 0;
5023
5024 table_failed:
5025         domain_detach_iommu(domain, iommu);
5026 attach_failed:
5027         spin_unlock(&iommu->lock);
5028         spin_unlock_irqrestore(&device_domain_lock, flags);
5029         if (!domain->auxd_refcnt && domain->default_pasid > 0)
5030                 intel_pasid_free_id(domain->default_pasid);
5031
5032         return ret;
5033 }
5034
5035 static void aux_domain_remove_dev(struct dmar_domain *domain,
5036                                   struct device *dev)
5037 {
5038         struct device_domain_info *info;
5039         struct intel_iommu *iommu;
5040         unsigned long flags;
5041
5042         if (!is_aux_domain(dev, &domain->domain))
5043                 return;
5044
5045         spin_lock_irqsave(&device_domain_lock, flags);
5046         info = dev->archdata.iommu;
5047         iommu = info->iommu;
5048
5049         auxiliary_unlink_device(domain, dev);
5050
5051         spin_lock(&iommu->lock);
5052         intel_pasid_tear_down_entry(iommu, dev, domain->default_pasid);
5053         domain_detach_iommu(domain, iommu);
5054         spin_unlock(&iommu->lock);
5055
5056         spin_unlock_irqrestore(&device_domain_lock, flags);
5057 }
5058
5059 static int prepare_domain_attach_device(struct iommu_domain *domain,
5060                                         struct device *dev)
5061 {
5062         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5063         struct intel_iommu *iommu;
5064         int addr_width;
5065         u8 bus, devfn;
5066
5067         iommu = device_to_iommu(dev, &bus, &devfn);
5068         if (!iommu)
5069                 return -ENODEV;
5070
5071         /* check if this iommu agaw is sufficient for max mapped address */
5072         addr_width = agaw_to_width(iommu->agaw);
5073         if (addr_width > cap_mgaw(iommu->cap))
5074                 addr_width = cap_mgaw(iommu->cap);
5075
5076         if (dmar_domain->max_addr > (1LL << addr_width)) {
5077                 dev_err(dev, "%s: iommu width (%d) is not "
5078                         "sufficient for the mapped address (%llx)\n",
5079                         __func__, addr_width, dmar_domain->max_addr);
5080                 return -EFAULT;
5081         }
5082         dmar_domain->gaw = addr_width;
5083
5084         /*
5085          * Knock out extra levels of page tables if necessary
5086          */
5087         while (iommu->agaw < dmar_domain->agaw) {
5088                 struct dma_pte *pte;
5089
5090                 pte = dmar_domain->pgd;
5091                 if (dma_pte_present(pte)) {
5092                         dmar_domain->pgd = (struct dma_pte *)
5093                                 phys_to_virt(dma_pte_addr(pte));
5094                         free_pgtable_page(pte);
5095                 }
5096                 dmar_domain->agaw--;
5097         }
5098
5099         return 0;
5100 }
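/*
 * Worked example of the level knock-out above (assuming the usual helpers,
 * LEVEL_STRIDE = 9): if the domain was built for a 57-bit width (agaw 3) but
 * this IOMMU only implements a 48-bit width (agaw 2), every mapping already
 * fits below 1 << 48, so only the first entry of the old top-level table can
 * be populated. The loop therefore promotes the table that entry points to
 * into the new pgd and frees the old root, once per level of difference.
 */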
5101
5102 static int intel_iommu_attach_device(struct iommu_domain *domain,
5103                                      struct device *dev)
5104 {
5105         int ret;
5106
5107         if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
5108             device_is_rmrr_locked(dev)) {
5109                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
5110                 return -EPERM;
5111         }
5112
5113         if (is_aux_domain(dev, domain))
5114                 return -EPERM;
5115
5116         /* normally dev is not mapped */
5117         if (unlikely(domain_context_mapped(dev))) {
5118                 struct dmar_domain *old_domain;
5119
5120                 old_domain = find_domain(dev);
5121                 if (old_domain)
5122                         dmar_remove_one_dev_info(dev);
5123         }
5124
5125         ret = prepare_domain_attach_device(domain, dev);
5126         if (ret)
5127                 return ret;
5128
5129         return domain_add_dev_info(to_dmar_domain(domain), dev);
5130 }
5131
5132 static int intel_iommu_aux_attach_device(struct iommu_domain *domain,
5133                                          struct device *dev)
5134 {
5135         int ret;
5136
5137         if (!is_aux_domain(dev, domain))
5138                 return -EPERM;
5139
5140         ret = prepare_domain_attach_device(domain, dev);
5141         if (ret)
5142                 return ret;
5143
5144         return aux_domain_add_dev(to_dmar_domain(domain), dev);
5145 }
5146
5147 static void intel_iommu_detach_device(struct iommu_domain *domain,
5148                                       struct device *dev)
5149 {
5150         dmar_remove_one_dev_info(dev);
5151 }
5152
5153 static void intel_iommu_aux_detach_device(struct iommu_domain *domain,
5154                                           struct device *dev)
5155 {
5156         aux_domain_remove_dev(to_dmar_domain(domain), dev);
5157 }
5158
5159 static int intel_iommu_map(struct iommu_domain *domain,
5160                            unsigned long iova, phys_addr_t hpa,
5161                            size_t size, int iommu_prot)
5162 {
5163         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5164         u64 max_addr;
5165         int prot = 0;
5166         int ret;
5167
5168         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5169                 return -EINVAL;
5170
5171         if (iommu_prot & IOMMU_READ)
5172                 prot |= DMA_PTE_READ;
5173         if (iommu_prot & IOMMU_WRITE)
5174                 prot |= DMA_PTE_WRITE;
5175         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
5176                 prot |= DMA_PTE_SNP;
5177
5178         max_addr = iova + size;
5179         if (dmar_domain->max_addr < max_addr) {
5180                 u64 end;
5181
5182                 /* check if minimum agaw is sufficient for mapped address */
5183                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
5184                 if (end < max_addr) {
5185                         pr_err("%s: iommu width (%d) is not "
5186                                "sufficient for the mapped address (%llx)\n",
5187                                __func__, dmar_domain->gaw, max_addr);
5188                         return -EFAULT;
5189                 }
5190                 dmar_domain->max_addr = max_addr;
5191         }
5192         /* Round up size to next multiple of PAGE_SIZE, if it and
5193            the low bits of hpa would take us onto the next page */
5194         size = aligned_nrpages(hpa, size);
5195         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
5196                                  hpa >> VTD_PAGE_SHIFT, size, prot);
5197         return ret;
5198 }
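/*
 * Worked example of the rounding above (assuming aligned_nrpages() keeps only
 * the page offset of hpa and page-aligns offset + size): with hpa ending in
 * 0x234 and size = 0x2000, the mapping touches bytes 0x234..0x2233 of its
 * pages, i.e. PAGE_ALIGN(0x234 + 0x2000) >> VTD_PAGE_SHIFT = 3 pages, one
 * more than size / PAGE_SIZE alone would suggest.
 */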
5199
5200 static size_t intel_iommu_unmap(struct iommu_domain *domain,
5201                                 unsigned long iova, size_t size)
5202 {
5203         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5204         struct page *freelist = NULL;
5205         unsigned long start_pfn, last_pfn;
5206         unsigned int npages;
5207         int iommu_id, level = 0;
5208
5209         /* Cope with horrid API which requires us to unmap more than the
5210            size argument if it happens to be a large-page mapping. */
5211         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
5212         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5213                 return 0;
5214
5215         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
5216                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
5217
5218         start_pfn = iova >> VTD_PAGE_SHIFT;
5219         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
5220
5221         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
5222
5223         npages = last_pfn - start_pfn + 1;
5224
5225         for_each_domain_iommu(iommu_id, dmar_domain)
5226                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
5227                                       start_pfn, npages, !freelist, 0);
5228
5229         dma_free_pagelist(freelist);
5230
5231         if (dmar_domain->max_addr == iova + size)
5232                 dmar_domain->max_addr = iova;
5233
5234         return size;
5235 }
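/*
 * Worked example of the size fixup above (assuming level_to_offset_bits()
 * returns (level - 1) * 9): if the PTE backing the iova is a level-2
 * superpage, a 4KiB unmap request is widened to VTD_PAGE_SIZE << 9 = 2MiB,
 * the whole superpage is unmapped and flushed, and the larger size is
 * returned to the caller.
 */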
5236
5237 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
5238                                             dma_addr_t iova)
5239 {
5240         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5241         struct dma_pte *pte;
5242         int level = 0;
5243         u64 phys = 0;
5244
5245         if (dmar_domain->flags & DOMAIN_FLAG_LOSE_CHILDREN)
5246                 return 0;
5247
5248         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
5249         if (pte)
5250                 phys = dma_pte_addr(pte);
5251
5252         return phys;
5253 }
5254
5255 static inline bool scalable_mode_support(void)
5256 {
5257         struct dmar_drhd_unit *drhd;
5258         struct intel_iommu *iommu;
5259         bool ret = true;
5260
5261         rcu_read_lock();
5262         for_each_active_iommu(iommu, drhd) {
5263                 if (!sm_supported(iommu)) {
5264                         ret = false;
5265                         break;
5266                 }
5267         }
5268         rcu_read_unlock();
5269
5270         return ret;
5271 }
5272
5273 static inline bool iommu_pasid_support(void)
5274 {
5275         struct dmar_drhd_unit *drhd;
5276         struct intel_iommu *iommu;
5277         bool ret = true;
5278
5279         rcu_read_lock();
5280         for_each_active_iommu(iommu, drhd) {
5281                 if (!pasid_supported(iommu)) {
5282                         ret = false;
5283                         break;
5284                 }
5285         }
5286         rcu_read_unlock();
5287
5288         return ret;
5289 }
5290
5291 static bool intel_iommu_capable(enum iommu_cap cap)
5292 {
5293         if (cap == IOMMU_CAP_CACHE_COHERENCY)
5294                 return domain_update_iommu_snooping(NULL) == 1;
5295         if (cap == IOMMU_CAP_INTR_REMAP)
5296                 return irq_remapping_enabled == 1;
5297
5298         return false;
5299 }
5300
5301 static int intel_iommu_add_device(struct device *dev)
5302 {
5303         struct dmar_domain *dmar_domain;
5304         struct iommu_domain *domain;
5305         struct intel_iommu *iommu;
5306         struct iommu_group *group;
5307         u8 bus, devfn;
5308         int ret;
5309
5310         iommu = device_to_iommu(dev, &bus, &devfn);
5311         if (!iommu)
5312                 return -ENODEV;
5313
5314         iommu_device_link(&iommu->iommu, dev);
5315
5316         if (translation_pre_enabled(iommu))
5317                 dev->archdata.iommu = DEFER_DEVICE_DOMAIN_INFO;
5318
5319         group = iommu_group_get_for_dev(dev);
5320
5321         if (IS_ERR(group))
5322                 return PTR_ERR(group);
5323
5324         iommu_group_put(group);
5325
5326         domain = iommu_get_domain_for_dev(dev);
5327         dmar_domain = to_dmar_domain(domain);
5328         if (domain->type == IOMMU_DOMAIN_DMA) {
5329                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_IDENTITY) {
5330                         ret = iommu_request_dm_for_dev(dev);
5331                         if (ret) {
5332                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5333                                 domain_add_dev_info(si_domain, dev);
5334                                 dev_info(dev,
5335                                          "Device uses a private identity domain.\n");
5336                         }
5337                 }
5338         } else {
5339                 if (device_def_domain_type(dev) == IOMMU_DOMAIN_DMA) {
5340                         ret = iommu_request_dma_domain_for_dev(dev);
5341                         if (ret) {
5342                                 dmar_domain->flags |= DOMAIN_FLAG_LOSE_CHILDREN;
5343                                 if (!get_private_domain_for_dev(dev)) {
5344                                         dev_warn(dev,
5345                                                  "Failed to get a private domain.\n");
5346                                         return -ENOMEM;
5347                                 }
5348
5349                                 dev_info(dev,
5350                                          "Device uses a private dma domain.\n");
5351                         }
5352                 }
5353         }
5354
5355         return 0;
5356 }
5357
5358 static void intel_iommu_remove_device(struct device *dev)
5359 {
5360         struct intel_iommu *iommu;
5361         u8 bus, devfn;
5362
5363         iommu = device_to_iommu(dev, &bus, &devfn);
5364         if (!iommu)
5365                 return;
5366
5367         iommu_group_remove_device(dev);
5368
5369         iommu_device_unlink(&iommu->iommu, dev);
5370 }
5371
5372 static void intel_iommu_get_resv_regions(struct device *device,
5373                                          struct list_head *head)
5374 {
5375         int prot = DMA_PTE_READ | DMA_PTE_WRITE;
5376         struct iommu_resv_region *reg;
5377         struct dmar_rmrr_unit *rmrr;
5378         struct device *i_dev;
5379         int i;
5380
5381         down_read(&dmar_global_lock);
5382         for_each_rmrr_units(rmrr) {
5383                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
5384                                           i, i_dev) {
5385                         struct iommu_resv_region *resv;
5386                         enum iommu_resv_type type;
5387                         size_t length;
5388
5389                         if (i_dev != device &&
5390                             !is_downstream_to_pci_bridge(device, i_dev))
5391                                 continue;
5392
5393                         length = rmrr->end_address - rmrr->base_address + 1;
5394
5395                         type = device_rmrr_is_relaxable(device) ?
5396                                 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
5397
5398                         resv = iommu_alloc_resv_region(rmrr->base_address,
5399                                                        length, prot, type);
5400                         if (!resv)
5401                                 break;
5402
5403                         list_add_tail(&resv->list, head);
5404                 }
5405         }
5406         up_read(&dmar_global_lock);
5407
5408 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
5409         if (dev_is_pci(device)) {
5410                 struct pci_dev *pdev = to_pci_dev(device);
5411
5412                 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
5413                         reg = iommu_alloc_resv_region(0, 1UL << 24, 0,
5414                                                       IOMMU_RESV_DIRECT);
5415                         if (reg)
5416                                 list_add_tail(&reg->list, head);
5417                 }
5418         }
5419 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
5420
5421         reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
5422                                       IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
5423                                       0, IOMMU_RESV_MSI);
5424         if (!reg)
5425                 return;
5426         list_add_tail(&reg->list, head);
5427 }
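/*
 * For a typical device the list built above contains one IOMMU_RESV_DIRECT
 * (or IOMMU_RESV_DIRECT_RELAXABLE) entry per matching RMRR, optionally a
 * 16MiB direct entry for ISA bridges when CONFIG_INTEL_IOMMU_FLOPPY_WA is
 * set, and always the IOAPIC/MSI window 0xfee00000-0xfeefffff as
 * IOMMU_RESV_MSI.
 */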
5428
5429 static void intel_iommu_put_resv_regions(struct device *dev,
5430                                          struct list_head *head)
5431 {
5432         struct iommu_resv_region *entry, *next;
5433
5434         list_for_each_entry_safe(entry, next, head, list)
5435                 kfree(entry);
5436 }
5437
5438 int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct device *dev)
5439 {
5440         struct device_domain_info *info;
5441         struct context_entry *context;
5442         struct dmar_domain *domain;
5443         unsigned long flags;
5444         u64 ctx_lo;
5445         int ret;
5446
5447         domain = find_domain(dev);
5448         if (!domain)
5449                 return -EINVAL;
5450
5451         spin_lock_irqsave(&device_domain_lock, flags);
5452         spin_lock(&iommu->lock);
5453
5454         ret = -EINVAL;
5455         info = dev->archdata.iommu;
5456         if (!info || !info->pasid_supported)
5457                 goto out;
5458
5459         context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
5460         if (WARN_ON(!context))
5461                 goto out;
5462
5463         ctx_lo = context[0].lo;
5464
5465         if (!(ctx_lo & CONTEXT_PASIDE)) {
5466                 ctx_lo |= CONTEXT_PASIDE;
5467                 context[0].lo = ctx_lo;
5468                 wmb();
5469                 iommu->flush.flush_context(iommu,
5470                                            domain->iommu_did[iommu->seq_id],
5471                                            PCI_DEVID(info->bus, info->devfn),
5472                                            DMA_CCMD_MASK_NOBIT,
5473                                            DMA_CCMD_DEVICE_INVL);
5474         }
5475
5476         /* Enable PASID support in the device, if it wasn't already */
5477         if (!info->pasid_enabled)
5478                 iommu_enable_dev_iotlb(info);
5479
5480         ret = 0;
5481
5482  out:
5483         spin_unlock(&iommu->lock);
5484         spin_unlock_irqrestore(&device_domain_lock, flags);
5485
5486         return ret;
5487 }
5488
5489 static void intel_iommu_apply_resv_region(struct device *dev,
5490                                           struct iommu_domain *domain,
5491                                           struct iommu_resv_region *region)
5492 {
5493         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5494         unsigned long start, end;
5495
5496         start = IOVA_PFN(region->start);
5497         end   = IOVA_PFN(region->start + region->length - 1);
5498
5499         WARN_ON_ONCE(!reserve_iova(&dmar_domain->iovad, start, end));
5500 }
5501
5502 #ifdef CONFIG_INTEL_IOMMU_SVM
5503 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5504 {
5505         struct intel_iommu *iommu;
5506         u8 bus, devfn;
5507
5508         if (iommu_dummy(dev)) {
5509                 dev_warn(dev,
5510                          "No IOMMU translation for device; cannot enable SVM\n");
5511                 return NULL;
5512         }
5513
5514         iommu = device_to_iommu(dev, &bus, &devfn);
5515         if (!iommu) {
5516                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5517                 return NULL;
5518         }
5519
5520         return iommu;
5521 }
5522 #endif /* CONFIG_INTEL_IOMMU_SVM */
5523
5524 static int intel_iommu_enable_auxd(struct device *dev)
5525 {
5526         struct device_domain_info *info;
5527         struct intel_iommu *iommu;
5528         unsigned long flags;
5529         u8 bus, devfn;
5530         int ret;
5531
5532         iommu = device_to_iommu(dev, &bus, &devfn);
5533         if (!iommu || dmar_disabled)
5534                 return -EINVAL;
5535
5536         if (!sm_supported(iommu) || !pasid_supported(iommu))
5537                 return -EINVAL;
5538
5539         ret = intel_iommu_enable_pasid(iommu, dev);
5540         if (ret)
5541                 return -ENODEV;
5542
5543         spin_lock_irqsave(&device_domain_lock, flags);
5544         info = dev->archdata.iommu;
5545         info->auxd_enabled = 1;
5546         spin_unlock_irqrestore(&device_domain_lock, flags);
5547
5548         return 0;
5549 }
5550
5551 static int intel_iommu_disable_auxd(struct device *dev)
5552 {
5553         struct device_domain_info *info;
5554         unsigned long flags;
5555
5556         spin_lock_irqsave(&device_domain_lock, flags);
5557         info = dev->archdata.iommu;
5558         if (!WARN_ON(!info))
5559                 info->auxd_enabled = 0;
5560         spin_unlock_irqrestore(&device_domain_lock, flags);
5561
5562         return 0;
5563 }
5564
5565 /*
5566  * A PCI Express Designated Vendor-Specific Extended Capability (DVSEC) is
5567  * defined in section 3.7 of the Intel Scalable I/O Virtualization technical
5568  * spec so that system software and tools can detect endpoint devices that
5569  * support Intel Scalable I/O Virtualization without a host driver dependency.
5570  *
5571  * Returns the address of the matching extended capability structure within
5572  * the device's PCI configuration space or 0 if the device does not support
5573  * it.
5574  */
5575 static int siov_find_pci_dvsec(struct pci_dev *pdev)
5576 {
5577         int pos;
5578         u16 vendor, id;
5579
5580         pos = pci_find_next_ext_capability(pdev, 0, 0x23);
5581         while (pos) {
5582                 pci_read_config_word(pdev, pos + 4, &vendor);
5583                 pci_read_config_word(pdev, pos + 8, &id);
5584                 if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
5585                         return pos;
5586
5587                 pos = pci_find_next_ext_capability(pdev, pos, 0x23);
5588         }
5589
5590         return 0;
5591 }
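/*
 * The offsets read above follow the PCIe DVSEC layout: the word at pos + 4
 * (DVSEC Header 1) carries the vendor ID in its low 16 bits and the word at
 * pos + 8 (DVSEC Header 2) carries the DVSEC ID, so the loop matches
 * "vendor Intel, DVSEC ID 5", the ID this driver uses for Scalable IOV.
 */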
5592
5593 static bool
5594 intel_iommu_dev_has_feat(struct device *dev, enum iommu_dev_features feat)
5595 {
5596         if (feat == IOMMU_DEV_FEAT_AUX) {
5597                 int ret;
5598
5599                 if (!dev_is_pci(dev) || dmar_disabled ||
5600                     !scalable_mode_support() || !iommu_pasid_support())
5601                         return false;
5602
5603                 ret = pci_pasid_features(to_pci_dev(dev));
5604                 if (ret < 0)
5605                         return false;
5606
5607                 return !!siov_find_pci_dvsec(to_pci_dev(dev));
5608         }
5609
5610         return false;
5611 }
5612
5613 static int
5614 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
5615 {
5616         if (feat == IOMMU_DEV_FEAT_AUX)
5617                 return intel_iommu_enable_auxd(dev);
5618
5619         return -ENODEV;
5620 }
5621
5622 static int
5623 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
5624 {
5625         if (feat == IOMMU_DEV_FEAT_AUX)
5626                 return intel_iommu_disable_auxd(dev);
5627
5628         return -ENODEV;
5629 }
5630
5631 static bool
5632 intel_iommu_dev_feat_enabled(struct device *dev, enum iommu_dev_features feat)
5633 {
5634         struct device_domain_info *info = dev->archdata.iommu;
5635
5636         if (feat == IOMMU_DEV_FEAT_AUX)
5637                 return scalable_mode_support() && info && info->auxd_enabled;
5638
5639         return false;
5640 }
5641
5642 static int
5643 intel_iommu_aux_get_pasid(struct iommu_domain *domain, struct device *dev)
5644 {
5645         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
5646
5647         return dmar_domain->default_pasid > 0 ?
5648                         dmar_domain->default_pasid : -EINVAL;
5649 }
5650
5651 static bool intel_iommu_is_attach_deferred(struct iommu_domain *domain,
5652                                            struct device *dev)
5653 {
5654         return dev->archdata.iommu == DEFER_DEVICE_DOMAIN_INFO;
5655 }
5656
5657 const struct iommu_ops intel_iommu_ops = {
5658         .capable                = intel_iommu_capable,
5659         .domain_alloc           = intel_iommu_domain_alloc,
5660         .domain_free            = intel_iommu_domain_free,
5661         .attach_dev             = intel_iommu_attach_device,
5662         .detach_dev             = intel_iommu_detach_device,
5663         .aux_attach_dev         = intel_iommu_aux_attach_device,
5664         .aux_detach_dev         = intel_iommu_aux_detach_device,
5665         .aux_get_pasid          = intel_iommu_aux_get_pasid,
5666         .map                    = intel_iommu_map,
5667         .unmap                  = intel_iommu_unmap,
5668         .iova_to_phys           = intel_iommu_iova_to_phys,
5669         .add_device             = intel_iommu_add_device,
5670         .remove_device          = intel_iommu_remove_device,
5671         .get_resv_regions       = intel_iommu_get_resv_regions,
5672         .put_resv_regions       = intel_iommu_put_resv_regions,
5673         .apply_resv_region      = intel_iommu_apply_resv_region,
5674         .device_group           = pci_device_group,
5675         .dev_has_feat           = intel_iommu_dev_has_feat,
5676         .dev_feat_enabled       = intel_iommu_dev_feat_enabled,
5677         .dev_enable_feat        = intel_iommu_dev_enable_feat,
5678         .dev_disable_feat       = intel_iommu_dev_disable_feat,
5679         .is_attach_deferred     = intel_iommu_is_attach_deferred,
5680         .pgsize_bitmap          = INTEL_IOMMU_PGSIZES,
5681 };
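/*
 * Illustrative sketch, not driver code: how another kernel driver would reach
 * the ops table above through the generic IOMMU API (the IOVA value and error
 * handling are arbitrary choices for the example).
 */
#if 0
static int example_map_one_page(struct device *dev, phys_addr_t paddr)
{
	struct iommu_domain *domain;
	int ret;

	domain = iommu_domain_alloc(&pci_bus_type);	/* -> intel_iommu_domain_alloc() */
	if (!domain)
		return -ENOMEM;

	ret = iommu_attach_device(domain, dev);		/* -> intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	ret = iommu_map(domain, 0x100000, paddr, SZ_4K,	/* -> intel_iommu_map() */
			IOMMU_READ | IOMMU_WRITE);
	if (!ret)
		iommu_unmap(domain, 0x100000, SZ_4K);	/* -> intel_iommu_unmap() */

	iommu_detach_device(domain, dev);
out_free:
	iommu_domain_free(domain);
	return ret;
}
#endif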
5682
5683 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
5684 {
5685         /* G4x/GM45 integrated gfx dmar support is totally busted. */
5686         pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
5687         dmar_map_gfx = 0;
5688 }
5689
5690 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
5691 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
5692 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
5693 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
5694 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
5695 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
5696 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5697
5698 static void quirk_iommu_rwbf(struct pci_dev *dev)
5699 {
5700         /*
5701          * Mobile 4 Series Chipset neglects to set RWBF capability,
5702          * but needs it. Same seems to hold for the desktop versions.
5703          */
5704         pci_info(dev, "Forcing write-buffer flush capability\n");
5705         rwbf_quirk = 1;
5706 }
5707
5708 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
5709 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
5710 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
5711 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
5712 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
5713 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
5714 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5715
5716 #define GGC 0x52
5717 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
5718 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
5719 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
5720 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
5721 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
5722 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
5723 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
5724 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
5725
5726 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5727 {
5728         unsigned short ggc;
5729
5730         if (pci_read_config_word(dev, GGC, &ggc))
5731                 return;
5732
5733         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5734                 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5735                 dmar_map_gfx = 0;
5736         } else if (dmar_map_gfx) {
5737                 /* we have to ensure the gfx device is idle before we flush */
5738                 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
5739                 intel_iommu_strict = 1;
5740         }
5741 }
5742 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5743 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5744 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5745 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5746
5747 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5748    ISOCH DMAR unit for the Azalia sound device, but not give it any
5749    TLB entries, which causes it to deadlock. Check for that.  We do
5750    this in a function called from init_dmars(), instead of in a PCI
5751    quirk, because we don't want to print the obnoxious "BIOS broken"
5752    message if VT-d is actually disabled.
5753 */
5754 static void __init check_tylersburg_isoch(void)
5755 {
5756         struct pci_dev *pdev;
5757         uint32_t vtisochctrl;
5758
5759         /* If there's no Azalia in the system anyway, forget it. */
5760         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
5761         if (!pdev)
5762                 return;
5763         pci_dev_put(pdev);
5764
5765         /* System Management Registers. Might be hidden, in which case
5766            we can't do the sanity check. But that's OK, because the
5767            known-broken BIOSes _don't_ actually hide it, so far. */
5768         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
5769         if (!pdev)
5770                 return;
5771
5772         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
5773                 pci_dev_put(pdev);
5774                 return;
5775         }
5776
5777         pci_dev_put(pdev);
5778
5779         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
5780         if (vtisochctrl & 1)
5781                 return;
5782
5783         /* Drop all bits other than the number of TLB entries */
5784         vtisochctrl &= 0x1c;
5785
5786         /* If we have the recommended number of TLB entries (16), fine. */
5787         if (vtisochctrl == 0x10)
5788                 return;
5789
5790         /* Zero TLB entries? You get to ride the short bus to school. */
5791         if (!vtisochctrl) {
5792                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
5793                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
5794                      dmi_get_system_info(DMI_BIOS_VENDOR),
5795                      dmi_get_system_info(DMI_BIOS_VERSION),
5796                      dmi_get_system_info(DMI_PRODUCT_VERSION));
5797                 iommu_identity_mapping |= IDENTMAP_AZALIA;
5798                 return;
5799         }
5800
5801         pr_warn("Recommended number of TLB entries for the ISOCH unit is 16; your BIOS set %d\n",
5802                vtisochctrl);
5803 }