2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
22 #define dev_fmt(fmt) pr_fmt(fmt)
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/memory.h>
37 #include <linux/cpu.h>
38 #include <linux/timer.h>
40 #include <linux/iova.h>
41 #include <linux/iommu.h>
42 #include <linux/intel-iommu.h>
43 #include <linux/syscore_ops.h>
44 #include <linux/tboot.h>
45 #include <linux/dmi.h>
46 #include <linux/pci-ats.h>
47 #include <linux/memblock.h>
48 #include <linux/dma-contiguous.h>
49 #include <linux/dma-direct.h>
50 #include <linux/crash_dump.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/cacheflush.h>
53 #include <asm/iommu.h>
55 #include "irq_remapping.h"
56 #include "intel-pasid.h"
58 #define ROOT_SIZE VTD_PAGE_SIZE
59 #define CONTEXT_SIZE VTD_PAGE_SIZE
61 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
62 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
63 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
64 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
66 #define IOAPIC_RANGE_START (0xfee00000)
67 #define IOAPIC_RANGE_END (0xfeefffff)
68 #define IOVA_START_ADDR (0x1000)
70 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
72 #define MAX_AGAW_WIDTH 64
73 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
75 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
76 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
78 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
79 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
80 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
81 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
82 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
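/*
 * Editor's worked example of the limits above (illustrative, not part
 * of the driver). With VTD_PAGE_SHIFT == 12 and gaw == 48:
 *
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1	(~64G 4KiB frames)
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1	(256TiB - 1)
 *
 * On a 32-bit kernel DOMAIN_MAX_PFN(48) clamps to ULONG_MAX, so PFNs
 * always fit in an unsigned long, as the comment above explains.
 */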
84 /* IO virtual address start page frame number */
85 #define IOVA_START_PFN (1)
87 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
89 /* page table handling */
90 #define LEVEL_STRIDE (9)
91 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
94 * This bitmap is used to advertise the page sizes our hardware supports
95 * to the IOMMU core, which will then use this information to split
96 * physically contiguous memory regions it is mapping into page sizes
97 * that we support.
99 * Traditionally the IOMMU core just handed us the mappings directly,
100 * after making sure the size is a power-of-two multiple of 4KiB and that
101 * the mapping has natural alignment.
103 * To retain this behavior, we currently advertise that we support
104 * all page sizes that are a power-of-two multiple of 4KiB.
106 * If at some point we'd like to utilize the IOMMU core's new behavior,
107 * we could change this to advertise the real page sizes we support.
109 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
111 static inline int agaw_to_level(int agaw)
116 static inline int agaw_to_width(int agaw)
118 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
121 static inline int width_to_agaw(int width)
123 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
126 static inline unsigned int level_to_offset_bits(int level)
128 return (level - 1) * LEVEL_STRIDE;
131 static inline int pfn_level_offset(unsigned long pfn, int level)
133 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
136 static inline unsigned long level_mask(int level)
138 return -1UL << level_to_offset_bits(level);
141 static inline unsigned long level_size(int level)
143 return 1UL << level_to_offset_bits(level);
146 static inline unsigned long align_to_level(unsigned long pfn, int level)
148 return (pfn + level_size(level) - 1) & level_mask(level);
151 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
153 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
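/*
 * Editor's sketch of the level arithmetic above (illustrative only).
 * With LEVEL_STRIDE == 9, each page-table level indexes nine bits of
 * the DMA PFN:
 *
 *	level_to_offset_bits(1) == 0, level_to_offset_bits(2) == 9;
 *	pfn_level_offset(0x12345, 2) == (0x12345 >> 9) & 0x1ff == 0x91;
 *	level_size(2) == 512 pages, i.e. 2MiB of IOVA per level-2 entry.
 */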
156 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
157 are never going to work. */
158 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
160 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
163 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
165 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
167 static inline unsigned long page_to_dma_pfn(struct page *pg)
169 return mm_to_dma_pfn(page_to_pfn(pg));
171 static inline unsigned long virt_to_dma_pfn(void *p)
173 return page_to_dma_pfn(virt_to_page(p));
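/*
 * Editor's note: on x86, PAGE_SHIFT == VTD_PAGE_SHIFT == 12, so the two
 * conversions above are identity (a shift by zero). They only matter
 * when MM pages are larger than 4KiB VT-d pages; e.g. a hypothetical
 * 64KiB PAGE_SIZE would give:
 *
 *	mm_to_dma_pfn(1) == 16 and dma_to_mm_pfn(16) == 1
 */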
176 /* global iommu list, set NULL for ignored DMAR units */
177 static struct intel_iommu **g_iommus;
179 static void __init check_tylersburg_isoch(void);
180 static int rwbf_quirk;
183 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
184 * (used when the kernel is launched with TXT)
186 static int force_on = 0;
187 int intel_iommu_tboot_noforce;
188 static int no_platform_optin;
190 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
193 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
196 static phys_addr_t root_entry_lctp(struct root_entry *re)
201 return re->lo & VTD_PAGE_MASK;
205 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
208 static phys_addr_t root_entry_uctp(struct root_entry *re)
213 return re->hi & VTD_PAGE_MASK;
216 static inline void context_clear_pasid_enable(struct context_entry *context)
218 context->lo &= ~(1ULL << 11);
221 static inline bool context_pasid_enabled(struct context_entry *context)
223 return !!(context->lo & (1ULL << 11));
226 static inline void context_set_copied(struct context_entry *context)
228 context->hi |= (1ull << 3);
231 static inline bool context_copied(struct context_entry *context)
233 return !!(context->hi & (1ULL << 3));
236 static inline bool __context_present(struct context_entry *context)
238 return (context->lo & 1);
241 bool context_present(struct context_entry *context)
243 return context_pasid_enabled(context) ?
244 __context_present(context) :
245 __context_present(context) && !context_copied(context);
248 static inline void context_set_present(struct context_entry *context)
253 static inline void context_set_fault_enable(struct context_entry *context)
255 context->lo &= (((u64)-1) << 2) | 1;
258 static inline void context_set_translation_type(struct context_entry *context,
261 context->lo &= (((u64)-1) << 4) | 3;
262 context->lo |= (value & 3) << 2;
265 static inline void context_set_address_root(struct context_entry *context,
268 context->lo &= ~VTD_PAGE_MASK;
269 context->lo |= value & VTD_PAGE_MASK;
272 static inline void context_set_address_width(struct context_entry *context,
275 context->hi |= value & 7;
278 static inline void context_set_domain_id(struct context_entry *context,
281 context->hi |= (value & ((1 << 16) - 1)) << 8;
284 static inline int context_domain_id(struct context_entry *c)
286 return((c->hi >> 8) & 0xffff);
289 static inline void context_clear_entry(struct context_entry *context)
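/*
 * Editor's summary of the legacy context-entry layout implied by the
 * accessors above (bit positions read from the code, not restated from
 * the VT-d spec):
 *
 *	lo[0]		present bit	(context_set_present)
 *	lo[1]		fault processing disable (cleared by
 *			 context_set_fault_enable)
 *	lo[3:2]		translation type (context_set_translation_type)
 *	lo[63:12]	address root	(context_set_address_root)
 *	hi[2:0]		address width	(context_set_address_width)
 *	hi[23:8]	domain id	(context_set_domain_id)
 */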
296 * This domain is a static identity-mapping domain.
297 * 1. This domain creates a static 1:1 mapping of all usable memory.
298 * 2. It maps to each iommu if successful.
299 * 3. Each iommu maps to this domain if successful.
301 static struct dmar_domain *si_domain;
302 static int hw_pass_through = 1;
305 * Domain represents a virtual machine; more than one device
306 * across iommus may be owned by one domain, e.g. a kvm guest.
308 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
310 /* si_domain contains multiple devices */
311 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
313 #define for_each_domain_iommu(idx, domain) \
314 for (idx = 0; idx < g_num_of_iommus; idx++) \
315 if (domain->iommu_refcnt[idx])
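/*
 * Editor's usage sketch for the iterator above (illustrative):
 *
 *	int idx;
 *
 *	for_each_domain_iommu(idx, domain) {
 *		struct intel_iommu *iommu = g_iommus[idx];
 *		// per-IOMMU work, e.g. an IOTLB flush
 *	}
 *
 * Only indices with a non-zero iommu_refcnt are visited, i.e. the
 * IOMMUs the domain is currently attached to.
 */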
317 struct dmar_rmrr_unit {
318 struct list_head list; /* list of rmrr units */
319 struct acpi_dmar_header *hdr; /* ACPI header */
320 u64 base_address; /* reserved base address*/
321 u64 end_address; /* reserved end address */
322 struct dmar_dev_scope *devices; /* target devices */
323 int devices_cnt; /* target device count */
324 struct iommu_resv_region *resv; /* reserved region handle */
327 struct dmar_atsr_unit {
328 struct list_head list; /* list of ATSR units */
329 struct acpi_dmar_header *hdr; /* ACPI header */
330 struct dmar_dev_scope *devices; /* target devices */
331 int devices_cnt; /* target device count */
332 u8 include_all:1; /* include all ports */
335 static LIST_HEAD(dmar_atsr_units);
336 static LIST_HEAD(dmar_rmrr_units);
338 #define for_each_rmrr_units(rmrr) \
339 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
341 /* number of IOMMUs, used to size and index g_iommus */
342 static int g_num_of_iommus;
344 static void domain_exit(struct dmar_domain *domain);
345 static void domain_remove_dev_info(struct dmar_domain *domain);
346 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
348 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
349 static void domain_context_clear(struct intel_iommu *iommu,
351 static int domain_detach_iommu(struct dmar_domain *domain,
352 struct intel_iommu *iommu);
354 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
355 int dmar_disabled = 0;
357 int dmar_disabled = 1;
358 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
360 int intel_iommu_enabled = 0;
361 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
363 static int dmar_map_gfx = 1;
364 static int dmar_forcedac;
365 static int intel_iommu_strict;
366 static int intel_iommu_superpage = 1;
367 static int intel_iommu_sm = 1;
368 static int iommu_identity_mapping;
370 #define IDENTMAP_ALL 1
371 #define IDENTMAP_GFX 2
372 #define IDENTMAP_AZALIA 4
374 #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
375 #define pasid_supported(iommu) (sm_supported(iommu) && \
376 ecap_pasid((iommu)->ecap))
378 int intel_iommu_gfx_mapped;
379 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
381 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
382 static DEFINE_SPINLOCK(device_domain_lock);
383 static LIST_HEAD(device_domain_list);
386 * Iterate over elements in device_domain_list and call the specified
387 * callback @fn against each element.
389 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
390 void *data), void *data)
394 struct device_domain_info *info;
396 spin_lock_irqsave(&device_domain_lock, flags);
397 list_for_each_entry(info, &device_domain_list, global) {
398 ret = fn(info, data);
400 spin_unlock_irqrestore(&device_domain_lock, flags);
404 spin_unlock_irqrestore(&device_domain_lock, flags);
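/*
 * Editor's sketch of a for_each_device_domain() callback (illustrative;
 * match_bus() is a made-up example, not a driver function). A non-zero
 * return from the callback stops the walk, as the (elided) early-return
 * path above shows:
 *
 *	static int match_bus(struct device_domain_info *info, void *data)
 *	{
 *		u8 bus = *(u8 *)data;
 *
 *		return info->bus == bus;
 *	}
 *
 *	u8 bus = 0x20;
 *	for_each_device_domain(match_bus, &bus);
 */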
409 const struct iommu_ops intel_iommu_ops;
411 static bool translation_pre_enabled(struct intel_iommu *iommu)
413 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
416 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
418 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
421 static void init_translation_status(struct intel_iommu *iommu)
425 gsts = readl(iommu->reg + DMAR_GSTS_REG);
426 if (gsts & DMA_GSTS_TES)
427 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
430 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
431 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
433 return container_of(dom, struct dmar_domain, domain);
436 static int __init intel_iommu_setup(char *str)
441 if (!strncmp(str, "on", 2)) {
443 pr_info("IOMMU enabled\n");
444 } else if (!strncmp(str, "off", 3)) {
446 no_platform_optin = 1;
447 pr_info("IOMMU disabled\n");
448 } else if (!strncmp(str, "igfx_off", 8)) {
450 pr_info("Disable GFX device mapping\n");
451 } else if (!strncmp(str, "forcedac", 8)) {
452 pr_info("Forcing DAC for PCI devices\n");
454 } else if (!strncmp(str, "strict", 6)) {
455 pr_info("Disable batched IOTLB flush\n");
456 intel_iommu_strict = 1;
457 } else if (!strncmp(str, "sp_off", 6)) {
458 pr_info("Disable supported super page\n");
459 intel_iommu_superpage = 0;
460 } else if (!strncmp(str, "sm_off", 6)) {
461 pr_info("Intel-IOMMU: disable scalable mode support\n");
463 } else if (!strncmp(str, "tboot_noforce", 13)) {
465 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
466 intel_iommu_tboot_noforce = 1;
469 str += strcspn(str, ",");
475 __setup("intel_iommu=", intel_iommu_setup);
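/*
 * Editor's note: example kernel command lines accepted by the parser
 * above; options are comma-separated, which is what the strcspn() walk
 * at the end of the loop implements:
 *
 *	intel_iommu=on
 *	intel_iommu=on,strict,sp_off
 *	intel_iommu=off
 */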
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
480 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
482 struct dmar_domain **domains;
485 domains = iommu->domains[idx];
489 return domains[did & 0xff];
492 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
493 struct dmar_domain *domain)
495 struct dmar_domain **domains;
498 if (!iommu->domains[idx]) {
499 size_t size = 256 * sizeof(struct dmar_domain *);
500 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
503 domains = iommu->domains[idx];
504 if (WARN_ON(!domains))
507 domains[did & 0xff] = domain;
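/*
 * Editor's note: iommu->domains is a two-level table indexed by domain
 * id; the top level holds pointers to 256-entry pages allocated on
 * demand. Assuming idx == did >> 8 in the elided index computation, a
 * lookup is effectively:
 *
 *	domain = iommu->domains[did >> 8][did & 0xff];
 *
 * which keeps memory usage small when only a few of the up to 65536
 * domain ids are in use.
 */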
510 void *alloc_pgtable_page(int node)
515 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
517 vaddr = page_address(page);
521 void free_pgtable_page(void *vaddr)
523 free_page((unsigned long)vaddr);
526 static inline void *alloc_domain_mem(void)
528 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
531 static void free_domain_mem(void *vaddr)
533 kmem_cache_free(iommu_domain_cache, vaddr);
536 static inline void * alloc_devinfo_mem(void)
538 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
541 static inline void free_devinfo_mem(void *vaddr)
543 kmem_cache_free(iommu_devinfo_cache, vaddr);
546 static inline int domain_type_is_vm(struct dmar_domain *domain)
548 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
551 static inline int domain_type_is_si(struct dmar_domain *domain)
553 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
556 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
558 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
559 DOMAIN_FLAG_STATIC_IDENTITY);
562 static inline int domain_pfn_supported(struct dmar_domain *domain,
565 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
567 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
570 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
575 sagaw = cap_sagaw(iommu->cap);
576 for (agaw = width_to_agaw(max_gaw);
578 if (test_bit(agaw, &sagaw))
586 * Calculate max SAGAW for each iommu.
588 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
590 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
594 * Calculate the agaw for each iommu.
595 * "SAGAW" may differ across iommus, so use a default agaw and fall
596 * back to a smaller supported agaw for iommus that don't support the default.
598 int iommu_calculate_agaw(struct intel_iommu *iommu)
600 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
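/*
 * Editor's worked example (illustrative): with LEVEL_STRIDE == 9,
 * width_to_agaw(57) == DIV_ROUND_UP(57 - 30, 9) == 3, a 5-level table
 * (in the full source agaw_to_level() returns agaw + 2), while a 48-bit
 * width gives agaw 2 (4-level). __iommu_calculate_agaw() then steps the
 * candidate agaw down until cap_sagaw() reports hardware support.
 */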
603 /* This function only returns a single iommu for a domain */
604 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
608 /* si_domain and vm domain should not get here. */
609 BUG_ON(domain_type_is_vm_or_si(domain));
610 for_each_domain_iommu(iommu_id, domain)
613 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
616 return g_iommus[iommu_id];
619 static void domain_update_iommu_coherency(struct dmar_domain *domain)
621 struct dmar_drhd_unit *drhd;
622 struct intel_iommu *iommu;
626 domain->iommu_coherency = 1;
628 for_each_domain_iommu(i, domain) {
630 if (!ecap_coherent(g_iommus[i]->ecap)) {
631 domain->iommu_coherency = 0;
638 /* No hardware attached; use lowest common denominator */
640 for_each_active_iommu(iommu, drhd) {
641 if (!ecap_coherent(iommu->ecap)) {
642 domain->iommu_coherency = 0;
649 static int domain_update_iommu_snooping(struct intel_iommu *skip)
651 struct dmar_drhd_unit *drhd;
652 struct intel_iommu *iommu;
656 for_each_active_iommu(iommu, drhd) {
658 if (!ecap_sc_support(iommu->ecap)) {
669 static int domain_update_iommu_superpage(struct intel_iommu *skip)
671 struct dmar_drhd_unit *drhd;
672 struct intel_iommu *iommu;
675 if (!intel_iommu_superpage) {
679 /* set iommu_superpage to the smallest common denominator */
681 for_each_active_iommu(iommu, drhd) {
683 mask &= cap_super_page_val(iommu->cap);
693 /* Some capabilities may be different across iommus */
694 static void domain_update_iommu_cap(struct dmar_domain *domain)
696 domain_update_iommu_coherency(domain);
697 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
698 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
701 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
704 struct root_entry *root = &iommu->root_entry[bus];
705 struct context_entry *context;
709 if (sm_supported(iommu)) {
717 context = phys_to_virt(*entry & VTD_PAGE_MASK);
719 unsigned long phy_addr;
723 context = alloc_pgtable_page(iommu->node);
727 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
728 phy_addr = virt_to_phys((void *)context);
729 *entry = phy_addr | 1;
730 __iommu_flush_cache(iommu, entry, sizeof(*entry));
732 return &context[devfn];
735 static int iommu_dummy(struct device *dev)
737 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
740 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
742 struct dmar_drhd_unit *drhd = NULL;
743 struct intel_iommu *iommu;
745 struct pci_dev *ptmp, *pdev = NULL;
749 if (iommu_dummy(dev))
752 if (dev_is_pci(dev)) {
753 struct pci_dev *pf_pdev;
755 pdev = to_pci_dev(dev);
758 /* VMD child devices currently cannot be handled individually */
759 if (is_vmd(pdev->bus))
763 /* VFs aren't listed in scope tables; we need to look up
764 * the PF instead to find the IOMMU. */
765 pf_pdev = pci_physfn(pdev);
767 segment = pci_domain_nr(pdev->bus);
768 } else if (has_acpi_companion(dev))
769 dev = &ACPI_COMPANION(dev)->dev;
772 for_each_active_iommu(iommu, drhd) {
773 if (pdev && segment != drhd->segment)
776 for_each_active_dev_scope(drhd->devices,
777 drhd->devices_cnt, i, tmp) {
779 /* For a VF use its original BDF# not that of the PF
780 * which we used for the IOMMU lookup. Strictly speaking
781 * we could do this for all PCI devices; we only need to
782 * get the BDF# from the scope table for ACPI matches. */
783 if (pdev && pdev->is_virtfn)
786 *bus = drhd->devices[i].bus;
787 *devfn = drhd->devices[i].devfn;
791 if (!pdev || !dev_is_pci(tmp))
794 ptmp = to_pci_dev(tmp);
795 if (ptmp->subordinate &&
796 ptmp->subordinate->number <= pdev->bus->number &&
797 ptmp->subordinate->busn_res.end >= pdev->bus->number)
801 if (pdev && drhd->include_all) {
803 *bus = pdev->bus->number;
804 *devfn = pdev->devfn;
815 static void domain_flush_cache(struct dmar_domain *domain,
816 void *addr, int size)
818 if (!domain->iommu_coherency)
819 clflush_cache_range(addr, size);
822 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
824 struct context_entry *context;
828 spin_lock_irqsave(&iommu->lock, flags);
829 context = iommu_context_addr(iommu, bus, devfn, 0);
831 ret = context_present(context);
832 spin_unlock_irqrestore(&iommu->lock, flags);
836 static void free_context_table(struct intel_iommu *iommu)
840 struct context_entry *context;
842 spin_lock_irqsave(&iommu->lock, flags);
843 if (!iommu->root_entry) {
846 for (i = 0; i < ROOT_ENTRY_NR; i++) {
847 context = iommu_context_addr(iommu, i, 0, 0);
849 free_pgtable_page(context);
851 if (!sm_supported(iommu))
854 context = iommu_context_addr(iommu, i, 0x80, 0);
856 free_pgtable_page(context);
859 free_pgtable_page(iommu->root_entry);
860 iommu->root_entry = NULL;
862 spin_unlock_irqrestore(&iommu->lock, flags);
865 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
866 unsigned long pfn, int *target_level)
868 struct dma_pte *parent, *pte;
869 int level = agaw_to_level(domain->agaw);
872 BUG_ON(!domain->pgd);
874 if (!domain_pfn_supported(domain, pfn))
875 /* Address beyond IOMMU's addressing capabilities. */
878 parent = domain->pgd;
883 offset = pfn_level_offset(pfn, level);
884 pte = &parent[offset];
885 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
887 if (level == *target_level)
890 if (!dma_pte_present(pte)) {
893 tmp_page = alloc_pgtable_page(domain->nid);
898 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
899 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
900 if (cmpxchg64(&pte->val, 0ULL, pteval))
901 /* Someone else set it while we were thinking; use theirs. */
902 free_pgtable_page(tmp_page);
904 domain_flush_cache(domain, pte, sizeof(*pte));
909 parent = phys_to_virt(dma_pte_addr(pte));
914 *target_level = level;
920 /* return the address's pte at a specific level */
921 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
923 int level, int *large_page)
925 struct dma_pte *parent, *pte;
926 int total = agaw_to_level(domain->agaw);
929 parent = domain->pgd;
930 while (level <= total) {
931 offset = pfn_level_offset(pfn, total);
932 pte = &parent[offset];
936 if (!dma_pte_present(pte)) {
941 if (dma_pte_superpage(pte)) {
946 parent = phys_to_virt(dma_pte_addr(pte));
952 /* clear last-level ptes; a TLB flush should follow */
953 static void dma_pte_clear_range(struct dmar_domain *domain,
954 unsigned long start_pfn,
955 unsigned long last_pfn)
957 unsigned int large_page;
958 struct dma_pte *first_pte, *pte;
960 BUG_ON(!domain_pfn_supported(domain, start_pfn));
961 BUG_ON(!domain_pfn_supported(domain, last_pfn));
962 BUG_ON(start_pfn > last_pfn);
964 /* we don't need lock here; nobody else touches the iova range */
967 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
969 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
974 start_pfn += lvl_to_nr_pages(large_page);
976 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
978 domain_flush_cache(domain, first_pte,
979 (void *)pte - (void *)first_pte);
981 } while (start_pfn && start_pfn <= last_pfn);
984 static void dma_pte_free_level(struct dmar_domain *domain, int level,
985 int retain_level, struct dma_pte *pte,
986 unsigned long pfn, unsigned long start_pfn,
987 unsigned long last_pfn)
989 pfn = max(start_pfn, pfn);
990 pte = &pte[pfn_level_offset(pfn, level)];
993 unsigned long level_pfn;
994 struct dma_pte *level_pte;
996 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
999 level_pfn = pfn & level_mask(level);
1000 level_pte = phys_to_virt(dma_pte_addr(pte));
1003 dma_pte_free_level(domain, level - 1, retain_level,
1004 level_pte, level_pfn, start_pfn,
1009 * Free the page table if we're below the level we want to
1010 * retain and the range covers the entire table.
1012 if (level < retain_level && !(start_pfn > level_pfn ||
1013 last_pfn < level_pfn + level_size(level) - 1)) {
1015 domain_flush_cache(domain, pte, sizeof(*pte));
1016 free_pgtable_page(level_pte);
1019 pfn += level_size(level);
1020 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1024 * clear last level (leaf) ptes and free page table pages below the
1025 * level we wish to keep intact.
1027 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1028 unsigned long start_pfn,
1029 unsigned long last_pfn,
1032 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1033 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1034 BUG_ON(start_pfn > last_pfn);
1036 dma_pte_clear_range(domain, start_pfn, last_pfn);
1038 /* We don't need lock here; nobody else touches the iova range */
1039 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1040 domain->pgd, 0, start_pfn, last_pfn);
1043 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1044 free_pgtable_page(domain->pgd);
1049 /* When a page at a given level is being unlinked from its parent, we don't
1050 need to *modify* it at all. All we need to do is make a list of all the
1051 pages which can be freed just as soon as we've flushed the IOTLB and we
1052 know the hardware page-walk will no longer touch them.
1053 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1054 be freed. */
1055 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1056 int level, struct dma_pte *pte,
1057 struct page *freelist)
1061 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1062 pg->freelist = freelist;
1068 pte = page_address(pg);
1070 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1071 freelist = dma_pte_list_pagetables(domain, level - 1,
1074 } while (!first_pte_in_page(pte));
1079 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1080 struct dma_pte *pte, unsigned long pfn,
1081 unsigned long start_pfn,
1082 unsigned long last_pfn,
1083 struct page *freelist)
1085 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1087 pfn = max(start_pfn, pfn);
1088 pte = &pte[pfn_level_offset(pfn, level)];
1091 unsigned long level_pfn;
1093 if (!dma_pte_present(pte))
1096 level_pfn = pfn & level_mask(level);
1098 /* If range covers entire pagetable, free it */
1099 if (start_pfn <= level_pfn &&
1100 last_pfn >= level_pfn + level_size(level) - 1) {
1101 /* These subordinate page tables are going away entirely. Don't
1102 bother to clear them; we're just going to *free* them. */
1103 if (level > 1 && !dma_pte_superpage(pte))
1104 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1110 } else if (level > 1) {
1111 /* Recurse down into a level that isn't *entirely* obsolete */
1112 freelist = dma_pte_clear_level(domain, level - 1,
1113 phys_to_virt(dma_pte_addr(pte)),
1114 level_pfn, start_pfn, last_pfn,
1118 pfn += level_size(level);
1119 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1122 domain_flush_cache(domain, first_pte,
1123 (void *)++last_pte - (void *)first_pte);
1128 /* We can't just free the pages because the IOMMU may still be walking
1129 the page tables, and may have cached the intermediate levels. The
1130 pages can only be freed after the IOTLB flush has been done. */
1131 static struct page *domain_unmap(struct dmar_domain *domain,
1132 unsigned long start_pfn,
1133 unsigned long last_pfn)
1135 struct page *freelist;
1137 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1138 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1139 BUG_ON(start_pfn > last_pfn);
1141 /* we don't need lock here; nobody else touches the iova range */
1142 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1143 domain->pgd, 0, start_pfn, last_pfn, NULL);
1146 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1147 struct page *pgd_page = virt_to_page(domain->pgd);
1148 pgd_page->freelist = freelist;
1149 freelist = pgd_page;
1157 static void dma_free_pagelist(struct page *freelist)
1161 while ((pg = freelist)) {
1162 freelist = pg->freelist;
1163 free_pgtable_page(page_address(pg));
1167 static void iova_entry_free(unsigned long data)
1169 struct page *freelist = (struct page *)data;
1171 dma_free_pagelist(freelist);
1174 /* iommu handling */
1175 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1177 struct root_entry *root;
1178 unsigned long flags;
1180 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1182 pr_err("Allocating root entry for %s failed\n",
1187 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1189 spin_lock_irqsave(&iommu->lock, flags);
1190 iommu->root_entry = root;
1191 spin_unlock_irqrestore(&iommu->lock, flags);
1196 static void iommu_set_root_entry(struct intel_iommu *iommu)
1202 addr = virt_to_phys(iommu->root_entry);
1203 if (sm_supported(iommu))
1204 addr |= DMA_RTADDR_SMT;
1206 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1207 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1209 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1211 /* Make sure the hardware completes it */
1212 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1213 readl, (sts & DMA_GSTS_RTPS), sts);
1215 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1218 void iommu_flush_write_buffer(struct intel_iommu *iommu)
1223 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1226 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1227 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1229 /* Make sure the hardware completes it */
1230 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1231 readl, (!(val & DMA_GSTS_WBFS)), val);
1233 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1236 /* return value determines if we need a write buffer flush */
1237 static void __iommu_flush_context(struct intel_iommu *iommu,
1238 u16 did, u16 source_id, u8 function_mask,
1245 case DMA_CCMD_GLOBAL_INVL:
1246 val = DMA_CCMD_GLOBAL_INVL;
1248 case DMA_CCMD_DOMAIN_INVL:
1249 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1251 case DMA_CCMD_DEVICE_INVL:
1252 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1253 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1258 val |= DMA_CCMD_ICC;
1260 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1261 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1263 /* Make sure the hardware completes it */
1264 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1265 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1267 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270 /* return value determines if we need a write buffer flush */
1271 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1272 u64 addr, unsigned int size_order, u64 type)
1274 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1275 u64 val = 0, val_iva = 0;
1279 case DMA_TLB_GLOBAL_FLUSH:
1280 /* a global flush doesn't need to set IVA_REG */
1281 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1283 case DMA_TLB_DSI_FLUSH:
1284 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1286 case DMA_TLB_PSI_FLUSH:
1287 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1288 /* IH bit is passed in as part of address */
1289 val_iva = size_order | addr;
1294 /* Note: set drain read/write */
1297 * This is probably meant to be extra-safe; it looks like we can
1298 * ignore it without any impact.
1300 if (cap_read_drain(iommu->cap))
1301 val |= DMA_TLB_READ_DRAIN;
1303 if (cap_write_drain(iommu->cap))
1304 val |= DMA_TLB_WRITE_DRAIN;
1306 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1307 /* Note: Only uses first TLB reg currently */
1309 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1310 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1312 /* Make sure the hardware completes it */
1313 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1314 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1316 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1318 /* check IOTLB invalidation granularity */
1319 if (DMA_TLB_IAIG(val) == 0)
1320 pr_err("Flush IOTLB failed\n");
1321 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1322 pr_debug("TLB flush request %Lx, actual %Lx\n",
1323 (unsigned long long)DMA_TLB_IIRG(type),
1324 (unsigned long long)DMA_TLB_IAIG(val));
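/*
 * Editor's note on the granularity check above: DMA_TLB_IIRG() decodes
 * the granularity we requested (from 'type') and DMA_TLB_IAIG() what
 * the hardware actually performed. Hardware may legally widen a flush,
 * e.g. perform a domain-selective flush where a page-selective one was
 * requested, so a mismatch is only a pr_debug(), while IAIG == 0 means
 * the invalidation itself failed and is reported with pr_err().
 */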
1327 static struct device_domain_info *
1328 iommu_support_dev_iotlb(struct dmar_domain *domain, struct intel_iommu *iommu,
1331 struct device_domain_info *info;
1333 assert_spin_locked(&device_domain_lock);
1338 list_for_each_entry(info, &domain->devices, link)
1339 if (info->iommu == iommu && info->bus == bus &&
1340 info->devfn == devfn) {
1341 if (info->ats_supported && info->dev)
1349 static void domain_update_iotlb(struct dmar_domain *domain)
1351 struct device_domain_info *info;
1352 bool has_iotlb_device = false;
1354 assert_spin_locked(&device_domain_lock);
1356 list_for_each_entry(info, &domain->devices, link) {
1357 struct pci_dev *pdev;
1359 if (!info->dev || !dev_is_pci(info->dev))
1362 pdev = to_pci_dev(info->dev);
1363 if (pdev->ats_enabled) {
1364 has_iotlb_device = true;
1369 domain->has_iotlb_device = has_iotlb_device;
1372 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1374 struct pci_dev *pdev;
1376 assert_spin_locked(&device_domain_lock);
1378 if (!info || !dev_is_pci(info->dev))
1381 pdev = to_pci_dev(info->dev);
1382 /* For an IOMMU that supports device IOTLB throttling (DIT), we assign
1383 * a PFSID to the invalidation descriptor of a VF so that the IOMMU HW
1384 * can gauge queue depth at the PF level. If DIT is not set, PFSID is
1385 * treated as reserved and should be set to 0.
1387 if (!ecap_dit(info->iommu->ecap))
1390 struct pci_dev *pf_pdev;
1392 /* pci_physfn() returns pdev itself if the device is not a VF */
1393 pf_pdev = pci_physfn(pdev);
1394 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1397 #ifdef CONFIG_INTEL_IOMMU_SVM
1398 /* The PCIe spec, in its wisdom, declares that the behaviour of
1399 the device if you enable PASID support after ATS support is
1400 undefined. So always enable PASID support on devices which
1401 have it, even if we can't yet know if we're ever going to
1402 use it. */
1403 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1404 info->pasid_enabled = 1;
1406 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1407 info->pri_enabled = 1;
1409 if (!pdev->untrusted && info->ats_supported &&
1410 !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1411 info->ats_enabled = 1;
1412 domain_update_iotlb(info->domain);
1413 info->ats_qdep = pci_ats_queue_depth(pdev);
1417 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1419 struct pci_dev *pdev;
1421 assert_spin_locked(&device_domain_lock);
1423 if (!dev_is_pci(info->dev))
1426 pdev = to_pci_dev(info->dev);
1428 if (info->ats_enabled) {
1429 pci_disable_ats(pdev);
1430 info->ats_enabled = 0;
1431 domain_update_iotlb(info->domain);
1433 #ifdef CONFIG_INTEL_IOMMU_SVM
1434 if (info->pri_enabled) {
1435 pci_disable_pri(pdev);
1436 info->pri_enabled = 0;
1438 if (info->pasid_enabled) {
1439 pci_disable_pasid(pdev);
1440 info->pasid_enabled = 0;
1445 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1446 u64 addr, unsigned mask)
1449 unsigned long flags;
1450 struct device_domain_info *info;
1452 if (!domain->has_iotlb_device)
1455 spin_lock_irqsave(&device_domain_lock, flags);
1456 list_for_each_entry(info, &domain->devices, link) {
1457 if (!info->ats_enabled)
1460 sid = info->bus << 8 | info->devfn;
1461 qdep = info->ats_qdep;
1462 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1465 spin_unlock_irqrestore(&device_domain_lock, flags);
1468 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1469 struct dmar_domain *domain,
1470 unsigned long pfn, unsigned int pages,
1473 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1474 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1475 u16 did = domain->iommu_did[iommu->seq_id];
1482 * Fall back to a domain-selective flush if there is no PSI support or
1483 * the size is too big.
1484 * PSI requires the page size to be 2^x and the base address to be
1485 * naturally aligned to the size.
1487 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1488 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1491 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1495 * In caching mode, changes of pages from non-present to present require
1496 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1498 if (!cap_caching_mode(iommu->cap) || !map)
1499 iommu_flush_dev_iotlb(domain, addr, mask);
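/*
 * Editor's worked example for the PSI path above (illustrative): a
 * request for pages == 9 yields mask == ilog2(__roundup_pow_of_two(9))
 * == ilog2(16) == 4, so the flush covers 16 pages and 'addr' must be
 * naturally aligned to that size; a mask larger than
 * cap_max_amask_val() falls back to a domain-selective flush instead.
 */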
1502 /* Notification for newly created mappings */
1503 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1504 struct dmar_domain *domain,
1505 unsigned long pfn, unsigned int pages)
1507 /* It's a non-present to present mapping. Only flush if caching mode */
1508 if (cap_caching_mode(iommu->cap))
1509 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1511 iommu_flush_write_buffer(iommu);
1514 static void iommu_flush_iova(struct iova_domain *iovad)
1516 struct dmar_domain *domain;
1519 domain = container_of(iovad, struct dmar_domain, iovad);
1521 for_each_domain_iommu(idx, domain) {
1522 struct intel_iommu *iommu = g_iommus[idx];
1523 u16 did = domain->iommu_did[iommu->seq_id];
1525 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1527 if (!cap_caching_mode(iommu->cap))
1528 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1529 0, MAX_AGAW_PFN_WIDTH);
1533 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1536 unsigned long flags;
1538 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1539 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1540 pmen &= ~DMA_PMEN_EPM;
1541 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1543 /* wait for the protected region status bit to clear */
1544 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1545 readl, !(pmen & DMA_PMEN_PRS), pmen);
1547 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1550 static void iommu_enable_translation(struct intel_iommu *iommu)
1553 unsigned long flags;
1555 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1556 iommu->gcmd |= DMA_GCMD_TE;
1557 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1559 /* Make sure the hardware completes it */
1560 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1561 readl, (sts & DMA_GSTS_TES), sts);
1563 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1566 static void iommu_disable_translation(struct intel_iommu *iommu)
1571 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1572 iommu->gcmd &= ~DMA_GCMD_TE;
1573 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1575 /* Make sure the hardware completes it */
1576 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1577 readl, (!(sts & DMA_GSTS_TES)), sts);
1579 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1583 static int iommu_init_domains(struct intel_iommu *iommu)
1585 u32 ndomains, nlongs;
1588 ndomains = cap_ndoms(iommu->cap);
1589 pr_debug("%s: Number of Domains supported <%d>\n",
1590 iommu->name, ndomains);
1591 nlongs = BITS_TO_LONGS(ndomains);
1593 spin_lock_init(&iommu->lock);
1595 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1596 if (!iommu->domain_ids) {
1597 pr_err("%s: Allocating domain id array failed\n",
1602 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1603 iommu->domains = kzalloc(size, GFP_KERNEL);
1605 if (iommu->domains) {
1606 size = 256 * sizeof(struct dmar_domain *);
1607 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1610 if (!iommu->domains || !iommu->domains[0]) {
1611 pr_err("%s: Allocating domain array failed\n",
1613 kfree(iommu->domain_ids);
1614 kfree(iommu->domains);
1615 iommu->domain_ids = NULL;
1616 iommu->domains = NULL;
1623 * If Caching mode is set, then invalid translations are tagged
1624 * with domain-id 0, hence we need to pre-allocate it. We also
1625 * use domain-id 0 as a marker for non-allocated domain-id, so
1626 * make sure it is not used for a real domain.
1628 set_bit(0, iommu->domain_ids);
1631 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1632 * entry for first-level or pass-through translation modes should
1633 * be programmed with a domain id different from those used for
1634 * second-level or nested translation. We reserve a domain id for
1635 * this purpose.
1637 if (sm_supported(iommu))
1638 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1643 static void disable_dmar_iommu(struct intel_iommu *iommu)
1645 struct device_domain_info *info, *tmp;
1646 unsigned long flags;
1648 if (!iommu->domains || !iommu->domain_ids)
1652 spin_lock_irqsave(&device_domain_lock, flags);
1653 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1654 struct dmar_domain *domain;
1656 if (info->iommu != iommu)
1659 if (!info->dev || !info->domain)
1662 domain = info->domain;
1664 __dmar_remove_one_dev_info(info);
1666 if (!domain_type_is_vm_or_si(domain)) {
1668 * The domain_exit() function can't be called under
1669 * device_domain_lock, as it takes this lock itself.
1670 * So release the lock here and re-run the loop
1673 spin_unlock_irqrestore(&device_domain_lock, flags);
1674 domain_exit(domain);
1678 spin_unlock_irqrestore(&device_domain_lock, flags);
1680 if (iommu->gcmd & DMA_GCMD_TE)
1681 iommu_disable_translation(iommu);
1684 static void free_dmar_iommu(struct intel_iommu *iommu)
1686 if ((iommu->domains) && (iommu->domain_ids)) {
1687 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1690 for (i = 0; i < elems; i++)
1691 kfree(iommu->domains[i]);
1692 kfree(iommu->domains);
1693 kfree(iommu->domain_ids);
1694 iommu->domains = NULL;
1695 iommu->domain_ids = NULL;
1698 g_iommus[iommu->seq_id] = NULL;
1700 /* free context mapping */
1701 free_context_table(iommu);
1703 #ifdef CONFIG_INTEL_IOMMU_SVM
1704 if (pasid_supported(iommu)) {
1705 if (ecap_prs(iommu->ecap))
1706 intel_svm_finish_prq(iommu);
1711 static struct dmar_domain *alloc_domain(int flags)
1713 struct dmar_domain *domain;
1715 domain = alloc_domain_mem();
1719 memset(domain, 0, sizeof(*domain));
1721 domain->flags = flags;
1722 domain->has_iotlb_device = false;
1723 INIT_LIST_HEAD(&domain->devices);
1728 /* Must be called with iommu->lock */
1729 static int domain_attach_iommu(struct dmar_domain *domain,
1730 struct intel_iommu *iommu)
1732 unsigned long ndomains;
1735 assert_spin_locked(&device_domain_lock);
1736 assert_spin_locked(&iommu->lock);
1738 domain->iommu_refcnt[iommu->seq_id] += 1;
1739 domain->iommu_count += 1;
1740 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1741 ndomains = cap_ndoms(iommu->cap);
1742 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1744 if (num >= ndomains) {
1745 pr_err("%s: No free domain ids\n", iommu->name);
1746 domain->iommu_refcnt[iommu->seq_id] -= 1;
1747 domain->iommu_count -= 1;
1751 set_bit(num, iommu->domain_ids);
1752 set_iommu_domain(iommu, num, domain);
1754 domain->iommu_did[iommu->seq_id] = num;
1755 domain->nid = iommu->node;
1757 domain_update_iommu_cap(domain);
1763 static int domain_detach_iommu(struct dmar_domain *domain,
1764 struct intel_iommu *iommu)
1768 assert_spin_locked(&device_domain_lock);
1769 assert_spin_locked(&iommu->lock);
1771 domain->iommu_refcnt[iommu->seq_id] -= 1;
1772 count = --domain->iommu_count;
1773 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1774 num = domain->iommu_did[iommu->seq_id];
1775 clear_bit(num, iommu->domain_ids);
1776 set_iommu_domain(iommu, num, NULL);
1778 domain_update_iommu_cap(domain);
1779 domain->iommu_did[iommu->seq_id] = 0;
1785 static struct iova_domain reserved_iova_list;
1786 static struct lock_class_key reserved_rbtree_key;
1788 static int dmar_init_reserved_ranges(void)
1790 struct pci_dev *pdev = NULL;
1794 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1796 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1797 &reserved_rbtree_key);
1799 /* IOAPIC ranges shouldn't be accessed by DMA */
1800 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1801 IOVA_PFN(IOAPIC_RANGE_END));
1803 pr_err("Reserve IOAPIC range failed\n");
1807 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1808 for_each_pci_dev(pdev) {
1811 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1812 r = &pdev->resource[i];
1813 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1815 iova = reserve_iova(&reserved_iova_list,
1819 pci_err(pdev, "Reserve iova for %pR failed\n", r);
1827 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1829 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1832 static inline int guestwidth_to_adjustwidth(int gaw)
1835 int r = (gaw - 12) % 9;
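/*
 * Editor's note: the (elided) remainder of this helper rounds the guest
 * address width up to a width the page-table walk can express, i.e.
 * 12 bits of page offset plus a multiple of the 9-bit stride. For
 * example: gaw == 48 gives r == 0 and the width stays 48, while
 * gaw == 50 gives r == 2 and rounds up to 50 + 9 - 2 == 57.
 */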
1846 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1849 int adjust_width, agaw;
1850 unsigned long sagaw;
1853 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1855 err = init_iova_flush_queue(&domain->iovad,
1856 iommu_flush_iova, iova_entry_free);
1860 domain_reserve_special_ranges(domain);
1862 /* calculate AGAW */
1863 if (guest_width > cap_mgaw(iommu->cap))
1864 guest_width = cap_mgaw(iommu->cap);
1865 domain->gaw = guest_width;
1866 adjust_width = guestwidth_to_adjustwidth(guest_width);
1867 agaw = width_to_agaw(adjust_width);
1868 sagaw = cap_sagaw(iommu->cap);
1869 if (!test_bit(agaw, &sagaw)) {
1870 /* hardware doesn't support it, choose a bigger one */
1871 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1872 agaw = find_next_bit(&sagaw, 5, agaw);
1876 domain->agaw = agaw;
1878 if (ecap_coherent(iommu->ecap))
1879 domain->iommu_coherency = 1;
1881 domain->iommu_coherency = 0;
1883 if (ecap_sc_support(iommu->ecap))
1884 domain->iommu_snooping = 1;
1886 domain->iommu_snooping = 0;
1888 if (intel_iommu_superpage)
1889 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1891 domain->iommu_superpage = 0;
1893 domain->nid = iommu->node;
1895 /* always allocate the top pgd */
1896 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1899 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1903 static void domain_exit(struct dmar_domain *domain)
1905 struct page *freelist;
1907 /* Domain 0 is reserved, so don't process it */
1911 /* Remove associated devices and clear attached or cached domains */
1913 domain_remove_dev_info(domain);
1917 put_iova_domain(&domain->iovad);
1919 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1921 dma_free_pagelist(freelist);
1923 free_domain_mem(domain);
1927 * Get the PASID directory size for a scalable mode context entry.
1928 * A value of X in the PDTS field of a scalable mode context entry
1929 * indicates a PASID directory with 2^(X + 7) entries.
1931 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1935 max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1936 pds = find_first_bit((unsigned long *)&max_pde, MAX_NR_PASID_BITS);
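/*
 * Editor's worked example (illustrative): a PDTS field value of X
 * encodes a PASID directory of 2^(X + 7) entries, so X == 0 covers 128
 * directory entries and X == 7 covers 16384. The context_pdts() macro
 * below packs this value into bits 11:9 of the context entry's low
 * qword.
 */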
1944 * Set the RID_PASID field of a scalable mode context entry. The
1945 * IOMMU hardware will use the PASID value set in this field for
1946 * DMA translations of DMA requests without PASID.
1949 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1951 context->hi |= pasid & ((1 << 20) - 1);
1952 context->hi |= (1 << 20);
1956 * Set the DTE(Device-TLB Enable) field of a scalable mode context
1959 static inline void context_set_sm_dte(struct context_entry *context)
1961 context->lo |= (1 << 2);
1965 * Set the PRE(Page Request Enable) field of a scalable mode context
1968 static inline void context_set_sm_pre(struct context_entry *context)
1970 context->lo |= (1 << 4);
1973 /* Convert value to context PASID directory size field coding. */
1974 #define context_pdts(pds) (((pds) & 0x7) << 9)
1976 static int domain_context_mapping_one(struct dmar_domain *domain,
1977 struct intel_iommu *iommu,
1978 struct pasid_table *table,
1981 u16 did = domain->iommu_did[iommu->seq_id];
1982 int translation = CONTEXT_TT_MULTI_LEVEL;
1983 struct device_domain_info *info = NULL;
1984 struct context_entry *context;
1985 unsigned long flags;
1990 if (hw_pass_through && domain_type_is_si(domain))
1991 translation = CONTEXT_TT_PASS_THROUGH;
1993 pr_debug("Set context mapping for %02x:%02x.%d\n",
1994 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1996 BUG_ON(!domain->pgd);
1998 spin_lock_irqsave(&device_domain_lock, flags);
1999 spin_lock(&iommu->lock);
2002 context = iommu_context_addr(iommu, bus, devfn, 1);
2007 if (context_present(context))
2011 * For kdump cases, old valid entries may be cached due to the
2012 * in-flight DMA and copied pgtable, but there is no unmapping
2013 * behaviour for them, thus we need an explicit cache flush for
2014 * the newly-mapped device. For kdump, at this point, the device
2015 * is supposed to finish reset at its driver probe stage, so no
2016 * in-flight DMA will exist, and we don't need to worry anymore
2017 * hereafter.
2019 if (context_copied(context)) {
2020 u16 did_old = context_domain_id(context);
2022 if (did_old < cap_ndoms(iommu->cap)) {
2023 iommu->flush.flush_context(iommu, did_old,
2024 (((u16)bus) << 8) | devfn,
2025 DMA_CCMD_MASK_NOBIT,
2026 DMA_CCMD_DEVICE_INVL);
2027 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
2032 context_clear_entry(context);
2034 if (sm_supported(iommu)) {
2039 /* Setup the PASID DIR pointer: */
2040 pds = context_get_sm_pds(table);
2041 context->lo = (u64)virt_to_phys(table->table) |
2044 /* Setup the RID_PASID field: */
2045 context_set_sm_rid2pasid(context, PASID_RID2PASID);
2048 * Setup the Device-TLB enable bit and Page request
2049 * enable bit:
2051 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2052 if (info && info->ats_supported)
2053 context_set_sm_dte(context);
2054 if (info && info->pri_supported)
2055 context_set_sm_pre(context);
2057 struct dma_pte *pgd = domain->pgd;
2060 context_set_domain_id(context, did);
2061 context_set_translation_type(context, translation);
2063 if (translation != CONTEXT_TT_PASS_THROUGH) {
2065 * Skip top levels of the page tables for an iommu which has
2066 * a smaller agaw than the default. Unnecessary for PT mode.
2068 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2070 pgd = phys_to_virt(dma_pte_addr(pgd));
2071 if (!dma_pte_present(pgd))
2075 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2076 if (info && info->ats_supported)
2077 translation = CONTEXT_TT_DEV_IOTLB;
2079 translation = CONTEXT_TT_MULTI_LEVEL;
2081 context_set_address_root(context, virt_to_phys(pgd));
2082 context_set_address_width(context, agaw);
2085 * In pass through mode, AW must be programmed to
2086 * indicate the largest AGAW value supported by
2087 * hardware. And ASR is ignored by hardware.
2089 context_set_address_width(context, iommu->msagaw);
2093 context_set_fault_enable(context);
2094 context_set_present(context);
2095 domain_flush_cache(domain, context, sizeof(*context));
2098 * It's a non-present to present mapping. If the hardware doesn't cache
2099 * non-present entries we only need to flush the write-buffer. If it
2100 * _does_ cache non-present entries, then it does so in the special
2101 * domain #0, which we have to flush:
2103 if (cap_caching_mode(iommu->cap)) {
2104 iommu->flush.flush_context(iommu, 0,
2105 (((u16)bus) << 8) | devfn,
2106 DMA_CCMD_MASK_NOBIT,
2107 DMA_CCMD_DEVICE_INVL);
2108 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2110 iommu_flush_write_buffer(iommu);
2112 iommu_enable_dev_iotlb(info);
2117 spin_unlock(&iommu->lock);
2118 spin_unlock_irqrestore(&device_domain_lock, flags);
2123 struct domain_context_mapping_data {
2124 struct dmar_domain *domain;
2125 struct intel_iommu *iommu;
2126 struct pasid_table *table;
2129 static int domain_context_mapping_cb(struct pci_dev *pdev,
2130 u16 alias, void *opaque)
2132 struct domain_context_mapping_data *data = opaque;
2134 return domain_context_mapping_one(data->domain, data->iommu,
2135 data->table, PCI_BUS_NUM(alias),
2140 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2142 struct domain_context_mapping_data data;
2143 struct pasid_table *table;
2144 struct intel_iommu *iommu;
2147 iommu = device_to_iommu(dev, &bus, &devfn);
2151 table = intel_pasid_get_table(dev);
2153 if (!dev_is_pci(dev))
2154 return domain_context_mapping_one(domain, iommu, table,
2157 data.domain = domain;
2161 return pci_for_each_dma_alias(to_pci_dev(dev),
2162 &domain_context_mapping_cb, &data);
2165 static int domain_context_mapped_cb(struct pci_dev *pdev,
2166 u16 alias, void *opaque)
2168 struct intel_iommu *iommu = opaque;
2170 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2173 static int domain_context_mapped(struct device *dev)
2175 struct intel_iommu *iommu;
2178 iommu = device_to_iommu(dev, &bus, &devfn);
2182 if (!dev_is_pci(dev))
2183 return device_context_mapped(iommu, bus, devfn);
2185 return !pci_for_each_dma_alias(to_pci_dev(dev),
2186 domain_context_mapped_cb, iommu);
2189 /* Returns the number of VT-d pages, but aligned to the MM page size */
2190 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2193 host_addr &= ~PAGE_MASK;
2194 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
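/*
 * Editor's worked example (illustrative, assuming 4KiB pages): for
 * host_addr == 0x1800 and size == 0x1000, only the in-page offset 0x800
 * is kept, so PAGE_ALIGN(0x800 + 0x1000) == 0x2000 and the mapping
 * needs two VT-d pages even though 'size' itself is one page.
 */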
2197 /* Return largest possible superpage level for a given mapping */
2198 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2199 unsigned long iov_pfn,
2200 unsigned long phy_pfn,
2201 unsigned long pages)
2203 int support, level = 1;
2204 unsigned long pfnmerge;
2206 support = domain->iommu_superpage;
2208 /* To use a large page, the virtual *and* physical addresses
2209 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2210 of them will mean we have to use smaller pages. So just
2211 merge them and check both at once. */
2212 pfnmerge = iov_pfn | phy_pfn;
2214 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2215 pages >>= VTD_STRIDE_SHIFT;
2218 pfnmerge >>= VTD_STRIDE_SHIFT;
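/*
 * Editor's worked example (illustrative): the loop tests the low nine
 * bits of iov_pfn | phy_pfn via (pfnmerge & ~VTD_STRIDE_MASK). For
 * iov_pfn == 0x200 and phy_pfn == 0x400, pfnmerge == 0x600 has its low
 * nine bits clear, so both addresses are 2MiB-aligned; if 'pages'
 * covers a full stride (512 pages) and the hardware supports it, the
 * (elided) tail of the loop bumps 'level' to 2.
 */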
2225 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2226 struct scatterlist *sg, unsigned long phys_pfn,
2227 unsigned long nr_pages, int prot)
2229 struct dma_pte *first_pte = NULL, *pte = NULL;
2230 phys_addr_t uninitialized_var(pteval);
2231 unsigned long sg_res = 0;
2232 unsigned int largepage_lvl = 0;
2233 unsigned long lvl_pages = 0;
2235 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2237 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2240 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2244 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2247 while (nr_pages > 0) {
2251 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2253 sg_res = aligned_nrpages(sg->offset, sg->length);
2254 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2255 sg->dma_length = sg->length;
2256 pteval = (sg_phys(sg) - pgoff) | prot;
2257 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2261 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2263 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2266 /* It is a large page */
2267 if (largepage_lvl > 1) {
2268 unsigned long nr_superpages, end_pfn;
2270 pteval |= DMA_PTE_LARGE_PAGE;
2271 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2273 nr_superpages = sg_res / lvl_pages;
2274 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2277 * Ensure that old small page tables are
2278 * removed to make room for superpage(s).
2279 * We're adding new large pages, so make sure
2280 * we don't remove their parent tables.
2282 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2285 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2289 /* We don't need lock here, nobody else
2290 * touches the iova range
2292 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2294 static int dumps = 5;
2295 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2296 iov_pfn, tmp, (unsigned long long)pteval);
2299 debug_dma_dump_mappings(NULL);
2304 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2306 BUG_ON(nr_pages < lvl_pages);
2307 BUG_ON(sg_res < lvl_pages);
2309 nr_pages -= lvl_pages;
2310 iov_pfn += lvl_pages;
2311 phys_pfn += lvl_pages;
2312 pteval += lvl_pages * VTD_PAGE_SIZE;
2313 sg_res -= lvl_pages;
2315 /* If the next PTE would be the first in a new page, then we
2316 need to flush the cache on the entries we've just written.
2317 And then we'll need to recalculate 'pte', so clear it and
2318 let it get set again in the if (!pte) block above.
2320 If we're done (!nr_pages) we need to flush the cache too.
2322 Also if we've been setting superpages, we may need to
2323 recalculate 'pte' and switch back to smaller pages for the
2324 end of the mapping, if the trailing size is not enough to
2325 use another superpage (i.e. sg_res < lvl_pages). */
2327 if (!nr_pages || first_pte_in_page(pte) ||
2328 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2329 domain_flush_cache(domain, first_pte,
2330 (void *)pte - (void *)first_pte);
2334 if (!sg_res && nr_pages)
2340 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2341 struct scatterlist *sg, unsigned long phys_pfn,
2342 unsigned long nr_pages, int prot)
2345 struct intel_iommu *iommu;
2347 /* Do the real mapping first */
2348 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2352 /* Notify about the new mapping */
2353 if (domain_type_is_vm(domain)) {
2354 /* VM-typed domains can have more than one IOMMU */
2356 for_each_domain_iommu(iommu_id, domain) {
2357 iommu = g_iommus[iommu_id];
2358 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2361 /* General domains only have one IOMMU */
2362 iommu = domain_get_iommu(domain);
2363 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2369 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2370 struct scatterlist *sg, unsigned long nr_pages,
2373 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2376 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2377 unsigned long phys_pfn, unsigned long nr_pages,
2380 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
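/*
 * Usage sketch (illustrative only): both wrappers above feed the same
 * __domain_mapping() path. A physically contiguous range goes through
 * domain_pfn_mapping() with sg == NULL, while scatter-gather DMA goes
 * through domain_sg_mapping() with phys_pfn == 0, e.g.:
 *
 *	domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
 *			   mm_to_dma_pfn(paddr_pfn), nr_pages,
 *			   DMA_PTE_READ | DMA_PTE_WRITE);
 *
 *	domain_sg_mapping(domain, start_vpfn, sglist, nr_pages, prot);
 */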
2383 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2385 unsigned long flags;
2386 struct context_entry *context;
2392 spin_lock_irqsave(&iommu->lock, flags);
2393 context = iommu_context_addr(iommu, bus, devfn, 0);
2395 spin_unlock_irqrestore(&iommu->lock, flags);
2398 did_old = context_domain_id(context);
2399 context_clear_entry(context);
2400 __iommu_flush_cache(iommu, context, sizeof(*context));
2401 spin_unlock_irqrestore(&iommu->lock, flags);
2402 iommu->flush.flush_context(iommu,
2404 (((u16)bus) << 8) | devfn,
2405 DMA_CCMD_MASK_NOBIT,
2406 DMA_CCMD_DEVICE_INVL);
2407 iommu->flush.flush_iotlb(iommu,
2414 static inline void unlink_domain_info(struct device_domain_info *info)
2416 assert_spin_locked(&device_domain_lock);
2417 list_del(&info->link);
2418 list_del(&info->global);
2420 info->dev->archdata.iommu = NULL;
2423 static void domain_remove_dev_info(struct dmar_domain *domain)
2425 struct device_domain_info *info, *tmp;
2426 unsigned long flags;
2428 spin_lock_irqsave(&device_domain_lock, flags);
2429 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2430 __dmar_remove_one_dev_info(info);
2431 spin_unlock_irqrestore(&device_domain_lock, flags);
2436 * Note: we use struct device->archdata.iommu to store the info
2438 static struct dmar_domain *find_domain(struct device *dev)
2440 struct device_domain_info *info;
2442 /* No lock here; we assume no domain exits in the normal case */
2443 info = dev->archdata.iommu;
2445 return info->domain;
2449 static inline struct device_domain_info *
2450 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2452 struct device_domain_info *info;
2454 list_for_each_entry(info, &device_domain_list, global)
2455 if (info->iommu->segment == segment && info->bus == bus &&
2456 info->devfn == devfn)
2462 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2465 struct dmar_domain *domain)
2467 struct dmar_domain *found = NULL;
2468 struct device_domain_info *info;
2469 unsigned long flags;
2472 info = alloc_devinfo_mem();
2477 info->devfn = devfn;
2478 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2479 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2482 info->domain = domain;
2483 info->iommu = iommu;
2484 info->pasid_table = NULL;
2486 if (dev && dev_is_pci(dev)) {
2487 struct pci_dev *pdev = to_pci_dev(info->dev);
2489 if (!pci_ats_disabled() &&
2490 ecap_dev_iotlb_support(iommu->ecap) &&
2491 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2492 dmar_find_matched_atsr_unit(pdev))
2493 info->ats_supported = 1;
2495 if (sm_supported(iommu)) {
2496 if (pasid_supported(iommu)) {
2497 int features = pci_pasid_features(pdev);
2499 info->pasid_supported = features | 1;
2502 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2503 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2504 info->pri_supported = 1;
2508 spin_lock_irqsave(&device_domain_lock, flags);
2510 found = find_domain(dev);
2513 struct device_domain_info *info2;
2514 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2516 found = info2->domain;
2522 spin_unlock_irqrestore(&device_domain_lock, flags);
2523 free_devinfo_mem(info);
2524 /* Caller must free the original domain */
2528 spin_lock(&iommu->lock);
2529 ret = domain_attach_iommu(domain, iommu);
2530 spin_unlock(&iommu->lock);
2533 spin_unlock_irqrestore(&device_domain_lock, flags);
2534 free_devinfo_mem(info);
2538 list_add(&info->link, &domain->devices);
2539 list_add(&info->global, &device_domain_list);
2541 dev->archdata.iommu = info;
2542 spin_unlock_irqrestore(&device_domain_lock, flags);
2544 /* PASID table is mandatory for a PCI device in scalable mode. */
2545 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2546 ret = intel_pasid_alloc_table(dev);
2548 dev_err(dev, "PASID table allocation failed\n");
2549 dmar_remove_one_dev_info(domain, dev);
2553 /* Set up the PASID entry for requests without PASID: */
2554 spin_lock(&iommu->lock);
2555 if (hw_pass_through && domain_type_is_si(domain))
2556 ret = intel_pasid_setup_pass_through(iommu, domain,
2557 dev, PASID_RID2PASID);
2559 ret = intel_pasid_setup_second_level(iommu, domain,
2560 dev, PASID_RID2PASID);
2561 spin_unlock(&iommu->lock);
2563 dev_err(dev, "Setup RID2PASID failed\n");
2564 dmar_remove_one_dev_info(domain, dev);
2569 if (dev && domain_context_mapping(domain, dev)) {
2570 dev_err(dev, "Domain context map failed\n");
2571 dmar_remove_one_dev_info(domain, dev);
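/*
 * Editorial note on the setup order above: for a PCI device behind a
 * scalable-mode IOMMU, dmar_insert_one_dev_info() first allocates the
 * PASID table, then installs the RID2PASID entry (PASID 0, used for
 * requests that carry no PASID), and only then writes the context
 * entry; any failure unwinds through dmar_remove_one_dev_info().
 */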
2578 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2580 *(u16 *)opaque = alias;
2584 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2586 struct device_domain_info *info;
2587 struct dmar_domain *domain = NULL;
2588 struct intel_iommu *iommu;
2590 unsigned long flags;
2593 iommu = device_to_iommu(dev, &bus, &devfn);
2597 if (dev_is_pci(dev)) {
2598 struct pci_dev *pdev = to_pci_dev(dev);
2600 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2602 spin_lock_irqsave(&device_domain_lock, flags);
2603 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2604 PCI_BUS_NUM(dma_alias),
2607 iommu = info->iommu;
2608 domain = info->domain;
2610 spin_unlock_irqrestore(&device_domain_lock, flags);
2612 /* DMA alias already has a domain, use it */
2617 /* Allocate and initialize new domain for the device */
2618 domain = alloc_domain(0);
2621 if (domain_init(domain, iommu, gaw)) {
2622 domain_exit(domain);
2631 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2632 struct dmar_domain *domain)
2634 struct intel_iommu *iommu;
2635 struct dmar_domain *tmp;
2636 u16 req_id, dma_alias;
2639 iommu = device_to_iommu(dev, &bus, &devfn);
2643 req_id = ((u16)bus << 8) | devfn;
2645 if (dev_is_pci(dev)) {
2646 struct pci_dev *pdev = to_pci_dev(dev);
2648 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2650 /* register PCI DMA alias device */
2651 if (req_id != dma_alias) {
2652 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2653 dma_alias & 0xff, NULL, domain);
2655 if (!tmp || tmp != domain)
2660 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2661 if (!tmp || tmp != domain)
2667 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2669 struct dmar_domain *domain, *tmp;
2671 domain = find_domain(dev);
2675 domain = find_or_alloc_domain(dev, gaw);
2679 tmp = set_domain_for_dev(dev, domain);
2680 if (!tmp || domain != tmp) {
2681 domain_exit(domain);
2690 static int iommu_domain_identity_map(struct dmar_domain *domain,
2691 unsigned long long start,
2692 unsigned long long end)
2694 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2695 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2697 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2698 dma_to_mm_pfn(last_vpfn))) {
2699 pr_err("Reserving iova failed\n");
2703 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2705 * RMRR range might have overlap with physical memory range,
2708 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2710 return __domain_mapping(domain, first_vpfn, NULL,
2711 first_vpfn, last_vpfn - first_vpfn + 1,
2712 DMA_PTE_READ|DMA_PTE_WRITE);
2715 static int domain_prepare_identity_map(struct device *dev,
2716 struct dmar_domain *domain,
2717 unsigned long long start,
2718 unsigned long long end)
2720 /* For _hardware_ passthrough, don't bother. But for software
2721 passthrough, we do it anyway -- it may indicate a memory
2722 range which is reserved in E820 and so didn't get set
2723 up to start with in si_domain */
2724 if (domain == si_domain && hw_pass_through) {
2725 dev_warn(dev, "Ignoring identity map for HW passthrough [0x%Lx - 0x%Lx]\n",
2730 dev_info(dev, "Setting identity map [0x%Lx - 0x%Lx]\n", start, end);
2733 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2734 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2735 dmi_get_system_info(DMI_BIOS_VENDOR),
2736 dmi_get_system_info(DMI_BIOS_VERSION),
2737 dmi_get_system_info(DMI_PRODUCT_VERSION));
2741 if (end >> agaw_to_width(domain->agaw)) {
2742 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2743 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2744 agaw_to_width(domain->agaw),
2745 dmi_get_system_info(DMI_BIOS_VENDOR),
2746 dmi_get_system_info(DMI_BIOS_VERSION),
2747 dmi_get_system_info(DMI_PRODUCT_VERSION));
2751 return iommu_domain_identity_map(domain, start, end);
2754 static int iommu_prepare_identity_map(struct device *dev,
2755 unsigned long long start,
2756 unsigned long long end)
2758 struct dmar_domain *domain;
2761 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2765 ret = domain_prepare_identity_map(dev, domain, start, end);
2767 domain_exit(domain);
2772 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2775 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2777 return iommu_prepare_identity_map(dev, rmrr->base_address,
2781 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2782 static inline void iommu_prepare_isa(void)
2784 struct pci_dev *pdev;
2787 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2791 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2792 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2795 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2800 static inline void iommu_prepare_isa(void)
2804 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2806 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2808 static int __init si_domain_init(int hw)
2812 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2816 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2817 domain_exit(si_domain);
2821 pr_debug("Identity mapping domain allocated\n");
2826 for_each_online_node(nid) {
2827 unsigned long start_pfn, end_pfn;
2830 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2831 ret = iommu_domain_identity_map(si_domain,
2832 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
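/*
 * Example (illustrative): for a node whose only usable range is
 * [start_pfn, end_pfn) = [0x1, 0x80000), the loop above identity-maps
 * PFN_PHYS(0x1) = 0x1000 up to PFN_PHYS(0x80000) = 2GiB into
 * si_domain, so 1:1 DMA keeps working for every page of RAM.
 */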
2841 static int identity_mapping(struct device *dev)
2843 struct device_domain_info *info;
2845 if (likely(!iommu_identity_mapping))
2848 info = dev->archdata.iommu;
2849 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2850 return (info->domain == si_domain);
2855 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2857 struct dmar_domain *ndomain;
2858 struct intel_iommu *iommu;
2861 iommu = device_to_iommu(dev, &bus, &devfn);
2865 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2866 if (ndomain != domain)
2872 static bool device_has_rmrr(struct device *dev)
2874 struct dmar_rmrr_unit *rmrr;
2879 for_each_rmrr_units(rmrr) {
2881 * Return TRUE if this RMRR contains the device that we are checking
2884 for_each_active_dev_scope(rmrr->devices,
2885 rmrr->devices_cnt, i, tmp)
2896 * There are a couple cases where we need to restrict the functionality of
2897 * devices associated with RMRRs. The first is when evaluating a device for
2898 * identity mapping because problems exist when devices are moved in and out
2899 * of domains and their respective RMRR information is lost. This means that
2900 * a device with associated RMRRs will never be in a "passthrough" domain.
2901 * The second is use of the device through the IOMMU API. This interface
2902 * expects to have full control of the IOVA space for the device. We cannot
2903 * satisfy both the requirement that RMRR access is maintained and have an
2904 * unencumbered IOVA space. We also have no ability to quiesce the device's
2905 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2906 * We therefore prevent devices associated with an RMRR from participating in
2907 * the IOMMU API, which eliminates them from device assignment.
2909 * In both cases we assume that PCI USB devices with RMRRs have them largely
2910 * for historical reasons and that the RMRR space is not actively used post
2911 * boot. This exclusion may change if vendors begin to abuse it.
2913 * The same exception is made for graphics devices, with the requirement that
2914 * any use of the RMRR regions will be torn down before assigning the device
2917 static bool device_is_rmrr_locked(struct device *dev)
2919 if (!device_has_rmrr(dev))
2922 if (dev_is_pci(dev)) {
2923 struct pci_dev *pdev = to_pci_dev(dev);
2925 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2932 static int iommu_should_identity_map(struct device *dev, int startup)
2934 if (dev_is_pci(dev)) {
2935 struct pci_dev *pdev = to_pci_dev(dev);
2937 if (device_is_rmrr_locked(dev))
2941 * Prevent any device marked as untrusted from getting
2942 * placed into the statically identity mapping domain.
2944 if (pdev->untrusted)
2947 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2950 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2953 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2957 * We want to start off with all devices in the 1:1 domain, and
2958 * take them out later if we find they can't access all of memory.
2960 * However, we can't do this for PCI devices behind bridges,
2961 * because all PCI devices behind the same bridge will end up
2962 * with the same source-id on their transactions.
2964 * Practically speaking, we can't change things around for these
2965 * devices at run-time, because we can't be sure there'll be no
2966 * DMA transactions in flight for any of their siblings.
2968 * So PCI devices (unless they're on the root bus) as well as
2969 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2970 * the 1:1 domain, just in _case_ one of their siblings turns out
2971 * not to be able to map all of memory.
2973 if (!pci_is_pcie(pdev)) {
2974 if (!pci_is_root_bus(pdev->bus))
2976 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2978 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2981 if (device_has_rmrr(dev))
2986 * At boot time, we don't yet know if devices will be 64-bit capable.
2987 * Assume that they will -- if they turn out not to be, then we can
2988 * take them out of the 1:1 domain later.
2992 * If the device's dma_mask is less than the system's memory
2993 * size then this is not a candidate for identity mapping.
2995 u64 dma_mask = *dev->dma_mask;
2997 if (dev->coherent_dma_mask &&
2998 dev->coherent_dma_mask < dma_mask)
2999 dma_mask = dev->coherent_dma_mask;
3001 return dma_mask >= dma_get_required_mask(dev);
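/*
 * Worked example (editorial): a device with a 32-bit dma_mask on a
 * host with 8GiB of RAM sees dma_get_required_mask() of roughly
 * DMA_BIT_MASK(33), so the comparison fails and the device stays out
 * of the 1:1 domain; it will use the translated path instead.
 */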
3007 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
3011 if (!iommu_should_identity_map(dev, 1))
3014 ret = domain_add_dev_info(si_domain, dev);
3016 dev_info(dev, "%s identity mapping\n",
3017 hw ? "Hardware" : "Software");
3018 else if (ret == -ENODEV)
3019 /* device not associated with an iommu */
3026 static int __init iommu_prepare_static_identity_mapping(int hw)
3028 struct pci_dev *pdev = NULL;
3029 struct dmar_drhd_unit *drhd;
3030 struct intel_iommu *iommu;
3035 for_each_pci_dev(pdev) {
3036 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
3041 for_each_active_iommu(iommu, drhd)
3042 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
3043 struct acpi_device_physical_node *pn;
3044 struct acpi_device *adev;
3046 if (dev->bus != &acpi_bus_type)
3049 adev = to_acpi_device(dev);
3050 mutex_lock(&adev->physical_node_lock);
3051 list_for_each_entry(pn, &adev->physical_node_list, node) {
3052 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
3056 mutex_unlock(&adev->physical_node_lock);
3064 static void intel_iommu_init_qi(struct intel_iommu *iommu)
3067 * Start from a sane iommu hardware state.
3068 * If queued invalidation was already initialized by us
3069 * (for example, while enabling interrupt remapping) then
3070 * things are already rolling from a sane state.
3074 * Clear any previous faults.
3076 dmar_fault(-1, iommu);
3078 * Disable queued invalidation if supported and already enabled
3079 * before OS handover.
3081 dmar_disable_qi(iommu);
3084 if (dmar_enable_qi(iommu)) {
3086 * Queued invalidation is not enabled; use register-based invalidation
3088 iommu->flush.flush_context = __iommu_flush_context;
3089 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
3090 pr_info("%s: Using Register based invalidation\n",
3093 iommu->flush.flush_context = qi_flush_context;
3094 iommu->flush.flush_iotlb = qi_flush_iotlb;
3095 pr_info("%s: Using Queued invalidation\n", iommu->name);
3099 static int copy_context_table(struct intel_iommu *iommu,
3100 struct root_entry *old_re,
3101 struct context_entry **tbl,
3104 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
3105 struct context_entry *new_ce = NULL, ce;
3106 struct context_entry *old_ce = NULL;
3107 struct root_entry re;
3108 phys_addr_t old_ce_phys;
3110 tbl_idx = ext ? bus * 2 : bus;
3111 memcpy(&re, old_re, sizeof(re));
3113 for (devfn = 0; devfn < 256; devfn++) {
3114 /* First calculate the correct index */
3115 idx = (ext ? devfn * 2 : devfn) % 256;
3118 /* First save what we may have and clean up */
3120 tbl[tbl_idx] = new_ce;
3121 __iommu_flush_cache(iommu, new_ce,
3131 old_ce_phys = root_entry_lctp(&re);
3133 old_ce_phys = root_entry_uctp(&re);
3136 if (ext && devfn == 0) {
3137 /* No LCTP, try UCTP */
3146 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3151 new_ce = alloc_pgtable_page(iommu->node);
3158 /* Now copy the context entry */
3159 memcpy(&ce, old_ce + idx, sizeof(ce));
3161 if (!__context_present(&ce))
3164 did = context_domain_id(&ce);
3165 if (did >= 0 && did < cap_ndoms(iommu->cap))
3166 set_bit(did, iommu->domain_ids);
3169 * We need a marker for copied context entries. This
3170 * marker needs to work for the old format as well as
3171 * for extended context entries.
3173 * Bit 67 of the context entry is used. In the old
3174 * format this bit is available to software, in the
3175 * extended format it is the PGE bit, but PGE is ignored
3176 * by HW if PASIDs are disabled (and thus still available).
3179 * So disable PASIDs first and then mark the entry
3180 * copied. This means that we don't copy PASID
3181 * translations from the old kernel, but this is fine as
3182 * faults there are not fatal.
3184 context_clear_pasid_enable(&ce);
3185 context_set_copied(&ce);
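/*
 * Illustrative sketch (assumed helper layout, not copied from the
 * header): with the context entry stored as two u64 words, bit 67 of
 * the entry lives in bit (67 - 64) == 3 of the high word, so the
 * marker helper plausibly reduces to:
 *
 *	static inline void context_set_copied(struct context_entry *ce)
 *	{
 *		ce->hi |= 1ULL << 3;
 *	}
 */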
3190 tbl[tbl_idx + pos] = new_ce;
3192 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3201 static int copy_translation_tables(struct intel_iommu *iommu)
3203 struct context_entry **ctxt_tbls;
3204 struct root_entry *old_rt;
3205 phys_addr_t old_rt_phys;
3206 int ctxt_table_entries;
3207 unsigned long flags;
3212 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3213 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3214 new_ext = !!ecap_ecs(iommu->ecap);
3217 * The RTT bit can only be changed when translation is disabled,
3218 * but disabling translation would open a window for data
3219 * corruption. So bail out and don't copy anything if we would
3220 * have to change the bit.
3225 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3229 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3233 /* This is too big for the stack - allocate it from slab */
3234 ctxt_table_entries = ext ? 512 : 256;
3236 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3240 for (bus = 0; bus < 256; bus++) {
3241 ret = copy_context_table(iommu, &old_rt[bus],
3242 ctxt_tbls, bus, ext);
3244 pr_err("%s: Failed to copy context table for bus %d\n",
3250 spin_lock_irqsave(&iommu->lock, flags);
3252 /* Context tables are copied, now write them to the root_entry table */
3253 for (bus = 0; bus < 256; bus++) {
3254 int idx = ext ? bus * 2 : bus;
3257 if (ctxt_tbls[idx]) {
3258 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3259 iommu->root_entry[bus].lo = val;
3262 if (!ext || !ctxt_tbls[idx + 1])
3265 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3266 iommu->root_entry[bus].hi = val;
3269 spin_unlock_irqrestore(&iommu->lock, flags);
3273 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
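/*
 * Example (editorial): with an extended root table (ext == true), bus
 * 0x42 uses idx = 0x42 * 2 = 0x84 for devfn 0-127 (root_entry.lo) and
 * idx + 1 = 0x85 for devfn 128-255 (root_entry.hi); bit 0 of each
 * written value is the present bit, hence the "| 1" above.
 */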
3283 static int __init init_dmars(void)
3285 struct dmar_drhd_unit *drhd;
3286 struct dmar_rmrr_unit *rmrr;
3287 bool copied_tables = false;
3289 struct intel_iommu *iommu;
3295 * initialize and program root entry to not present
3298 for_each_drhd_unit(drhd) {
3300 * lock not needed as this is only incremented in the single-
3301 * threaded kernel __init code path; all other accesses are read-only
3304 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3308 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3311 /* Preallocate enough resources for IOMMU hot-addition */
3312 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3313 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3315 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3318 pr_err("Allocating global iommu array failed\n");
3323 for_each_active_iommu(iommu, drhd) {
3325 * Find the max pasid size of all IOMMUs in the system.
3326 * We need to ensure the system pasid table is no bigger
3327 * than the smallest supported.
3329 if (pasid_supported(iommu)) {
3330 u32 temp = 2 << ecap_pss(iommu->ecap);
3332 intel_pasid_max_id = min_t(u32, temp,
3333 intel_pasid_max_id);
3336 g_iommus[iommu->seq_id] = iommu;
3338 intel_iommu_init_qi(iommu);
3340 ret = iommu_init_domains(iommu);
3344 init_translation_status(iommu);
3346 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3347 iommu_disable_translation(iommu);
3348 clear_translation_pre_enabled(iommu);
3349 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3355 * we could share the same root & context tables
3356 * among all IOMMUs; this needs to be split out later.
3358 ret = iommu_alloc_root_entry(iommu);
3362 if (translation_pre_enabled(iommu)) {
3363 pr_info("Translation already enabled - trying to copy translation structures\n");
3365 ret = copy_translation_tables(iommu);
3368 * We found the IOMMU with translation
3369 * enabled - but failed to copy over the
3370 * old root-entry table. Try to proceed
3371 * by disabling translation now and
3372 * allocating a clean root-entry table.
3373 * This might cause DMAR faults, but
3374 * probably the dump will still succeed.
3376 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3378 iommu_disable_translation(iommu);
3379 clear_translation_pre_enabled(iommu);
3381 pr_info("Copied translation tables from previous kernel for %s\n",
3383 copied_tables = true;
3387 if (!ecap_pass_through(iommu->ecap))
3388 hw_pass_through = 0;
3389 #ifdef CONFIG_INTEL_IOMMU_SVM
3390 if (pasid_supported(iommu))
3391 intel_svm_init(iommu);
3396 * Now that qi is enabled on all iommus, set the root entry and flush
3397 * caches. This is required on some Intel X58 chipsets; otherwise the
3398 * flush_context function will loop forever and the boot hangs.
3400 for_each_active_iommu(iommu, drhd) {
3401 iommu_flush_write_buffer(iommu);
3402 iommu_set_root_entry(iommu);
3403 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3404 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3407 if (iommu_pass_through)
3408 iommu_identity_mapping |= IDENTMAP_ALL;
3410 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3411 iommu_identity_mapping |= IDENTMAP_GFX;
3414 check_tylersburg_isoch();
3416 if (iommu_identity_mapping) {
3417 ret = si_domain_init(hw_pass_through);
3424 * If we copied translations from a previous kernel in the kdump
3425 * case, we cannot assign the devices to domains now, as that
3426 * would eliminate the old mappings. So skip this part and defer
3427 * the assignment to device driver initialization time.
3433 * If pass-through is not set or not enabled, set up context entries for
3434 * identity mappings for rmrr, gfx, and isa, and possibly fall back to static
3435 * identity mapping if iommu_identity_mapping is set.
3437 if (iommu_identity_mapping) {
3438 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3440 pr_crit("Failed to setup IOMMU pass-through\n");
3446 * for each dev attached to rmrr
3448 * locate drhd for dev, alloc domain for dev
3449 * allocate free domain
3450 * allocate page table entries for rmrr
3451 * if context not allocated for bus
3452 * allocate and init context
3453 * set present in root table for this bus
3454 * init context with domain, translation etc
3458 pr_info("Setting RMRR:\n");
3459 for_each_rmrr_units(rmrr) {
3460 /* some BIOSes list non-existent devices in the DMAR table. */
3461 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3463 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3465 pr_err("Mapping reserved region failed\n");
3469 iommu_prepare_isa();
3476 * global invalidate context cache
3477 * global invalidate iotlb
3478 * enable translation
3480 for_each_iommu(iommu, drhd) {
3481 if (drhd->ignored) {
3483 * we always have to disable PMRs or DMA may fail on this device
3487 iommu_disable_protect_mem_regions(iommu);
3491 iommu_flush_write_buffer(iommu);
3493 #ifdef CONFIG_INTEL_IOMMU_SVM
3494 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3495 ret = intel_svm_enable_prq(iommu);
3500 ret = dmar_set_interrupt(iommu);
3504 if (!translation_pre_enabled(iommu))
3505 iommu_enable_translation(iommu);
3507 iommu_disable_protect_mem_regions(iommu);
3513 for_each_active_iommu(iommu, drhd) {
3514 disable_dmar_iommu(iommu);
3515 free_dmar_iommu(iommu);
3524 /* This takes a number of _MM_ pages, not VTD pages */
3525 static unsigned long intel_alloc_iova(struct device *dev,
3526 struct dmar_domain *domain,
3527 unsigned long nrpages, uint64_t dma_mask)
3529 unsigned long iova_pfn;
3531 /* Restrict dma_mask to the width that the iommu can handle */
3532 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3533 /* Ensure we reserve the whole size-aligned region */
3534 nrpages = __roundup_pow_of_two(nrpages);
3536 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3538 * First try to allocate an io virtual address in
3539 * DMA_BIT_MASK(32) and if that fails then try allocating
3542 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3543 IOVA_PFN(DMA_BIT_MASK(32)), false);
3547 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3548 IOVA_PFN(dma_mask), true);
3549 if (unlikely(!iova_pfn)) {
3550 dev_err(dev, "Allocating %ld-page iova failed", nrpages);
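/*
 * Example (editorial): a 3-page request is rounded up by
 * __roundup_pow_of_two() to 4 pages, which keeps the allocated IOVA
 * region size-aligned as required above; a device with a mask wider
 * than 32 bits first tries below 4GiB and only retries with the full
 * mask if that range is exhausted.
 */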
3557 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3559 struct dmar_domain *domain, *tmp;
3560 struct dmar_rmrr_unit *rmrr;
3561 struct device *i_dev;
3564 domain = find_domain(dev);
3568 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3572 /* We have a new domain - set up possible RMRRs for the device */
3574 for_each_rmrr_units(rmrr) {
3575 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3580 ret = domain_prepare_identity_map(dev, domain,
3584 dev_err(dev, "Mapping reserved region failed\n");
3589 tmp = set_domain_for_dev(dev, domain);
3590 if (!tmp || domain != tmp) {
3591 domain_exit(domain);
3598 dev_err(dev, "Allocating domain failed\n");
3604 /* Check if the device needs to go through the non-identity map and unmap process. */
3605 static int iommu_no_mapping(struct device *dev)
3609 if (iommu_dummy(dev))
3612 if (!iommu_identity_mapping)
3615 found = identity_mapping(dev);
3617 if (iommu_should_identity_map(dev, 0))
3621 * The device does 32-bit DMA: remove it from si_domain and fall
3622 * back to a non-identity mapping.
3624 dmar_remove_one_dev_info(si_domain, dev);
3625 dev_info(dev, "32bit DMA uses non-identity mapping\n");
3630 * If a 64-bit DMA device was detached from a VM, the device
3631 * is put back into si_domain for identity mapping.
3633 if (iommu_should_identity_map(dev, 0)) {
3635 ret = domain_add_dev_info(si_domain, dev);
3637 dev_info(dev, "64bit DMA uses identity mapping\n");
3646 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3647 size_t size, int dir, u64 dma_mask)
3649 struct dmar_domain *domain;
3650 phys_addr_t start_paddr;
3651 unsigned long iova_pfn;
3654 struct intel_iommu *iommu;
3655 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3657 BUG_ON(dir == DMA_NONE);
3659 if (iommu_no_mapping(dev))
3662 domain = get_valid_domain_for_dev(dev);
3664 return DMA_MAPPING_ERROR;
3666 iommu = domain_get_iommu(domain);
3667 size = aligned_nrpages(paddr, size);
3669 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3674 * Check if DMAR supports zero-length reads on write-only
3677 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3678 !cap_zlr(iommu->cap))
3679 prot |= DMA_PTE_READ;
3680 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3681 prot |= DMA_PTE_WRITE;
3683 * The range paddr..(paddr + size) might span a partial page, so map the whole
3684 * page. Note: if two parts of one page are mapped separately, we
3685 * might have two guest_addr mappings to the same host paddr, but this
3686 * is not a big problem
3688 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3689 mm_to_dma_pfn(paddr_pfn), size, prot);
3693 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3694 start_paddr += paddr & ~PAGE_MASK;
3699 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3700 dev_err(dev, "Device request: %zx@%llx dir %d --- failed\n",
3701 size, (unsigned long long)paddr, dir);
3702 return DMA_MAPPING_ERROR;
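/*
 * Worked example (editorial): mapping paddr 0x12345678 with size 0x100
 * stays within one 4KiB page, so aligned_nrpages() yields 1; the whole
 * page at 0x12345000 gets mapped, and the returned handle preserves
 * the byte offset because start_paddr += paddr & ~PAGE_MASK.
 */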
3705 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3706 unsigned long offset, size_t size,
3707 enum dma_data_direction dir,
3708 unsigned long attrs)
3710 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3711 dir, *dev->dma_mask);
3714 static dma_addr_t intel_map_resource(struct device *dev, phys_addr_t phys_addr,
3715 size_t size, enum dma_data_direction dir,
3716 unsigned long attrs)
3718 return __intel_map_single(dev, phys_addr, size, dir, *dev->dma_mask);
3721 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3723 struct dmar_domain *domain;
3724 unsigned long start_pfn, last_pfn;
3725 unsigned long nrpages;
3726 unsigned long iova_pfn;
3727 struct intel_iommu *iommu;
3728 struct page *freelist;
3730 if (iommu_no_mapping(dev))
3733 domain = find_domain(dev);
3736 iommu = domain_get_iommu(domain);
3738 iova_pfn = IOVA_PFN(dev_addr);
3740 nrpages = aligned_nrpages(dev_addr, size);
3741 start_pfn = mm_to_dma_pfn(iova_pfn);
3742 last_pfn = start_pfn + nrpages - 1;
3744 dev_dbg(dev, "Device unmapping: pfn %lx-%lx\n", start_pfn, last_pfn);
3746 freelist = domain_unmap(domain, start_pfn, last_pfn);
3748 if (intel_iommu_strict) {
3749 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3750 nrpages, !freelist, 0);
3752 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3753 dma_free_pagelist(freelist);
3755 queue_iova(&domain->iovad, iova_pfn, nrpages,
3756 (unsigned long)freelist);
3758 * queue up the release of the unmap to save roughly 1/6th of the
3759 * CPU time used up by the iotlb flush operation...
3764 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3765 size_t size, enum dma_data_direction dir,
3766 unsigned long attrs)
3768 intel_unmap(dev, dev_addr, size);
3771 static void *intel_alloc_coherent(struct device *dev, size_t size,
3772 dma_addr_t *dma_handle, gfp_t flags,
3773 unsigned long attrs)
3775 struct page *page = NULL;
3778 size = PAGE_ALIGN(size);
3779 order = get_order(size);
3781 if (!iommu_no_mapping(dev))
3782 flags &= ~(GFP_DMA | GFP_DMA32);
3783 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3784 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3790 if (gfpflags_allow_blocking(flags)) {
3791 unsigned int count = size >> PAGE_SHIFT;
3793 page = dma_alloc_from_contiguous(dev, count, order,
3794 flags & __GFP_NOWARN);
3795 if (page && iommu_no_mapping(dev) &&
3796 page_to_phys(page) + size > dev->coherent_dma_mask) {
3797 dma_release_from_contiguous(dev, page, count);
3803 page = alloc_pages(flags, order);
3806 memset(page_address(page), 0, size);
3808 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3810 dev->coherent_dma_mask);
3811 if (*dma_handle != DMA_MAPPING_ERROR)
3812 return page_address(page);
3813 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3814 __free_pages(page, order);
3819 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3820 dma_addr_t dma_handle, unsigned long attrs)
3823 struct page *page = virt_to_page(vaddr);
3825 size = PAGE_ALIGN(size);
3826 order = get_order(size);
3828 intel_unmap(dev, dma_handle, size);
3829 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3830 __free_pages(page, order);
3833 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3834 int nelems, enum dma_data_direction dir,
3835 unsigned long attrs)
3837 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3838 unsigned long nrpages = 0;
3839 struct scatterlist *sg;
3842 for_each_sg(sglist, sg, nelems, i) {
3843 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3846 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3849 static int intel_nontranslate_map_sg(struct device *hddev,
3850 struct scatterlist *sglist, int nelems, int dir)
3853 struct scatterlist *sg;
3855 for_each_sg(sglist, sg, nelems, i) {
3856 BUG_ON(!sg_page(sg));
3857 sg->dma_address = sg_phys(sg);
3858 sg->dma_length = sg->length;
3863 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3864 enum dma_data_direction dir, unsigned long attrs)
3867 struct dmar_domain *domain;
3870 unsigned long iova_pfn;
3872 struct scatterlist *sg;
3873 unsigned long start_vpfn;
3874 struct intel_iommu *iommu;
3876 BUG_ON(dir == DMA_NONE);
3877 if (iommu_no_mapping(dev))
3878 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3880 domain = get_valid_domain_for_dev(dev);
3884 iommu = domain_get_iommu(domain);
3886 for_each_sg(sglist, sg, nelems, i)
3887 size += aligned_nrpages(sg->offset, sg->length);
3889 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3892 sglist->dma_length = 0;
3897 * Check if DMAR supports zero-length reads on write-only
3900 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3901 !cap_zlr(iommu->cap))
3902 prot |= DMA_PTE_READ;
3903 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3904 prot |= DMA_PTE_WRITE;
3906 start_vpfn = mm_to_dma_pfn(iova_pfn);
3908 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3909 if (unlikely(ret)) {
3910 dma_pte_free_pagetable(domain, start_vpfn,
3911 start_vpfn + size - 1,
3912 agaw_to_level(domain->agaw) + 1);
3913 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3920 static const struct dma_map_ops intel_dma_ops = {
3921 .alloc = intel_alloc_coherent,
3922 .free = intel_free_coherent,
3923 .map_sg = intel_map_sg,
3924 .unmap_sg = intel_unmap_sg,
3925 .map_page = intel_map_page,
3926 .unmap_page = intel_unmap_page,
3927 .map_resource = intel_map_resource,
3928 .unmap_resource = intel_unmap_page,
3929 .dma_supported = dma_direct_supported,
3932 static inline int iommu_domain_cache_init(void)
3936 iommu_domain_cache = kmem_cache_create("iommu_domain",
3937 sizeof(struct dmar_domain),
3942 if (!iommu_domain_cache) {
3943 pr_err("Couldn't create iommu_domain cache\n");
3950 static inline int iommu_devinfo_cache_init(void)
3954 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3955 sizeof(struct device_domain_info),
3959 if (!iommu_devinfo_cache) {
3960 pr_err("Couldn't create devinfo cache\n");
3967 static int __init iommu_init_mempool(void)
3970 ret = iova_cache_get();
3974 ret = iommu_domain_cache_init();
3978 ret = iommu_devinfo_cache_init();
3982 kmem_cache_destroy(iommu_domain_cache);
3989 static void __init iommu_exit_mempool(void)
3991 kmem_cache_destroy(iommu_devinfo_cache);
3992 kmem_cache_destroy(iommu_domain_cache);
3996 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3998 struct dmar_drhd_unit *drhd;
4002 /* We know that this device on this chipset has its own IOMMU.
4003 * If we find it under a different IOMMU, then the BIOS is lying
4004 * to us. Hope that the IOMMU for this device is actually
4005 * disabled, and it needs no translation...
4007 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
4009 /* "can't" happen */
4010 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
4013 vtbar &= 0xffff0000;
4015 /* we know that this iommu should be at offset 0xa000 from vtbar */
4016 drhd = dmar_find_matched_drhd_unit(pdev);
4017 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
4018 TAINT_FIRMWARE_WORKAROUND,
4019 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
4020 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4022 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
4024 static void __init init_no_remapping_devices(void)
4026 struct dmar_drhd_unit *drhd;
4030 for_each_drhd_unit(drhd) {
4031 if (!drhd->include_all) {
4032 for_each_active_dev_scope(drhd->devices,
4033 drhd->devices_cnt, i, dev)
4035 /* ignore DMAR unit if no devices exist */
4036 if (i == drhd->devices_cnt)
4041 for_each_active_drhd_unit(drhd) {
4042 if (drhd->include_all)
4045 for_each_active_dev_scope(drhd->devices,
4046 drhd->devices_cnt, i, dev)
4047 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
4049 if (i < drhd->devices_cnt)
4052 /* This IOMMU has *only* gfx devices. Either bypass it or
4053 set the gfx_mapped flag, as appropriate */
4055 intel_iommu_gfx_mapped = 1;
4058 for_each_active_dev_scope(drhd->devices,
4059 drhd->devices_cnt, i, dev)
4060 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
4065 #ifdef CONFIG_SUSPEND
4066 static int init_iommu_hw(void)
4068 struct dmar_drhd_unit *drhd;
4069 struct intel_iommu *iommu = NULL;
4071 for_each_active_iommu(iommu, drhd)
4073 dmar_reenable_qi(iommu);
4075 for_each_iommu(iommu, drhd) {
4076 if (drhd->ignored) {
4078 * we always have to disable PMRs or DMA may fail on this device
4082 iommu_disable_protect_mem_regions(iommu);
4086 iommu_flush_write_buffer(iommu);
4088 iommu_set_root_entry(iommu);
4090 iommu->flush.flush_context(iommu, 0, 0, 0,
4091 DMA_CCMD_GLOBAL_INVL);
4092 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4093 iommu_enable_translation(iommu);
4094 iommu_disable_protect_mem_regions(iommu);
4100 static void iommu_flush_all(void)
4102 struct dmar_drhd_unit *drhd;
4103 struct intel_iommu *iommu;
4105 for_each_active_iommu(iommu, drhd) {
4106 iommu->flush.flush_context(iommu, 0, 0, 0,
4107 DMA_CCMD_GLOBAL_INVL);
4108 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4109 DMA_TLB_GLOBAL_FLUSH);
4113 static int iommu_suspend(void)
4115 struct dmar_drhd_unit *drhd;
4116 struct intel_iommu *iommu = NULL;
4119 for_each_active_iommu(iommu, drhd) {
4120 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4122 if (!iommu->iommu_state)
4128 for_each_active_iommu(iommu, drhd) {
4129 iommu_disable_translation(iommu);
4131 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4133 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4134 readl(iommu->reg + DMAR_FECTL_REG);
4135 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4136 readl(iommu->reg + DMAR_FEDATA_REG);
4137 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4138 readl(iommu->reg + DMAR_FEADDR_REG);
4139 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4140 readl(iommu->reg + DMAR_FEUADDR_REG);
4142 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4147 for_each_active_iommu(iommu, drhd)
4148 kfree(iommu->iommu_state);
4153 static void iommu_resume(void)
4155 struct dmar_drhd_unit *drhd;
4156 struct intel_iommu *iommu = NULL;
4159 if (init_iommu_hw()) {
4161 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4163 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4167 for_each_active_iommu(iommu, drhd) {
4169 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4171 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4172 iommu->reg + DMAR_FECTL_REG);
4173 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4174 iommu->reg + DMAR_FEDATA_REG);
4175 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4176 iommu->reg + DMAR_FEADDR_REG);
4177 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4178 iommu->reg + DMAR_FEUADDR_REG);
4180 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4183 for_each_active_iommu(iommu, drhd)
4184 kfree(iommu->iommu_state);
4187 static struct syscore_ops iommu_syscore_ops = {
4188 .resume = iommu_resume,
4189 .suspend = iommu_suspend,
4192 static void __init init_iommu_pm_ops(void)
4194 register_syscore_ops(&iommu_syscore_ops);
4198 static inline void init_iommu_pm_ops(void) {}
4199 #endif /* CONFIG_SUSPEND */
4202 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4204 struct acpi_dmar_reserved_memory *rmrr;
4205 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4206 struct dmar_rmrr_unit *rmrru;
4209 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4213 rmrru->hdr = header;
4214 rmrr = (struct acpi_dmar_reserved_memory *)header;
4215 rmrru->base_address = rmrr->base_address;
4216 rmrru->end_address = rmrr->end_address;
4218 length = rmrr->end_address - rmrr->base_address + 1;
4219 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4224 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4225 ((void *)rmrr) + rmrr->header.length,
4226 &rmrru->devices_cnt);
4227 if (rmrru->devices_cnt && rmrru->devices == NULL)
4230 list_add(&rmrru->list, &dmar_rmrr_units);
4241 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4243 struct dmar_atsr_unit *atsru;
4244 struct acpi_dmar_atsr *tmp;
4246 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4247 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4248 if (atsr->segment != tmp->segment)
4250 if (atsr->header.length != tmp->header.length)
4252 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4259 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4261 struct acpi_dmar_atsr *atsr;
4262 struct dmar_atsr_unit *atsru;
4264 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4267 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4268 atsru = dmar_find_atsr(atsr);
4272 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4277 * If memory is allocated from slab by ACPI _DSM method, we need to
4278 * copy the memory content because the memory buffer will be freed on return
4281 atsru->hdr = (void *)(atsru + 1);
4282 memcpy(atsru->hdr, hdr, hdr->length);
4283 atsru->include_all = atsr->flags & 0x1;
4284 if (!atsru->include_all) {
4285 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4286 (void *)atsr + atsr->header.length,
4287 &atsru->devices_cnt);
4288 if (atsru->devices_cnt && atsru->devices == NULL) {
4294 list_add_rcu(&atsru->list, &dmar_atsr_units);
4299 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4301 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4305 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4307 struct acpi_dmar_atsr *atsr;
4308 struct dmar_atsr_unit *atsru;
4310 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4311 atsru = dmar_find_atsr(atsr);
4313 list_del_rcu(&atsru->list);
4315 intel_iommu_free_atsr(atsru);
4321 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4325 struct acpi_dmar_atsr *atsr;
4326 struct dmar_atsr_unit *atsru;
4328 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4329 atsru = dmar_find_atsr(atsr);
4333 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4334 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4342 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4345 struct intel_iommu *iommu = dmaru->iommu;
4347 if (g_iommus[iommu->seq_id])
4350 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4351 pr_warn("%s: Doesn't support hardware pass through.\n",
4355 if (!ecap_sc_support(iommu->ecap) &&
4356 domain_update_iommu_snooping(iommu)) {
4357 pr_warn("%s: Doesn't support snooping.\n",
4361 sp = domain_update_iommu_superpage(iommu) - 1;
4362 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4363 pr_warn("%s: Doesn't support large page.\n",
4369 * Disable translation if already enabled prior to OS handover.
4371 if (iommu->gcmd & DMA_GCMD_TE)
4372 iommu_disable_translation(iommu);
4374 g_iommus[iommu->seq_id] = iommu;
4375 ret = iommu_init_domains(iommu);
4377 ret = iommu_alloc_root_entry(iommu);
4381 #ifdef CONFIG_INTEL_IOMMU_SVM
4382 if (pasid_supported(iommu))
4383 intel_svm_init(iommu);
4386 if (dmaru->ignored) {
4388 * we always have to disable PMRs or DMA may fail on this device
4391 iommu_disable_protect_mem_regions(iommu);
4395 intel_iommu_init_qi(iommu);
4396 iommu_flush_write_buffer(iommu);
4398 #ifdef CONFIG_INTEL_IOMMU_SVM
4399 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4400 ret = intel_svm_enable_prq(iommu);
4405 ret = dmar_set_interrupt(iommu);
4409 iommu_set_root_entry(iommu);
4410 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4411 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4412 iommu_enable_translation(iommu);
4414 iommu_disable_protect_mem_regions(iommu);
4418 disable_dmar_iommu(iommu);
4420 free_dmar_iommu(iommu);
4424 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4427 struct intel_iommu *iommu = dmaru->iommu;
4429 if (!intel_iommu_enabled)
4435 ret = intel_iommu_add(dmaru);
4437 disable_dmar_iommu(iommu);
4438 free_dmar_iommu(iommu);
4444 static void intel_iommu_free_dmars(void)
4446 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4447 struct dmar_atsr_unit *atsru, *atsr_n;
4449 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4450 list_del(&rmrru->list);
4451 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4456 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4457 list_del(&atsru->list);
4458 intel_iommu_free_atsr(atsru);
4462 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4465 struct pci_bus *bus;
4466 struct pci_dev *bridge = NULL;
4468 struct acpi_dmar_atsr *atsr;
4469 struct dmar_atsr_unit *atsru;
4471 dev = pci_physfn(dev);
4472 for (bus = dev->bus; bus; bus = bus->parent) {
4474 /* If it's an integrated device, allow ATS */
4477 /* Connected via non-PCIe: no ATS */
4478 if (!pci_is_pcie(bridge) ||
4479 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4481 /* If we found the root port, look it up in the ATSR */
4482 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4487 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4488 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4489 if (atsr->segment != pci_domain_nr(dev->bus))
4492 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4493 if (tmp == &bridge->dev)
4496 if (atsru->include_all)
4506 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4509 struct dmar_rmrr_unit *rmrru;
4510 struct dmar_atsr_unit *atsru;
4511 struct acpi_dmar_atsr *atsr;
4512 struct acpi_dmar_reserved_memory *rmrr;
4514 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4517 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4518 rmrr = container_of(rmrru->hdr,
4519 struct acpi_dmar_reserved_memory, header);
4520 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4521 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4522 ((void *)rmrr) + rmrr->header.length,
4523 rmrr->segment, rmrru->devices,
4524 rmrru->devices_cnt);
4527 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4528 dmar_remove_dev_scope(info, rmrr->segment,
4529 rmrru->devices, rmrru->devices_cnt);
4533 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4534 if (atsru->include_all)
4537 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4538 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4539 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4540 (void *)atsr + atsr->header.length,
4541 atsr->segment, atsru->devices,
4542 atsru->devices_cnt);
4547 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4548 if (dmar_remove_dev_scope(info, atsr->segment,
4549 atsru->devices, atsru->devices_cnt))
4558 * Here we only respond to a device being unbound from its driver.
4560 * A newly added device is not attached to its DMAR domain here yet; that will
4561 * happen when the device is first mapped to an iova.
4563 static int device_notifier(struct notifier_block *nb,
4564 unsigned long action, void *data)
4566 struct device *dev = data;
4567 struct dmar_domain *domain;
4569 if (iommu_dummy(dev))
4572 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4575 domain = find_domain(dev);
4579 dmar_remove_one_dev_info(domain, dev);
4580 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4581 domain_exit(domain);
4586 static struct notifier_block device_nb = {
4587 .notifier_call = device_notifier,
4590 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4591 unsigned long val, void *v)
4593 struct memory_notify *mhp = v;
4594 unsigned long long start, end;
4595 unsigned long start_vpfn, last_vpfn;
4598 case MEM_GOING_ONLINE:
4599 start = mhp->start_pfn << PAGE_SHIFT;
4600 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4601 if (iommu_domain_identity_map(si_domain, start, end)) {
4602 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4609 case MEM_CANCEL_ONLINE:
4610 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4611 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4612 while (start_vpfn <= last_vpfn) {
4614 struct dmar_drhd_unit *drhd;
4615 struct intel_iommu *iommu;
4616 struct page *freelist;
4618 iova = find_iova(&si_domain->iovad, start_vpfn);
4620 pr_debug("Failed get IOVA for PFN %lx\n",
4625 iova = split_and_remove_iova(&si_domain->iovad, iova,
4626 start_vpfn, last_vpfn);
4628 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4629 start_vpfn, last_vpfn);
4633 freelist = domain_unmap(si_domain, iova->pfn_lo,
4637 for_each_active_iommu(iommu, drhd)
4638 iommu_flush_iotlb_psi(iommu, si_domain,
4639 iova->pfn_lo, iova_size(iova),
4642 dma_free_pagelist(freelist);
4644 start_vpfn = iova->pfn_hi + 1;
4645 free_iova_mem(iova);
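/*
 * Example (editorial): onlining a 128MiB section with start_pfn
 * 0x80000 and nr_pages 0x8000 identity-maps [0x80000000, 0x87ffffff]
 * above; the offline path walks the same range iova by iova, unmapping
 * and flushing each piece before handing it back to the allocator.
 */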
4653 static struct notifier_block intel_iommu_memory_nb = {
4654 .notifier_call = intel_iommu_memory_notifier,
4658 static void free_all_cpu_cached_iovas(unsigned int cpu)
4662 for (i = 0; i < g_num_of_iommus; i++) {
4663 struct intel_iommu *iommu = g_iommus[i];
4664 struct dmar_domain *domain;
4670 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4671 domain = get_iommu_domain(iommu, (u16)did);
4675 free_cpu_cached_iovas(cpu, &domain->iovad);
4680 static int intel_iommu_cpu_dead(unsigned int cpu)
4682 free_all_cpu_cached_iovas(cpu);
4686 static void intel_disable_iommus(void)
4688 struct intel_iommu *iommu = NULL;
4689 struct dmar_drhd_unit *drhd;
4691 for_each_iommu(iommu, drhd)
4692 iommu_disable_translation(iommu);
4695 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4697 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4699 return container_of(iommu_dev, struct intel_iommu, iommu);
4702 static ssize_t intel_iommu_show_version(struct device *dev,
4703 struct device_attribute *attr,
4706 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4707 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4708 return sprintf(buf, "%d:%d\n",
4709 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4711 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4713 static ssize_t intel_iommu_show_address(struct device *dev,
4714 struct device_attribute *attr,
4717 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4718 return sprintf(buf, "%llx\n", iommu->reg_phys);
4720 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4722 static ssize_t intel_iommu_show_cap(struct device *dev,
4723 struct device_attribute *attr,
4726 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4727 return sprintf(buf, "%llx\n", iommu->cap);
4729 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4731 static ssize_t intel_iommu_show_ecap(struct device *dev,
4732 struct device_attribute *attr,
4735 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4736 return sprintf(buf, "%llx\n", iommu->ecap);
4738 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4740 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4741 struct device_attribute *attr,
4744 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4745 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4747 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4749 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4750 struct device_attribute *attr,
4753 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4754 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4755 cap_ndoms(iommu->cap)));
4757 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
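/*
 * Usage note (editorial; exact paths depend on the sysfs registration
 * done at init time): once iommu_device_sysfs_add() has run, these
 * attributes are expected to appear under something like
 * /sys/class/iommu/dmar0/intel-iommu/{version,address,cap,ecap,
 * domains_supported,domains_used} for userspace tools to read.
 */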
4759 static struct attribute *intel_iommu_attrs[] = {
4760 &dev_attr_version.attr,
4761 &dev_attr_address.attr,
4763 &dev_attr_ecap.attr,
4764 &dev_attr_domains_supported.attr,
4765 &dev_attr_domains_used.attr,
4769 static struct attribute_group intel_iommu_group = {
4770 .name = "intel-iommu",
4771 .attrs = intel_iommu_attrs,
4774 const struct attribute_group *intel_iommu_groups[] = {
4779 static int __init platform_optin_force_iommu(void)
4781 struct pci_dev *pdev = NULL;
4782 bool has_untrusted_dev = false;
4784 if (!dmar_platform_optin() || no_platform_optin)
4787 for_each_pci_dev(pdev) {
4788 if (pdev->untrusted) {
4789 has_untrusted_dev = true;
4794 if (!has_untrusted_dev)
4797 if (no_iommu || dmar_disabled)
4798 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
4801 * If Intel-IOMMU is disabled by default, we will apply the identity
4802 * map to all devices except those marked as untrusted.
4805 iommu_identity_mapping |= IDENTMAP_ALL;
4808 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4816 int __init intel_iommu_init(void)
4819 struct dmar_drhd_unit *drhd;
4820 struct intel_iommu *iommu;
4823 * Intel IOMMU is required for a TXT/tboot launch or platform
4824 * opt in, so enforce that.
4826 force_on = tboot_force_iommu() || platform_optin_force_iommu();
4828 if (iommu_init_mempool()) {
4830 panic("tboot: Failed to initialize iommu memory\n");
4834 down_write(&dmar_global_lock);
4835 if (dmar_table_init()) {
4837 panic("tboot: Failed to initialize DMAR table\n");
4841 if (dmar_dev_scope_init() < 0) {
4843 panic("tboot: Failed to initialize DMAR device scope\n");
4847 up_write(&dmar_global_lock);
4850 * The bus notifier takes the dmar_global_lock, so lockdep will
4851 * complain later when we register it under the lock.
4853 dmar_register_bus_notifier();
4855 down_write(&dmar_global_lock);
4857 if (no_iommu || dmar_disabled) {
4859 * We exit the function here to ensure the IOMMU's remapping and
4860 * mempool aren't set up, which means that the IOMMU's PMRs
4861 * won't be disabled via the call to init_dmars(). So disable
4862 * them explicitly here. The PMRs were set up by tboot prior to
4863 * calling SENTER, but the kernel is expected to reset/tear
4866 if (intel_iommu_tboot_noforce) {
4867 for_each_iommu(iommu, drhd)
4868 iommu_disable_protect_mem_regions(iommu);
4872 * Make sure the IOMMUs are switched off, even when we
4873 * boot into a kexec kernel and the previous kernel left them enabled
4876 intel_disable_iommus();
4880 if (list_empty(&dmar_rmrr_units))
4881 pr_info("No RMRR found\n");
4883 if (list_empty(&dmar_atsr_units))
4884 pr_info("No ATSR found\n");
4886 if (dmar_init_reserved_ranges()) {
4888 panic("tboot: Failed to reserve iommu ranges\n");
4889 goto out_free_reserved_range;
4892 init_no_remapping_devices();
4897 panic("tboot: Failed to initialize DMARs\n");
4898 pr_err("Initialization failed\n");
4899 goto out_free_reserved_range;
4901 up_write(&dmar_global_lock);
4902 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4904 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4907 dma_ops = &intel_dma_ops;
4909 init_iommu_pm_ops();
4911 for_each_active_iommu(iommu, drhd) {
4912 iommu_device_sysfs_add(&iommu->iommu, NULL,
4915 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4916 iommu_device_register(&iommu->iommu);
4919 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4920 bus_register_notifier(&pci_bus_type, &device_nb);
4921 if (si_domain && !hw_pass_through)
4922 register_memory_notifier(&intel_iommu_memory_nb);
4923 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4924 intel_iommu_cpu_dead);
4925 intel_iommu_enabled = 1;
4926 intel_iommu_debugfs_init();
4930 out_free_reserved_range:
4931 put_iova_domain(&reserved_iova_list);
4933 intel_iommu_free_dmars();
4934 up_write(&dmar_global_lock);
4935 iommu_exit_mempool();
4939 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4941 struct intel_iommu *iommu = opaque;
4943 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
/*
 * NB - intel-iommu lacks any sort of reference counting for the users of
 * dependent devices. If multiple endpoints have intersecting dependent
 * devices, unbinding the driver from any one of them will possibly leave
 * the others unable to operate.
 */
static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
{
	if (!iommu || !dev || !dev_is_pci(dev))
		return;

	pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
}
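/* Unlink and free a device_domain_info; caller holds device_domain_lock. */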
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		if (dev_is_pci(info->dev) && sm_supported(iommu))
			intel_pasid_tear_down_entry(iommu, info->dev,
					PASID_RID2PASID);

		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
		intel_pasid_free_table(info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
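/*
 * Minimal setup for a domain created through the IOMMU API: reserve the
 * special IOVA ranges, derive the AGAW from the requested guest width
 * and allocate the top-level page directory.
 */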
static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
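/*
 * Only IOMMU_DOMAIN_UNMANAGED domains (e.g. for VFIO) are handed out
 * here; DMA API mappings go through the driver's own internal domains.
 */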
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
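/*
 * Attach may have to undo an existing (e.g. identity) context mapping
 * for the device, and to drop page-table levels so the domain's AGAW
 * fits what this IOMMU supports.
 */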
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		dev_err(dev, "%s: iommu width (%d) is not "
			"sufficient for the mapped address (%llx)\n",
			__func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
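/*
 * IOMMU_READ/WRITE/CACHE prot flags are translated into VT-d PTE bits
 * below; DMA_PTE_SNP is only set when the domain supports snooping.
 */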
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
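/* Walk the page table for one IOVA; an unmapped address yields 0. */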
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
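/*
 * IOMMU core hook: link the new device to its IOMMU in sysfs and place
 * it in the appropriate IOMMU group.
 */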
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
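/*
 * Report the device's RMRR ranges plus the IOAPIC window as reserved
 * regions, so users of the IOMMU API leave them alone.
 */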
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
#ifdef CONFIG_INTEL_IOMMU_SVM
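/*
 * Enable the PASID bit in the device's context entry and record the
 * device-IOTLB parameters that the SVM code will need.
 */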
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		ctx_lo |= CONTEXT_PASIDE;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
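/*
 * Device quirks: a handful of Intel chipsets have broken or misreported
 * DMAR support for their integrated graphics and need special handling.
 */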
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pci_info(dev, "Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
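/*
 * Ironlake/Calpella: if the BIOS left no stolen memory for a shadow
 * GTT (GGC_MEMORY_VT_ENABLED clear in the GGC register), graphics
 * translation cannot work, so turn the graphics IOMMU off.
 */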
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
		vtisochctrl);
}