2 * Copyright © 2006-2014 Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * Authors: David Woodhouse <dwmw2@infradead.org>,
14 * Ashok Raj <ashok.raj@intel.com>,
15 * Shaohua Li <shaohua.li@intel.com>,
16 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17 * Fenghua Yu <fenghua.yu@intel.com>
18 * Joerg Roedel <jroedel@suse.de>
21 #define pr_fmt(fmt) "DMAR: " fmt
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/cpu.h>
37 #include <linux/timer.h>
39 #include <linux/iova.h>
40 #include <linux/iommu.h>
41 #include <linux/intel-iommu.h>
42 #include <linux/syscore_ops.h>
43 #include <linux/tboot.h>
44 #include <linux/dmi.h>
45 #include <linux/pci-ats.h>
46 #include <linux/memblock.h>
47 #include <linux/dma-contiguous.h>
48 #include <linux/dma-direct.h>
49 #include <linux/crash_dump.h>
50 #include <asm/irq_remapping.h>
51 #include <asm/cacheflush.h>
52 #include <asm/iommu.h>
54 #include "irq_remapping.h"
55 #include "intel-pasid.h"
57 #define ROOT_SIZE VTD_PAGE_SIZE
58 #define CONTEXT_SIZE VTD_PAGE_SIZE
60 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
61 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
62 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
63 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
65 #define IOAPIC_RANGE_START (0xfee00000)
66 #define IOAPIC_RANGE_END (0xfeefffff)
67 #define IOVA_START_ADDR (0x1000)
69 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
71 #define MAX_AGAW_WIDTH 64
72 #define MAX_AGAW_PFN_WIDTH (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
74 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
75 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
77 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
78 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
79 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
80 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
81 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
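/*
 * Worked example (illustrative, not used by the driver): with a guest
 * address width of 48 bits and VTD_PAGE_SHIFT == 12,
 *	__DOMAIN_MAX_PFN(48)  == (1ULL << 36) - 1
 *	__DOMAIN_MAX_ADDR(48) == (1ULL << 48) - 1
 * and on a 32-bit kernel DOMAIN_MAX_PFN() additionally clamps the result
 * to ULONG_MAX so that PFNs always fit in an unsigned long.
 */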
83 /* IO virtual address start page frame number */
84 #define IOVA_START_PFN (1)
86 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
88 /* page table handling */
89 #define LEVEL_STRIDE (9)
90 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
93 * This bitmap is used to advertise the page sizes our hardware supports
94 * to the IOMMU core, which will then use this information to split
95 * physically contiguous memory regions it is mapping into the page sizes we support.
98 * Traditionally the IOMMU core just handed us the mappings directly,
99 * after making sure the size was a power-of-two multiple of 4KiB and
100 * that the mapping had natural alignment.
102 * To retain this behavior, we currently advertise that we support
103 * all page sizes that are a power-of-two multiple of 4KiB.
105 * If at some point we'd like to utilize the IOMMU core's new behavior,
106 * we could change this to advertise the real page sizes we support.
108 #define INTEL_IOMMU_PGSIZES (~0xFFFUL)
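/*
 * Reading the bitmap above (a sketch of the convention, assuming the core
 * treats bit N as "2^N byte pages supported"): ~0xFFFUL has bits 12 and
 * up set, i.e. 4KiB, 8KiB, 16KiB, ... are all advertised, which matches
 * the "every power-of-two multiple of 4KiB" behaviour described above.
 */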
110 static inline int agaw_to_level(int agaw)
115 static inline int agaw_to_width(int agaw)
117 return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
120 static inline int width_to_agaw(int width)
122 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
125 static inline unsigned int level_to_offset_bits(int level)
127 return (level - 1) * LEVEL_STRIDE;
130 static inline int pfn_level_offset(unsigned long pfn, int level)
132 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
135 static inline unsigned long level_mask(int level)
137 return -1UL << level_to_offset_bits(level);
140 static inline unsigned long level_size(int level)
142 return 1UL << level_to_offset_bits(level);
145 static inline unsigned long align_to_level(unsigned long pfn, int level)
147 return (pfn + level_size(level) - 1) & level_mask(level);
150 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
152 return 1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
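/*
 * How the helpers above fit together (worked example, illustrative only):
 * a 48-bit address width gives width_to_agaw(48) == DIV_ROUND_UP(18, 9)
 * == 2 and agaw_to_width(2) == 48; level_to_offset_bits(2) == 9, so
 * pfn_level_offset(pfn, 2) selects bits 9..17 of the page frame number,
 * and lvl_to_nr_pages(2) == 512 small pages per level-2 entry.
 */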
155 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
156 are never going to work. */
157 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
159 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
162 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
164 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
166 static inline unsigned long page_to_dma_pfn(struct page *pg)
168 return mm_to_dma_pfn(page_to_pfn(pg));
170 static inline unsigned long virt_to_dma_pfn(void *p)
172 return page_to_dma_pfn(virt_to_page(p));
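/*
 * Illustration: with 4KiB MM pages (PAGE_SHIFT == VTD_PAGE_SHIFT == 12)
 * the two PFN spaces coincide and these conversions are identities; if MM
 * pages were larger, e.g. 64KiB, one MM PFN would cover 16 VT-d PFNs,
 * which is why mm_to_dma_pfn() shifts left and dma_to_mm_pfn() shifts
 * right by the difference of the two page shifts.
 */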
175 /* global iommu list, set NULL for ignored DMAR units */
176 static struct intel_iommu **g_iommus;
178 static void __init check_tylersburg_isoch(void);
179 static int rwbf_quirk;
182 * set to 1 to panic the kernel if VT-d cannot be enabled successfully
183 * (used when the kernel is launched with TXT)
185 static int force_on = 0;
186 int intel_iommu_tboot_noforce;
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
191 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
194 static phys_addr_t root_entry_lctp(struct root_entry *re)
199 return re->lo & VTD_PAGE_MASK;
203 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
206 static phys_addr_t root_entry_uctp(struct root_entry *re)
211 return re->hi & VTD_PAGE_MASK;
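/*
 * Layout note (a sketch based on the accessors above and on
 * free_context_table() below): in legacy mode a root entry's lo field
 * points at one context table covering all 256 devfns of the bus, while
 * in scalable mode the lower pointer covers devfn 0x00-0x7f and the
 * upper pointer covers devfn 0x80-0xff.
 */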
214 static inline void context_clear_pasid_enable(struct context_entry *context)
216 context->lo &= ~(1ULL << 11);
219 static inline bool context_pasid_enabled(struct context_entry *context)
221 return !!(context->lo & (1ULL << 11));
224 static inline void context_set_copied(struct context_entry *context)
226 context->hi |= (1ull << 3);
229 static inline bool context_copied(struct context_entry *context)
231 return !!(context->hi & (1ULL << 3));
234 static inline bool __context_present(struct context_entry *context)
236 return (context->lo & 1);
239 bool context_present(struct context_entry *context)
241 return context_pasid_enabled(context) ?
242 __context_present(context) :
243 __context_present(context) && !context_copied(context);
246 static inline void context_set_present(struct context_entry *context)
251 static inline void context_set_fault_enable(struct context_entry *context)
253 context->lo &= (((u64)-1) << 2) | 1;
256 static inline void context_set_translation_type(struct context_entry *context,
259 context->lo &= (((u64)-1) << 4) | 3;
260 context->lo |= (value & 3) << 2;
263 static inline void context_set_address_root(struct context_entry *context,
266 context->lo &= ~VTD_PAGE_MASK;
267 context->lo |= value & VTD_PAGE_MASK;
270 static inline void context_set_address_width(struct context_entry *context,
273 context->hi |= value & 7;
276 static inline void context_set_domain_id(struct context_entry *context,
279 context->hi |= (value & ((1 << 16) - 1)) << 8;
282 static inline int context_domain_id(struct context_entry *c)
284 return((c->hi >> 8) & 0xffff);
287 static inline void context_clear_entry(struct context_entry *context)
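/*
 * Bit layout sketch, derived purely from the accessors above (see the
 * VT-d specification for the authoritative definition):
 *	lo[0]		present
 *	lo[1]		fault processing disable
 *	lo[3:2]		translation type
 *	lo[11]		PASID enable (as tested by context_pasid_enabled())
 *	lo[63:12]	page-table root address set by context_set_address_root()
 *	hi[2:0]		address width (AGAW)
 *	hi[3]		"copied" software flag used for the kdump handover
 *	hi[23:8]	domain identifier
 */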
294 * This domain is a static identity mapping domain.
295 * 1. This domain creates a static 1:1 mapping to all usable memory.
296 * 2. It maps to each iommu if successful.
297 * 3. Each iommu maps to this domain if successful.
299 static struct dmar_domain *si_domain;
300 static int hw_pass_through = 1;
303 * A domain represents a virtual machine; more than one device
304 * across iommus may be owned by one domain, e.g. a kvm guest.
306 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 0)
308 /* si_domain contains multiple devices */
309 #define DOMAIN_FLAG_STATIC_IDENTITY (1 << 1)
311 #define for_each_domain_iommu(idx, domain) \
312 for (idx = 0; idx < g_num_of_iommus; idx++) \
313 if (domain->iommu_refcnt[idx])
315 struct dmar_rmrr_unit {
316 struct list_head list; /* list of rmrr units */
317 struct acpi_dmar_header *hdr; /* ACPI header */
318 u64 base_address; /* reserved base address*/
319 u64 end_address; /* reserved end address */
320 struct dmar_dev_scope *devices; /* target devices */
321 int devices_cnt; /* target device count */
322 struct iommu_resv_region *resv; /* reserved region handle */
325 struct dmar_atsr_unit {
326 struct list_head list; /* list of ATSR units */
327 struct acpi_dmar_header *hdr; /* ACPI header */
328 struct dmar_dev_scope *devices; /* target devices */
329 int devices_cnt; /* target device count */
330 u8 include_all:1; /* include all ports */
333 static LIST_HEAD(dmar_atsr_units);
334 static LIST_HEAD(dmar_rmrr_units);
336 #define for_each_rmrr_units(rmrr) \
337 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
339 /* number of IOMMUs, used to size per-IOMMU arrays such as g_iommus */
340 static int g_num_of_iommus;
342 static void domain_exit(struct dmar_domain *domain);
343 static void domain_remove_dev_info(struct dmar_domain *domain);
344 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
346 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
347 static void domain_context_clear(struct intel_iommu *iommu,
349 static int domain_detach_iommu(struct dmar_domain *domain,
350 struct intel_iommu *iommu);
352 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
353 int dmar_disabled = 0;
355 int dmar_disabled = 1;
356 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
358 int intel_iommu_enabled = 0;
359 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
361 static int dmar_map_gfx = 1;
362 static int dmar_forcedac;
363 static int intel_iommu_strict;
364 static int intel_iommu_superpage = 1;
365 static int intel_iommu_sm = 1;
366 static int iommu_identity_mapping;
368 #define IDENTMAP_ALL 1
369 #define IDENTMAP_GFX 2
370 #define IDENTMAP_AZALIA 4
372 #define sm_supported(iommu) (intel_iommu_sm && ecap_smts((iommu)->ecap))
373 #define pasid_supported(iommu) (sm_supported(iommu) && \
374 ecap_pasid((iommu)->ecap))
376 int intel_iommu_gfx_mapped;
377 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
379 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
380 static DEFINE_SPINLOCK(device_domain_lock);
381 static LIST_HEAD(device_domain_list);
384 * Iterate over elements in device_domain_list and call the specified
385 * callback @fn against each element.
387 int for_each_device_domain(int (*fn)(struct device_domain_info *info,
388 void *data), void *data)
392 struct device_domain_info *info;
394 spin_lock_irqsave(&device_domain_lock, flags);
395 list_for_each_entry(info, &device_domain_list, global) {
396 ret = fn(info, data);
398 spin_unlock_irqrestore(&device_domain_lock, flags);
402 spin_unlock_irqrestore(&device_domain_lock, flags);
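/*
 * Usage sketch for the iterator above (hypothetical helper, not part of
 * the driver): a non-zero return from the callback stops the walk early.
 */
static int __maybe_unused count_device_domain_entries(struct device_domain_info *info,
						      void *data)
{
	(*(int *)data)++;	/* count every tracked device */
	return 0;		/* keep iterating */
}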
407 const struct iommu_ops intel_iommu_ops;
409 static bool translation_pre_enabled(struct intel_iommu *iommu)
411 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
414 static void clear_translation_pre_enabled(struct intel_iommu *iommu)
416 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
419 static void init_translation_status(struct intel_iommu *iommu)
423 gsts = readl(iommu->reg + DMAR_GSTS_REG);
424 if (gsts & DMA_GSTS_TES)
425 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
428 /* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
429 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
431 return container_of(dom, struct dmar_domain, domain);
434 static int __init intel_iommu_setup(char *str)
439 if (!strncmp(str, "on", 2)) {
441 pr_info("IOMMU enabled\n");
442 } else if (!strncmp(str, "off", 3)) {
444 pr_info("IOMMU disabled\n");
445 } else if (!strncmp(str, "igfx_off", 8)) {
447 pr_info("Disable GFX device mapping\n");
448 } else if (!strncmp(str, "forcedac", 8)) {
449 pr_info("Forcing DAC for PCI devices\n");
451 } else if (!strncmp(str, "strict", 6)) {
452 pr_info("Disable batched IOTLB flush\n");
453 intel_iommu_strict = 1;
454 } else if (!strncmp(str, "sp_off", 6)) {
455 pr_info("Disable supported super page\n");
456 intel_iommu_superpage = 0;
457 } else if (!strncmp(str, "sm_off", 6)) {
458 pr_info("Intel-IOMMU: disable scalable mode support\n");
460 } else if (!strncmp(str, "tboot_noforce", 13)) {
462 "Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
463 intel_iommu_tboot_noforce = 1;
466 str += strcspn(str, ",");
472 __setup("intel_iommu=", intel_iommu_setup);
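/*
 * Example kernel command line (illustrative): options can be combined
 * with commas, e.g.
 *	intel_iommu=on,strict,sp_off
 * enables the IOMMU, disables batched IOTLB flushing and disables
 * super-page support; the strcspn() step above advances the parser to
 * the next comma-separated token.
 */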
474 static struct kmem_cache *iommu_domain_cache;
475 static struct kmem_cache *iommu_devinfo_cache;
477 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
479 struct dmar_domain **domains;
482 domains = iommu->domains[idx];
486 return domains[did & 0xff];
489 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
490 struct dmar_domain *domain)
492 struct dmar_domain **domains;
495 if (!iommu->domains[idx]) {
496 size_t size = 256 * sizeof(struct dmar_domain *);
497 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
500 domains = iommu->domains[idx];
501 if (WARN_ON(!domains))
504 domains[did & 0xff] = domain;
507 void *alloc_pgtable_page(int node)
512 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
514 vaddr = page_address(page);
518 void free_pgtable_page(void *vaddr)
520 free_page((unsigned long)vaddr);
523 static inline void *alloc_domain_mem(void)
525 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
528 static void free_domain_mem(void *vaddr)
530 kmem_cache_free(iommu_domain_cache, vaddr);
533 static inline void * alloc_devinfo_mem(void)
535 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
538 static inline void free_devinfo_mem(void *vaddr)
540 kmem_cache_free(iommu_devinfo_cache, vaddr);
543 static inline int domain_type_is_vm(struct dmar_domain *domain)
545 return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
548 static inline int domain_type_is_si(struct dmar_domain *domain)
550 return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
553 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
555 return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
556 DOMAIN_FLAG_STATIC_IDENTITY);
559 static inline int domain_pfn_supported(struct dmar_domain *domain,
562 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
564 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
567 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
572 sagaw = cap_sagaw(iommu->cap);
573 for (agaw = width_to_agaw(max_gaw);
575 if (test_bit(agaw, &sagaw))
583 * Calculate max SAGAW for each iommu.
585 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
587 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
591 * Calculate the agaw for each iommu.
592 * "SAGAW" may differ across iommus, so use a default agaw and fall back
593 * to a smaller supported agaw for iommus that don't support the default.
595 int iommu_calculate_agaw(struct intel_iommu *iommu)
597 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
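/*
 * Worked example (illustrative): with DEFAULT_DOMAIN_ADDRESS_WIDTH == 57
 * the search starts at width_to_agaw(57) == 3 and walks downwards through
 * the bits of cap_sagaw() until one the hardware reports is found, so an
 * IOMMU that only supports 48-bit (four-level) tables ends up with
 * agaw == 2.
 */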
600 /* This function only returns a single iommu for a domain */
601 struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
605 /* si_domain and vm domain should not get here. */
606 BUG_ON(domain_type_is_vm_or_si(domain));
607 for_each_domain_iommu(iommu_id, domain)
610 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
613 return g_iommus[iommu_id];
616 static void domain_update_iommu_coherency(struct dmar_domain *domain)
618 struct dmar_drhd_unit *drhd;
619 struct intel_iommu *iommu;
623 domain->iommu_coherency = 1;
625 for_each_domain_iommu(i, domain) {
627 if (!ecap_coherent(g_iommus[i]->ecap)) {
628 domain->iommu_coherency = 0;
635 /* No hardware attached; use lowest common denominator */
637 for_each_active_iommu(iommu, drhd) {
638 if (!ecap_coherent(iommu->ecap)) {
639 domain->iommu_coherency = 0;
646 static int domain_update_iommu_snooping(struct intel_iommu *skip)
648 struct dmar_drhd_unit *drhd;
649 struct intel_iommu *iommu;
653 for_each_active_iommu(iommu, drhd) {
655 if (!ecap_sc_support(iommu->ecap)) {
666 static int domain_update_iommu_superpage(struct intel_iommu *skip)
668 struct dmar_drhd_unit *drhd;
669 struct intel_iommu *iommu;
672 if (!intel_iommu_superpage) {
676 /* set iommu_superpage to the smallest common denominator */
678 for_each_active_iommu(iommu, drhd) {
680 mask &= cap_super_page_val(iommu->cap);
690 /* Some capabilities may be different across iommus */
691 static void domain_update_iommu_cap(struct dmar_domain *domain)
693 domain_update_iommu_coherency(domain);
694 domain->iommu_snooping = domain_update_iommu_snooping(NULL);
695 domain->iommu_superpage = domain_update_iommu_superpage(NULL);
698 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
701 struct root_entry *root = &iommu->root_entry[bus];
702 struct context_entry *context;
706 if (sm_supported(iommu)) {
714 context = phys_to_virt(*entry & VTD_PAGE_MASK);
716 unsigned long phy_addr;
720 context = alloc_pgtable_page(iommu->node);
724 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
725 phy_addr = virt_to_phys((void *)context);
726 *entry = phy_addr | 1;
727 __iommu_flush_cache(iommu, entry, sizeof(*entry));
729 return &context[devfn];
732 static int iommu_dummy(struct device *dev)
734 return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
737 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
739 struct dmar_drhd_unit *drhd = NULL;
740 struct intel_iommu *iommu;
742 struct pci_dev *ptmp, *pdev = NULL;
746 if (iommu_dummy(dev))
749 if (dev_is_pci(dev)) {
750 struct pci_dev *pf_pdev;
752 pdev = to_pci_dev(dev);
755 /* VMD child devices currently cannot be handled individually */
756 if (is_vmd(pdev->bus))
760 /* VFs aren't listed in scope tables; we need to look up
761 * the PF instead to find the IOMMU. */
762 pf_pdev = pci_physfn(pdev);
764 segment = pci_domain_nr(pdev->bus);
765 } else if (has_acpi_companion(dev))
766 dev = &ACPI_COMPANION(dev)->dev;
769 for_each_active_iommu(iommu, drhd) {
770 if (pdev && segment != drhd->segment)
773 for_each_active_dev_scope(drhd->devices,
774 drhd->devices_cnt, i, tmp) {
776 /* For a VF use its original BDF# not that of the PF
777 * which we used for the IOMMU lookup. Strictly speaking
778 * we could do this for all PCI devices; we only need to
779 * get the BDF# from the scope table for ACPI matches. */
780 if (pdev && pdev->is_virtfn)
783 *bus = drhd->devices[i].bus;
784 *devfn = drhd->devices[i].devfn;
788 if (!pdev || !dev_is_pci(tmp))
791 ptmp = to_pci_dev(tmp);
792 if (ptmp->subordinate &&
793 ptmp->subordinate->number <= pdev->bus->number &&
794 ptmp->subordinate->busn_res.end >= pdev->bus->number)
798 if (pdev && drhd->include_all) {
800 *bus = pdev->bus->number;
801 *devfn = pdev->devfn;
812 static void domain_flush_cache(struct dmar_domain *domain,
813 void *addr, int size)
815 if (!domain->iommu_coherency)
816 clflush_cache_range(addr, size);
819 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
821 struct context_entry *context;
825 spin_lock_irqsave(&iommu->lock, flags);
826 context = iommu_context_addr(iommu, bus, devfn, 0);
828 ret = context_present(context);
829 spin_unlock_irqrestore(&iommu->lock, flags);
833 static void free_context_table(struct intel_iommu *iommu)
837 struct context_entry *context;
839 spin_lock_irqsave(&iommu->lock, flags);
840 if (!iommu->root_entry) {
843 for (i = 0; i < ROOT_ENTRY_NR; i++) {
844 context = iommu_context_addr(iommu, i, 0, 0);
846 free_pgtable_page(context);
848 if (!sm_supported(iommu))
851 context = iommu_context_addr(iommu, i, 0x80, 0);
853 free_pgtable_page(context);
856 free_pgtable_page(iommu->root_entry);
857 iommu->root_entry = NULL;
859 spin_unlock_irqrestore(&iommu->lock, flags);
862 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
863 unsigned long pfn, int *target_level)
865 struct dma_pte *parent, *pte = NULL;
866 int level = agaw_to_level(domain->agaw);
869 BUG_ON(!domain->pgd);
871 if (!domain_pfn_supported(domain, pfn))
872 /* Address beyond IOMMU's addressing capabilities. */
875 parent = domain->pgd;
880 offset = pfn_level_offset(pfn, level);
881 pte = &parent[offset];
882 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
884 if (level == *target_level)
887 if (!dma_pte_present(pte)) {
890 tmp_page = alloc_pgtable_page(domain->nid);
895 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
896 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
897 if (cmpxchg64(&pte->val, 0ULL, pteval))
898 /* Someone else set it while we were thinking; use theirs. */
899 free_pgtable_page(tmp_page);
901 domain_flush_cache(domain, pte, sizeof(*pte));
906 parent = phys_to_virt(dma_pte_addr(pte));
911 *target_level = level;
917 /* return the address's pte at a specific level */
918 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
920 int level, int *large_page)
922 struct dma_pte *parent, *pte = NULL;
923 int total = agaw_to_level(domain->agaw);
926 parent = domain->pgd;
927 while (level <= total) {
928 offset = pfn_level_offset(pfn, total);
929 pte = &parent[offset];
933 if (!dma_pte_present(pte)) {
938 if (dma_pte_superpage(pte)) {
943 parent = phys_to_virt(dma_pte_addr(pte));
949 /* clear last-level ptes; a TLB flush should follow */
950 static void dma_pte_clear_range(struct dmar_domain *domain,
951 unsigned long start_pfn,
952 unsigned long last_pfn)
954 unsigned int large_page = 1;
955 struct dma_pte *first_pte, *pte;
957 BUG_ON(!domain_pfn_supported(domain, start_pfn));
958 BUG_ON(!domain_pfn_supported(domain, last_pfn));
959 BUG_ON(start_pfn > last_pfn);
961 /* we don't need lock here; nobody else touches the iova range */
964 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
966 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
971 start_pfn += lvl_to_nr_pages(large_page);
973 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
975 domain_flush_cache(domain, first_pte,
976 (void *)pte - (void *)first_pte);
978 } while (start_pfn && start_pfn <= last_pfn);
981 static void dma_pte_free_level(struct dmar_domain *domain, int level,
982 int retain_level, struct dma_pte *pte,
983 unsigned long pfn, unsigned long start_pfn,
984 unsigned long last_pfn)
986 pfn = max(start_pfn, pfn);
987 pte = &pte[pfn_level_offset(pfn, level)];
990 unsigned long level_pfn;
991 struct dma_pte *level_pte;
993 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
996 level_pfn = pfn & level_mask(level);
997 level_pte = phys_to_virt(dma_pte_addr(pte));
1000 dma_pte_free_level(domain, level - 1, retain_level,
1001 level_pte, level_pfn, start_pfn,
1006 * Free the page table if we're below the level we want to
1007 * retain and the range covers the entire table.
1009 if (level < retain_level && !(start_pfn > level_pfn ||
1010 last_pfn < level_pfn + level_size(level) - 1)) {
1012 domain_flush_cache(domain, pte, sizeof(*pte));
1013 free_pgtable_page(level_pte);
1016 pfn += level_size(level);
1017 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1021 * clear last level (leaf) ptes and free page table pages below the
1022 * level we wish to keep intact.
1024 static void dma_pte_free_pagetable(struct dmar_domain *domain,
1025 unsigned long start_pfn,
1026 unsigned long last_pfn,
1029 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1030 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1031 BUG_ON(start_pfn > last_pfn);
1033 dma_pte_clear_range(domain, start_pfn, last_pfn);
1035 /* We don't need lock here; nobody else touches the iova range */
1036 dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1037 domain->pgd, 0, start_pfn, last_pfn);
1040 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1041 free_pgtable_page(domain->pgd);
1046 /* When a page at a given level is being unlinked from its parent, we don't
1047 need to *modify* it at all. All we need to do is make a list of all the
1048 pages which can be freed just as soon as we've flushed the IOTLB and we
1049 know the hardware page-walk will no longer touch them.
1050 The 'pte' argument is the *parent* PTE, pointing to the page that is to
1052 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1053 int level, struct dma_pte *pte,
1054 struct page *freelist)
1058 pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1059 pg->freelist = freelist;
1065 pte = page_address(pg);
1067 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1068 freelist = dma_pte_list_pagetables(domain, level - 1,
1071 } while (!first_pte_in_page(pte));
1076 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1077 struct dma_pte *pte, unsigned long pfn,
1078 unsigned long start_pfn,
1079 unsigned long last_pfn,
1080 struct page *freelist)
1082 struct dma_pte *first_pte = NULL, *last_pte = NULL;
1084 pfn = max(start_pfn, pfn);
1085 pte = &pte[pfn_level_offset(pfn, level)];
1088 unsigned long level_pfn;
1090 if (!dma_pte_present(pte))
1093 level_pfn = pfn & level_mask(level);
1095 /* If range covers entire pagetable, free it */
1096 if (start_pfn <= level_pfn &&
1097 last_pfn >= level_pfn + level_size(level) - 1) {
1098 /* These subordinate page tables are going away entirely. Don't
1099 bother to clear them; we're just going to *free* them. */
1100 if (level > 1 && !dma_pte_superpage(pte))
1101 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1107 } else if (level > 1) {
1108 /* Recurse down into a level that isn't *entirely* obsolete */
1109 freelist = dma_pte_clear_level(domain, level - 1,
1110 phys_to_virt(dma_pte_addr(pte)),
1111 level_pfn, start_pfn, last_pfn,
1115 pfn += level_size(level);
1116 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1119 domain_flush_cache(domain, first_pte,
1120 (void *)++last_pte - (void *)first_pte);
1125 /* We can't just free the pages because the IOMMU may still be walking
1126 the page tables, and may have cached the intermediate levels. The
1127 pages can only be freed after the IOTLB flush has been done. */
1128 static struct page *domain_unmap(struct dmar_domain *domain,
1129 unsigned long start_pfn,
1130 unsigned long last_pfn)
1132 struct page *freelist = NULL;
1134 BUG_ON(!domain_pfn_supported(domain, start_pfn));
1135 BUG_ON(!domain_pfn_supported(domain, last_pfn));
1136 BUG_ON(start_pfn > last_pfn);
1138 /* we don't need lock here; nobody else touches the iova range */
1139 freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1140 domain->pgd, 0, start_pfn, last_pfn, NULL);
1143 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1144 struct page *pgd_page = virt_to_page(domain->pgd);
1145 pgd_page->freelist = freelist;
1146 freelist = pgd_page;
1154 static void dma_free_pagelist(struct page *freelist)
1158 while ((pg = freelist)) {
1159 freelist = pg->freelist;
1160 free_pgtable_page(page_address(pg));
1164 static void iova_entry_free(unsigned long data)
1166 struct page *freelist = (struct page *)data;
1168 dma_free_pagelist(freelist);
1171 /* iommu handling */
1172 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1174 struct root_entry *root;
1175 unsigned long flags;
1177 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1179 pr_err("Allocating root entry for %s failed\n",
1184 __iommu_flush_cache(iommu, root, ROOT_SIZE);
1186 spin_lock_irqsave(&iommu->lock, flags);
1187 iommu->root_entry = root;
1188 spin_unlock_irqrestore(&iommu->lock, flags);
1193 static void iommu_set_root_entry(struct intel_iommu *iommu)
1199 addr = virt_to_phys(iommu->root_entry);
1201 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1202 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1204 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1206 /* Make sure hardware complete it */
1207 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1208 readl, (sts & DMA_GSTS_RTPS), sts);
1210 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1213 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1218 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1221 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1224 /* Make sure hardware complete it */
1225 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1226 readl, (!(val & DMA_GSTS_WBFS)), val);
1228 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1231 /* return value determines if we need a write buffer flush */
1232 static void __iommu_flush_context(struct intel_iommu *iommu,
1233 u16 did, u16 source_id, u8 function_mask,
1240 case DMA_CCMD_GLOBAL_INVL:
1241 val = DMA_CCMD_GLOBAL_INVL;
1243 case DMA_CCMD_DOMAIN_INVL:
1244 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1246 case DMA_CCMD_DEVICE_INVL:
1247 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1248 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1253 val |= DMA_CCMD_ICC;
1255 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1256 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1258 /* Make sure hardware complete it */
1259 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1260 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1262 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1265 /* return value determines if we need a write buffer flush */
1266 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1267 u64 addr, unsigned int size_order, u64 type)
1269 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1270 u64 val = 0, val_iva = 0;
1274 case DMA_TLB_GLOBAL_FLUSH:
1275 /* a global flush doesn't need to set IVA_REG */
1276 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1278 case DMA_TLB_DSI_FLUSH:
1279 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1281 case DMA_TLB_PSI_FLUSH:
1282 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1283 /* IH bit is passed in as part of address */
1284 val_iva = size_order | addr;
1289 /* Note: set drain read/write */
1292 * This is probably only done to be extra safe; it looks like we can
1293 * ignore it without any impact.
1295 if (cap_read_drain(iommu->cap))
1296 val |= DMA_TLB_READ_DRAIN;
1298 if (cap_write_drain(iommu->cap))
1299 val |= DMA_TLB_WRITE_DRAIN;
1301 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1302 /* Note: Only uses first TLB reg currently */
1304 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1305 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1307 /* Make sure hardware complete it */
1308 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1309 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1311 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1313 /* check IOTLB invalidation granularity */
1314 if (DMA_TLB_IAIG(val) == 0)
1315 pr_err("Flush IOTLB failed\n");
1316 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1317 pr_debug("TLB flush request %Lx, actual %Lx\n",
1318 (unsigned long long)DMA_TLB_IIRG(type),
1319 (unsigned long long)DMA_TLB_IAIG(val));
1322 static struct device_domain_info *
1323 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1326 struct device_domain_info *info;
1328 assert_spin_locked(&device_domain_lock);
1333 list_for_each_entry(info, &domain->devices, link)
1334 if (info->iommu == iommu && info->bus == bus &&
1335 info->devfn == devfn) {
1336 if (info->ats_supported && info->dev)
1344 static void domain_update_iotlb(struct dmar_domain *domain)
1346 struct device_domain_info *info;
1347 bool has_iotlb_device = false;
1349 assert_spin_locked(&device_domain_lock);
1351 list_for_each_entry(info, &domain->devices, link) {
1352 struct pci_dev *pdev;
1354 if (!info->dev || !dev_is_pci(info->dev))
1357 pdev = to_pci_dev(info->dev);
1358 if (pdev->ats_enabled) {
1359 has_iotlb_device = true;
1364 domain->has_iotlb_device = has_iotlb_device;
1367 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1369 struct pci_dev *pdev;
1371 assert_spin_locked(&device_domain_lock);
1373 if (!info || !dev_is_pci(info->dev))
1376 pdev = to_pci_dev(info->dev);
1377 /* For IOMMUs that support device IOTLB throttling (DIT), we assign
1378 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1379 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1380 * reserved, which should be set to 0.
1382 if (!ecap_dit(info->iommu->ecap))
1385 struct pci_dev *pf_pdev;
1387 /* pdev will be returned if device is not a vf */
1388 pf_pdev = pci_physfn(pdev);
1389 info->pfsid = PCI_DEVID(pf_pdev->bus->number, pf_pdev->devfn);
1392 #ifdef CONFIG_INTEL_IOMMU_SVM
1393 /* The PCIe spec, in its wisdom, declares that the behaviour of
1394 the device if you enable PASID support after ATS support is
1395 undefined. So always enable PASID support on devices which
1396 have it, even if we can't yet know if we're ever going to use it. */
1398 if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1399 info->pasid_enabled = 1;
1401 if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
1402 info->pri_enabled = 1;
1404 if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1405 info->ats_enabled = 1;
1406 domain_update_iotlb(info->domain);
1407 info->ats_qdep = pci_ats_queue_depth(pdev);
1411 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1413 struct pci_dev *pdev;
1415 assert_spin_locked(&device_domain_lock);
1417 if (!dev_is_pci(info->dev))
1420 pdev = to_pci_dev(info->dev);
1422 if (info->ats_enabled) {
1423 pci_disable_ats(pdev);
1424 info->ats_enabled = 0;
1425 domain_update_iotlb(info->domain);
1427 #ifdef CONFIG_INTEL_IOMMU_SVM
1428 if (info->pri_enabled) {
1429 pci_disable_pri(pdev);
1430 info->pri_enabled = 0;
1432 if (info->pasid_enabled) {
1433 pci_disable_pasid(pdev);
1434 info->pasid_enabled = 0;
1439 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1440 u64 addr, unsigned mask)
1443 unsigned long flags;
1444 struct device_domain_info *info;
1446 if (!domain->has_iotlb_device)
1449 spin_lock_irqsave(&device_domain_lock, flags);
1450 list_for_each_entry(info, &domain->devices, link) {
1451 if (!info->ats_enabled)
1454 sid = info->bus << 8 | info->devfn;
1455 qdep = info->ats_qdep;
1456 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1459 spin_unlock_irqrestore(&device_domain_lock, flags);
1462 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
1463 struct dmar_domain *domain,
1464 unsigned long pfn, unsigned int pages,
1467 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1468 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1469 u16 did = domain->iommu_did[iommu->seq_id];
1476 * Fall back to a domain-selective flush if there is no PSI support or
1478 * the size is too big. PSI requires the page count to be a power of two
1479 * and the base address to be naturally aligned to the size.
1481 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1482 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1485 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1489 * In caching mode, changes of pages from non-present to present require
1490 * a flush. However, the device IOTLB doesn't need to be flushed in this case.
1492 if (!cap_caching_mode(iommu->cap) || !map)
1493 iommu_flush_dev_iotlb(domain, addr, mask);
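/*
 * Worked example for the mask computation above (illustrative): flushing
 * 5 pages gives mask == ilog2(__roundup_pow_of_two(5)) == 3, i.e. the PSI
 * covers an 8-page (32KiB) naturally aligned region, which is why a mask
 * larger than cap_max_amask_val() falls back to a domain-selective flush.
 */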
1496 /* Notification for newly created mappings */
1497 static inline void __mapping_notify_one(struct intel_iommu *iommu,
1498 struct dmar_domain *domain,
1499 unsigned long pfn, unsigned int pages)
1501 /* It's a non-present to present mapping. Only flush if caching mode */
1502 if (cap_caching_mode(iommu->cap))
1503 iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1505 iommu_flush_write_buffer(iommu);
1508 static void iommu_flush_iova(struct iova_domain *iovad)
1510 struct dmar_domain *domain;
1513 domain = container_of(iovad, struct dmar_domain, iovad);
1515 for_each_domain_iommu(idx, domain) {
1516 struct intel_iommu *iommu = g_iommus[idx];
1517 u16 did = domain->iommu_did[iommu->seq_id];
1519 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1521 if (!cap_caching_mode(iommu->cap))
1522 iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
1523 0, MAX_AGAW_PFN_WIDTH);
1527 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1530 unsigned long flags;
1532 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1533 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1534 pmen &= ~DMA_PMEN_EPM;
1535 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1537 /* wait for the protected region status bit to clear */
1538 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1539 readl, !(pmen & DMA_PMEN_PRS), pmen);
1541 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1544 static void iommu_enable_translation(struct intel_iommu *iommu)
1547 unsigned long flags;
1549 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1550 iommu->gcmd |= DMA_GCMD_TE;
1551 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1553 /* Make sure hardware complete it */
1554 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1555 readl, (sts & DMA_GSTS_TES), sts);
1557 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1560 static void iommu_disable_translation(struct intel_iommu *iommu)
1565 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1566 iommu->gcmd &= ~DMA_GCMD_TE;
1567 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1569 /* Make sure hardware complete it */
1570 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1571 readl, (!(sts & DMA_GSTS_TES)), sts);
1573 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1577 static int iommu_init_domains(struct intel_iommu *iommu)
1579 u32 ndomains, nlongs;
1582 ndomains = cap_ndoms(iommu->cap);
1583 pr_debug("%s: Number of Domains supported <%d>\n",
1584 iommu->name, ndomains);
1585 nlongs = BITS_TO_LONGS(ndomains);
1587 spin_lock_init(&iommu->lock);
1589 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1590 if (!iommu->domain_ids) {
1591 pr_err("%s: Allocating domain id array failed\n",
1596 size = (ALIGN(ndomains, 256) >> 8) * sizeof(struct dmar_domain **);
1597 iommu->domains = kzalloc(size, GFP_KERNEL);
1599 if (iommu->domains) {
1600 size = 256 * sizeof(struct dmar_domain *);
1601 iommu->domains[0] = kzalloc(size, GFP_KERNEL);
1604 if (!iommu->domains || !iommu->domains[0]) {
1605 pr_err("%s: Allocating domain array failed\n",
1607 kfree(iommu->domain_ids);
1608 kfree(iommu->domains);
1609 iommu->domain_ids = NULL;
1610 iommu->domains = NULL;
1617 * If Caching mode is set, then invalid translations are tagged
1618 * with domain-id 0, hence we need to pre-allocate it. We also
1619 * use domain-id 0 as a marker for non-allocated domain-id, so
1620 * make sure it is not used for a real domain.
1622 set_bit(0, iommu->domain_ids);
1625 * Vt-d spec rev3.0 (section 6.2.3.1) requires that each pasid
1626 * entry for first-level or pass-through translation modes should
1627 * be programmed with a domain id different from those used for
1628 * second-level or nested translation. We reserve a domain id for
1631 if (sm_supported(iommu))
1632 set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1637 static void disable_dmar_iommu(struct intel_iommu *iommu)
1639 struct device_domain_info *info, *tmp;
1640 unsigned long flags;
1642 if (!iommu->domains || !iommu->domain_ids)
1646 spin_lock_irqsave(&device_domain_lock, flags);
1647 list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
1648 struct dmar_domain *domain;
1650 if (info->iommu != iommu)
1653 if (!info->dev || !info->domain)
1656 domain = info->domain;
1658 __dmar_remove_one_dev_info(info);
1660 if (!domain_type_is_vm_or_si(domain)) {
1662 * The domain_exit() function can't be called under
1663 * device_domain_lock, as it takes this lock itself.
1664 * So release the lock here and re-run the loop
1667 spin_unlock_irqrestore(&device_domain_lock, flags);
1668 domain_exit(domain);
1672 spin_unlock_irqrestore(&device_domain_lock, flags);
1674 if (iommu->gcmd & DMA_GCMD_TE)
1675 iommu_disable_translation(iommu);
1678 static void free_dmar_iommu(struct intel_iommu *iommu)
1680 if ((iommu->domains) && (iommu->domain_ids)) {
1681 int elems = ALIGN(cap_ndoms(iommu->cap), 256) >> 8;
1684 for (i = 0; i < elems; i++)
1685 kfree(iommu->domains[i]);
1686 kfree(iommu->domains);
1687 kfree(iommu->domain_ids);
1688 iommu->domains = NULL;
1689 iommu->domain_ids = NULL;
1692 g_iommus[iommu->seq_id] = NULL;
1694 /* free context mapping */
1695 free_context_table(iommu);
1697 #ifdef CONFIG_INTEL_IOMMU_SVM
1698 if (pasid_supported(iommu)) {
1699 if (ecap_prs(iommu->ecap))
1700 intel_svm_finish_prq(iommu);
1701 intel_svm_exit(iommu);
1706 static struct dmar_domain *alloc_domain(int flags)
1708 struct dmar_domain *domain;
1710 domain = alloc_domain_mem();
1714 memset(domain, 0, sizeof(*domain));
1716 domain->flags = flags;
1717 domain->has_iotlb_device = false;
1718 INIT_LIST_HEAD(&domain->devices);
1723 /* Must be called with iommu->lock held */
1724 static int domain_attach_iommu(struct dmar_domain *domain,
1725 struct intel_iommu *iommu)
1727 unsigned long ndomains;
1730 assert_spin_locked(&device_domain_lock);
1731 assert_spin_locked(&iommu->lock);
1733 domain->iommu_refcnt[iommu->seq_id] += 1;
1734 domain->iommu_count += 1;
1735 if (domain->iommu_refcnt[iommu->seq_id] == 1) {
1736 ndomains = cap_ndoms(iommu->cap);
1737 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1739 if (num >= ndomains) {
1740 pr_err("%s: No free domain ids\n", iommu->name);
1741 domain->iommu_refcnt[iommu->seq_id] -= 1;
1742 domain->iommu_count -= 1;
1746 set_bit(num, iommu->domain_ids);
1747 set_iommu_domain(iommu, num, domain);
1749 domain->iommu_did[iommu->seq_id] = num;
1750 domain->nid = iommu->node;
1752 domain_update_iommu_cap(domain);
1758 static int domain_detach_iommu(struct dmar_domain *domain,
1759 struct intel_iommu *iommu)
1761 int num, count = INT_MAX;
1763 assert_spin_locked(&device_domain_lock);
1764 assert_spin_locked(&iommu->lock);
1766 domain->iommu_refcnt[iommu->seq_id] -= 1;
1767 count = --domain->iommu_count;
1768 if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1769 num = domain->iommu_did[iommu->seq_id];
1770 clear_bit(num, iommu->domain_ids);
1771 set_iommu_domain(iommu, num, NULL);
1773 domain_update_iommu_cap(domain);
1774 domain->iommu_did[iommu->seq_id] = 0;
1780 static struct iova_domain reserved_iova_list;
1781 static struct lock_class_key reserved_rbtree_key;
1783 static int dmar_init_reserved_ranges(void)
1785 struct pci_dev *pdev = NULL;
1789 init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN);
1791 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1792 &reserved_rbtree_key);
1794 /* IOAPIC ranges shouldn't be accessed by DMA */
1795 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1796 IOVA_PFN(IOAPIC_RANGE_END));
1798 pr_err("Reserve IOAPIC range failed\n");
1802 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1803 for_each_pci_dev(pdev) {
1806 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1807 r = &pdev->resource[i];
1808 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1810 iova = reserve_iova(&reserved_iova_list,
1814 pr_err("Reserve iova failed\n");
1822 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1824 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1827 static inline int guestwidth_to_adjustwidth(int gaw)
1830 int r = (gaw - 12) % 9;
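/*
 * Example (illustrative, assuming the usual round-up-to-a-level-boundary
 * behaviour of this helper): a 50-bit guest width has (50 - 12) % 9 == 2,
 * so it is adjusted up to 57 bits, the next width at which (gaw - 12) is
 * a whole number of 9-bit page-table levels.
 */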
1841 static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
1844 int adjust_width, agaw;
1845 unsigned long sagaw;
1848 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
1850 err = init_iova_flush_queue(&domain->iovad,
1851 iommu_flush_iova, iova_entry_free);
1855 domain_reserve_special_ranges(domain);
1857 /* calculate AGAW */
1858 if (guest_width > cap_mgaw(iommu->cap))
1859 guest_width = cap_mgaw(iommu->cap);
1860 domain->gaw = guest_width;
1861 adjust_width = guestwidth_to_adjustwidth(guest_width);
1862 agaw = width_to_agaw(adjust_width);
1863 sagaw = cap_sagaw(iommu->cap);
1864 if (!test_bit(agaw, &sagaw)) {
1865 /* hardware doesn't support it, choose a bigger one */
1866 pr_debug("Hardware doesn't support agaw %d\n", agaw);
1867 agaw = find_next_bit(&sagaw, 5, agaw);
1871 domain->agaw = agaw;
1873 if (ecap_coherent(iommu->ecap))
1874 domain->iommu_coherency = 1;
1876 domain->iommu_coherency = 0;
1878 if (ecap_sc_support(iommu->ecap))
1879 domain->iommu_snooping = 1;
1881 domain->iommu_snooping = 0;
1883 if (intel_iommu_superpage)
1884 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1886 domain->iommu_superpage = 0;
1888 domain->nid = iommu->node;
1890 /* always allocate the top pgd */
1891 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1894 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1898 static void domain_exit(struct dmar_domain *domain)
1900 struct page *freelist = NULL;
1902 /* Domain 0 is reserved, so don't process it */
1906 /* Remove associated devices and clear attached or cached domains */
1908 domain_remove_dev_info(domain);
1912 put_iova_domain(&domain->iovad);
1914 freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1916 dma_free_pagelist(freelist);
1918 free_domain_mem(domain);
1921 static int domain_context_mapping_one(struct dmar_domain *domain,
1922 struct intel_iommu *iommu,
1925 u16 did = domain->iommu_did[iommu->seq_id];
1926 int translation = CONTEXT_TT_MULTI_LEVEL;
1927 struct device_domain_info *info = NULL;
1928 struct context_entry *context;
1929 unsigned long flags;
1930 struct dma_pte *pgd;
1935 if (hw_pass_through && domain_type_is_si(domain))
1936 translation = CONTEXT_TT_PASS_THROUGH;
1938 pr_debug("Set context mapping for %02x:%02x.%d\n",
1939 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1941 BUG_ON(!domain->pgd);
1943 spin_lock_irqsave(&device_domain_lock, flags);
1944 spin_lock(&iommu->lock);
1947 context = iommu_context_addr(iommu, bus, devfn, 1);
1952 if (context_present(context))
1956 * For kdump cases, old valid entries may be cached due to the
1957 * in-flight DMA and copied pgtable, but there is no unmapping
1958 * behaviour for them, thus we need an explicit cache flush for
1959 * the newly-mapped device. For kdump, at this point, the device
1960 * is supposed to finish reset at its driver probe stage, so no
1961 * in-flight DMA will exist, and we don't need to worry anymore
1964 if (context_copied(context)) {
1965 u16 did_old = context_domain_id(context);
1967 if (did_old < cap_ndoms(iommu->cap)) {
1968 iommu->flush.flush_context(iommu, did_old,
1969 (((u16)bus) << 8) | devfn,
1970 DMA_CCMD_MASK_NOBIT,
1971 DMA_CCMD_DEVICE_INVL);
1972 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1979 context_clear_entry(context);
1980 context_set_domain_id(context, did);
1983 * Skip top levels of the page tables for an iommu whose agaw is
1984 * smaller than the default. Unnecessary for PT mode.
1986 if (translation != CONTEXT_TT_PASS_THROUGH) {
1987 for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
1989 pgd = phys_to_virt(dma_pte_addr(pgd));
1990 if (!dma_pte_present(pgd))
1994 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1995 if (info && info->ats_supported)
1996 translation = CONTEXT_TT_DEV_IOTLB;
1998 translation = CONTEXT_TT_MULTI_LEVEL;
2000 context_set_address_root(context, virt_to_phys(pgd));
2001 context_set_address_width(context, agaw);
2004 * In pass through mode, AW must be programmed to
2005 * indicate the largest AGAW value supported by
2006 * hardware. And ASR is ignored by hardware.
2008 context_set_address_width(context, iommu->msagaw);
2011 context_set_translation_type(context, translation);
2012 context_set_fault_enable(context);
2013 context_set_present(context);
2014 domain_flush_cache(domain, context, sizeof(*context));
2017 * It's a non-present to present mapping. If the hardware doesn't cache
2018 * non-present entries we only need to flush the write-buffer. If it
2019 * _does_ cache non-present entries, then it does so in the special
2020 * domain #0, which we have to flush:
2022 if (cap_caching_mode(iommu->cap)) {
2023 iommu->flush.flush_context(iommu, 0,
2024 (((u16)bus) << 8) | devfn,
2025 DMA_CCMD_MASK_NOBIT,
2026 DMA_CCMD_DEVICE_INVL);
2027 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2029 iommu_flush_write_buffer(iommu);
2031 iommu_enable_dev_iotlb(info);
2036 spin_unlock(&iommu->lock);
2037 spin_unlock_irqrestore(&device_domain_lock, flags);
2042 struct domain_context_mapping_data {
2043 struct dmar_domain *domain;
2044 struct intel_iommu *iommu;
2047 static int domain_context_mapping_cb(struct pci_dev *pdev,
2048 u16 alias, void *opaque)
2050 struct domain_context_mapping_data *data = opaque;
2052 return domain_context_mapping_one(data->domain, data->iommu,
2053 PCI_BUS_NUM(alias), alias & 0xff);
2057 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2059 struct intel_iommu *iommu;
2061 struct domain_context_mapping_data data;
2063 iommu = device_to_iommu(dev, &bus, &devfn);
2067 if (!dev_is_pci(dev))
2068 return domain_context_mapping_one(domain, iommu, bus, devfn);
2070 data.domain = domain;
2073 return pci_for_each_dma_alias(to_pci_dev(dev),
2074 &domain_context_mapping_cb, &data);
2077 static int domain_context_mapped_cb(struct pci_dev *pdev,
2078 u16 alias, void *opaque)
2080 struct intel_iommu *iommu = opaque;
2082 return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2085 static int domain_context_mapped(struct device *dev)
2087 struct intel_iommu *iommu;
2090 iommu = device_to_iommu(dev, &bus, &devfn);
2094 if (!dev_is_pci(dev))
2095 return device_context_mapped(iommu, bus, devfn);
2097 return !pci_for_each_dma_alias(to_pci_dev(dev),
2098 domain_context_mapped_cb, iommu);
2101 /* Returns a number of VTD pages, but aligned to MM page size */
2102 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2105 host_addr &= ~PAGE_MASK;
2106 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
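/*
 * Example (illustrative): aligned_nrpages(0x1ffc, 0x10) keeps only the
 * in-page offset (0xffc with 4KiB MM pages), rounds 0xffc + 0x10 up to
 * 0x2000 and converts to VT-d pages, i.e. a 16-byte buffer straddling a
 * page boundary still costs two IOVA pages.
 */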
2109 /* Return largest possible superpage level for a given mapping */
2110 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2111 unsigned long iov_pfn,
2112 unsigned long phy_pfn,
2113 unsigned long pages)
2115 int support, level = 1;
2116 unsigned long pfnmerge;
2118 support = domain->iommu_superpage;
2120 /* To use a large page, the virtual *and* physical addresses
2121 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2122 of them will mean we have to use smaller pages. So just
2123 merge them and check both at once. */
2124 pfnmerge = iov_pfn | phy_pfn;
2126 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2127 pages >>= VTD_STRIDE_SHIFT;
2130 pfnmerge >>= VTD_STRIDE_SHIFT;
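/*
 * Example (illustrative): mapping 512 contiguous 4KiB pages whose IOVA
 * and physical PFNs are both 2MiB aligned (low nine PFN bits clear) lets
 * the loop above take one VTD_STRIDE_SHIFT step, so a domain with
 * superpage support can use a single 2MiB PTE instead of 512 leaf PTEs.
 */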
2137 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2138 struct scatterlist *sg, unsigned long phys_pfn,
2139 unsigned long nr_pages, int prot)
2141 struct dma_pte *first_pte = NULL, *pte = NULL;
2142 phys_addr_t uninitialized_var(pteval);
2143 unsigned long sg_res = 0;
2144 unsigned int largepage_lvl = 0;
2145 unsigned long lvl_pages = 0;
2147 BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2149 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2152 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2156 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2159 while (nr_pages > 0) {
2163 unsigned int pgoff = sg->offset & ~PAGE_MASK;
2165 sg_res = aligned_nrpages(sg->offset, sg->length);
2166 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + pgoff;
2167 sg->dma_length = sg->length;
2168 pteval = (sg_phys(sg) - pgoff) | prot;
2169 phys_pfn = pteval >> VTD_PAGE_SHIFT;
2173 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2175 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2178 /* It is a large page */
2179 if (largepage_lvl > 1) {
2180 unsigned long nr_superpages, end_pfn;
2182 pteval |= DMA_PTE_LARGE_PAGE;
2183 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2185 nr_superpages = sg_res / lvl_pages;
2186 end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;
2189 * Ensure that old small page tables are
2190 * removed to make room for superpage(s).
2191 * We're adding new large pages, so make sure
2192 * we don't remove their parent tables.
2194 dma_pte_free_pagetable(domain, iov_pfn, end_pfn,
2197 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2201 /* We don't need lock here, nobody else
2202 * touches the iova range
2204 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2206 static int dumps = 5;
2207 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2208 iov_pfn, tmp, (unsigned long long)pteval);
2211 debug_dma_dump_mappings(NULL);
2216 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2218 BUG_ON(nr_pages < lvl_pages);
2219 BUG_ON(sg_res < lvl_pages);
2221 nr_pages -= lvl_pages;
2222 iov_pfn += lvl_pages;
2223 phys_pfn += lvl_pages;
2224 pteval += lvl_pages * VTD_PAGE_SIZE;
2225 sg_res -= lvl_pages;
2227 /* If the next PTE would be the first in a new page, then we
2228 need to flush the cache on the entries we've just written.
2229 And then we'll need to recalculate 'pte', so clear it and
2230 let it get set again in the if (!pte) block above.
2232 If we're done (!nr_pages) we need to flush the cache too.
2234 Also if we've been setting superpages, we may need to
2235 recalculate 'pte' and switch back to smaller pages for the
2236 end of the mapping, if the trailing size is not enough to
2237 use another superpage (i.e. sg_res < lvl_pages). */
2239 if (!nr_pages || first_pte_in_page(pte) ||
2240 (largepage_lvl > 1 && sg_res < lvl_pages)) {
2241 domain_flush_cache(domain, first_pte,
2242 (void *)pte - (void *)first_pte);
2246 if (!sg_res && nr_pages)
2252 static int domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2253 struct scatterlist *sg, unsigned long phys_pfn,
2254 unsigned long nr_pages, int prot)
2257 struct intel_iommu *iommu;
2259 /* Do the real mapping first */
2260 ret = __domain_mapping(domain, iov_pfn, sg, phys_pfn, nr_pages, prot);
2264 /* Notify about the new mapping */
2265 if (domain_type_is_vm(domain)) {
2266 /* VM typed domains can have more than one IOMMU */
2268 for_each_domain_iommu(iommu_id, domain) {
2269 iommu = g_iommus[iommu_id];
2270 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2273 /* General domains only have one IOMMU */
2274 iommu = domain_get_iommu(domain);
2275 __mapping_notify_one(iommu, domain, iov_pfn, nr_pages);
2281 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2282 struct scatterlist *sg, unsigned long nr_pages,
2285 return domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2288 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2289 unsigned long phys_pfn, unsigned long nr_pages,
2292 return domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2295 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2297 unsigned long flags;
2298 struct context_entry *context;
2304 spin_lock_irqsave(&iommu->lock, flags);
2305 context = iommu_context_addr(iommu, bus, devfn, 0);
2307 spin_unlock_irqrestore(&iommu->lock, flags);
2310 did_old = context_domain_id(context);
2311 context_clear_entry(context);
2312 __iommu_flush_cache(iommu, context, sizeof(*context));
2313 spin_unlock_irqrestore(&iommu->lock, flags);
2314 iommu->flush.flush_context(iommu,
2316 (((u16)bus) << 8) | devfn,
2317 DMA_CCMD_MASK_NOBIT,
2318 DMA_CCMD_DEVICE_INVL);
2319 iommu->flush.flush_iotlb(iommu,
2326 static inline void unlink_domain_info(struct device_domain_info *info)
2328 assert_spin_locked(&device_domain_lock);
2329 list_del(&info->link);
2330 list_del(&info->global);
2332 info->dev->archdata.iommu = NULL;
2335 static void domain_remove_dev_info(struct dmar_domain *domain)
2337 struct device_domain_info *info, *tmp;
2338 unsigned long flags;
2340 spin_lock_irqsave(&device_domain_lock, flags);
2341 list_for_each_entry_safe(info, tmp, &domain->devices, link)
2342 __dmar_remove_one_dev_info(info);
2343 spin_unlock_irqrestore(&device_domain_lock, flags);
2348 * Note: we use struct device->archdata.iommu to store the info
2350 static struct dmar_domain *find_domain(struct device *dev)
2352 struct device_domain_info *info;
2354 /* No lock here, assumes no domain exit in normal case */
2355 info = dev->archdata.iommu;
2357 return info->domain;
2361 static inline struct device_domain_info *
2362 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2364 struct device_domain_info *info;
2366 list_for_each_entry(info, &device_domain_list, global)
2367 if (info->iommu->segment == segment && info->bus == bus &&
2368 info->devfn == devfn)
2374 static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
2377 struct dmar_domain *domain)
2379 struct dmar_domain *found = NULL;
2380 struct device_domain_info *info;
2381 unsigned long flags;
2384 info = alloc_devinfo_mem();
2389 info->devfn = devfn;
2390 info->ats_supported = info->pasid_supported = info->pri_supported = 0;
2391 info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
2394 info->domain = domain;
2395 info->iommu = iommu;
2396 info->pasid_table = NULL;
2398 if (dev && dev_is_pci(dev)) {
2399 struct pci_dev *pdev = to_pci_dev(info->dev);
2401 if (!pci_ats_disabled() &&
2402 ecap_dev_iotlb_support(iommu->ecap) &&
2403 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
2404 dmar_find_matched_atsr_unit(pdev))
2405 info->ats_supported = 1;
2407 if (sm_supported(iommu)) {
2408 if (pasid_supported(iommu)) {
2409 int features = pci_pasid_features(pdev);
2411 info->pasid_supported = features | 1;
2414 if (info->ats_supported && ecap_prs(iommu->ecap) &&
2415 pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
2416 info->pri_supported = 1;
2420 spin_lock_irqsave(&device_domain_lock, flags);
2422 found = find_domain(dev);
2425 struct device_domain_info *info2;
2426 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2428 found = info2->domain;
2434 spin_unlock_irqrestore(&device_domain_lock, flags);
2435 free_devinfo_mem(info);
2436 /* Caller must free the original domain */
2440 spin_lock(&iommu->lock);
2441 ret = domain_attach_iommu(domain, iommu);
2442 spin_unlock(&iommu->lock);
2445 spin_unlock_irqrestore(&device_domain_lock, flags);
2446 free_devinfo_mem(info);
2450 list_add(&info->link, &domain->devices);
2451 list_add(&info->global, &device_domain_list);
2453 dev->archdata.iommu = info;
2454 spin_unlock_irqrestore(&device_domain_lock, flags);
2456 /* PASID table is mandatory for a PCI device in scalable mode. */
2457 if (dev && dev_is_pci(dev) && sm_supported(iommu)) {
2458 ret = intel_pasid_alloc_table(dev);
2460 pr_err("PASID table allocation for %s failed\n",
2462 dmar_remove_one_dev_info(domain, dev);
2467 if (dev && domain_context_mapping(domain, dev)) {
2468 pr_err("Domain context map for %s failed\n", dev_name(dev));
2469 dmar_remove_one_dev_info(domain, dev);
2476 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2478 *(u16 *)opaque = alias;
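/*
 * Illustrative usage (a sketch based on the callers below, not new driver
 * logic): get_last_alias() just records each alias it is handed and lets
 * pci_for_each_dma_alias() keep walking, so once the walk finishes the
 * opaque u16 holds the device's topmost (last-visited) DMA alias:
 *
 *	u16 dma_alias = 0;
 *
 *	pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
 *
 * PCI_BUS_NUM(dma_alias) and dma_alias & 0xff are then usable as the
 * bus/devfn of that alias, as in find_or_alloc_domain() below.
 */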
2482 static struct dmar_domain *find_or_alloc_domain(struct device *dev, int gaw)
2484 struct device_domain_info *info = NULL;
2485 struct dmar_domain *domain = NULL;
2486 struct intel_iommu *iommu;
2488 unsigned long flags;
2491 iommu = device_to_iommu(dev, &bus, &devfn);
2495 if (dev_is_pci(dev)) {
2496 struct pci_dev *pdev = to_pci_dev(dev);
2498 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2500 spin_lock_irqsave(&device_domain_lock, flags);
2501 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2502 PCI_BUS_NUM(dma_alias),
2505 iommu = info->iommu;
2506 domain = info->domain;
2508 spin_unlock_irqrestore(&device_domain_lock, flags);
2510 /* DMA alias already has a domain, use it */
2515 /* Allocate and initialize new domain for the device */
2516 domain = alloc_domain(0);
2519 if (domain_init(domain, iommu, gaw)) {
2520 domain_exit(domain);
2529 static struct dmar_domain *set_domain_for_dev(struct device *dev,
2530 struct dmar_domain *domain)
2532 struct intel_iommu *iommu;
2533 struct dmar_domain *tmp;
2534 u16 req_id, dma_alias;
2537 iommu = device_to_iommu(dev, &bus, &devfn);
2541 req_id = ((u16)bus << 8) | devfn;
2543 if (dev_is_pci(dev)) {
2544 struct pci_dev *pdev = to_pci_dev(dev);
2546 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2548 /* register PCI DMA alias device */
2549 if (req_id != dma_alias) {
2550 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2551 dma_alias & 0xff, NULL, domain);
2553 if (!tmp || tmp != domain)
2558 tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2559 if (!tmp || tmp != domain)
2565 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2567 struct dmar_domain *domain, *tmp;
2569 domain = find_domain(dev);
2573 domain = find_or_alloc_domain(dev, gaw);
2577 tmp = set_domain_for_dev(dev, domain);
2578 if (!tmp || domain != tmp) {
2579 domain_exit(domain);
2588 static int iommu_domain_identity_map(struct dmar_domain *domain,
2589 unsigned long long start,
2590 unsigned long long end)
2592 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2593 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2595 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2596 dma_to_mm_pfn(last_vpfn))) {
2597 pr_err("Reserving iova failed\n");
2601 pr_debug("Mapping reserved region %llx-%llx\n", start, end);
2603 * RMRR range might overlap the physical memory range; clear it first.
2606 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2608 return __domain_mapping(domain, first_vpfn, NULL,
2609 first_vpfn, last_vpfn - first_vpfn + 1,
2610 DMA_PTE_READ|DMA_PTE_WRITE);
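/*
 * Worked example (hypothetical values, mirroring the ISA workaround further
 * below): identity-mapping the 0-16MiB window with the helper above covers
 * VT-d pfns 0x0-0xfff, since with 4KiB VT-d pages
 *
 *	first_vpfn = 0x000000 >> VTD_PAGE_SHIFT = 0x0
 *	last_vpfn  = 0xffffff >> VTD_PAGE_SHIFT = 0xfff
 *
 * so __domain_mapping() installs 0x1000 read/write PTEs whose physical pfns
 * equal their IOVA pfns:
 *
 *	iommu_domain_identity_map(si_domain, 0, 16*1024*1024 - 1);
 */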
2613 static int domain_prepare_identity_map(struct device *dev,
2614 struct dmar_domain *domain,
2615 unsigned long long start,
2616 unsigned long long end)
2618 /* For _hardware_ passthrough, don't bother. But for software
2619 passthrough, we do it anyway -- it may indicate a memory
2620 range which is reserved in E820, so which didn't get set
2621 up to start with in si_domain */
2622 if (domain == si_domain && hw_pass_through) {
2623 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2624 dev_name(dev), start, end);
2628 pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2629 dev_name(dev), start, end);
2632 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2633 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2634 dmi_get_system_info(DMI_BIOS_VENDOR),
2635 dmi_get_system_info(DMI_BIOS_VERSION),
2636 dmi_get_system_info(DMI_PRODUCT_VERSION));
2640 if (end >> agaw_to_width(domain->agaw)) {
2641 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2642 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2643 agaw_to_width(domain->agaw),
2644 dmi_get_system_info(DMI_BIOS_VENDOR),
2645 dmi_get_system_info(DMI_BIOS_VERSION),
2646 dmi_get_system_info(DMI_PRODUCT_VERSION));
2650 return iommu_domain_identity_map(domain, start, end);
2653 static int iommu_prepare_identity_map(struct device *dev,
2654 unsigned long long start,
2655 unsigned long long end)
2657 struct dmar_domain *domain;
2660 domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2664 ret = domain_prepare_identity_map(dev, domain, start, end);
2666 domain_exit(domain);
2671 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2674 if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2676 return iommu_prepare_identity_map(dev, rmrr->base_address,
2680 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2681 static inline void iommu_prepare_isa(void)
2683 struct pci_dev *pdev;
2686 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2690 pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2691 ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2694 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2699 static inline void iommu_prepare_isa(void)
2703 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2705 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2707 static int __init si_domain_init(int hw)
2711 si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2715 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2716 domain_exit(si_domain);
2720 pr_debug("Identity mapping domain allocated\n");
2725 for_each_online_node(nid) {
2726 unsigned long start_pfn, end_pfn;
2729 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2730 ret = iommu_domain_identity_map(si_domain,
2731 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2740 static int identity_mapping(struct device *dev)
2742 struct device_domain_info *info;
2744 if (likely(!iommu_identity_mapping))
2747 info = dev->archdata.iommu;
2748 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2749 return (info->domain == si_domain);
2754 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2756 struct dmar_domain *ndomain;
2757 struct intel_iommu *iommu;
2760 iommu = device_to_iommu(dev, &bus, &devfn);
2764 ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2765 if (ndomain != domain)
2771 static bool device_has_rmrr(struct device *dev)
2773 struct dmar_rmrr_unit *rmrr;
2778 for_each_rmrr_units(rmrr) {
2780 * Return TRUE if this RMRR contains the device that is passed in.
2783 for_each_active_dev_scope(rmrr->devices,
2784 rmrr->devices_cnt, i, tmp)
2795 * There are a couple cases where we need to restrict the functionality of
2796 * devices associated with RMRRs. The first is when evaluating a device for
2797 * identity mapping because problems exist when devices are moved in and out
2798 * of domains and their respective RMRR information is lost. This means that
2799 * a device with associated RMRRs will never be in a "passthrough" domain.
2800 * The second is use of the device through the IOMMU API. This interface
2801 * expects to have full control of the IOVA space for the device. We cannot
2802 * satisfy both the requirement that RMRR access is maintained and have an
2803 * unencumbered IOVA space. We also have no ability to quiesce the device's
2804 * use of the RMRR space or even inform the IOMMU API user of the restriction.
2805 * We therefore prevent devices associated with an RMRR from participating in
2806 * the IOMMU API, which eliminates them from device assignment.
2808 * In both cases we assume that PCI USB devices with RMRRs have them largely
2809 * for historical reasons and that the RMRR space is not actively used post
2810 * boot. This exclusion may change if vendors begin to abuse it.
2812 * The same exception is made for graphics devices, with the requirement that
2813 * any use of the RMRR regions will be torn down before assigning the device to a guest.
2816 static bool device_is_rmrr_locked(struct device *dev)
2818 if (!device_has_rmrr(dev))
2821 if (dev_is_pci(dev)) {
2822 struct pci_dev *pdev = to_pci_dev(dev);
2824 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2831 static int iommu_should_identity_map(struct device *dev, int startup)
2834 if (dev_is_pci(dev)) {
2835 struct pci_dev *pdev = to_pci_dev(dev);
2837 if (device_is_rmrr_locked(dev))
2840 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2843 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2846 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2850 * We want to start off with all devices in the 1:1 domain, and
2851 * take them out later if we find they can't access all of memory.
2853 * However, we can't do this for PCI devices behind bridges,
2854 * because all PCI devices behind the same bridge will end up
2855 * with the same source-id on their transactions.
2857 * Practically speaking, we can't change things around for these
2858 * devices at run-time, because we can't be sure there'll be no
2859 * DMA transactions in flight for any of their siblings.
2861 * So PCI devices (unless they're on the root bus) as well as
2862 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2863 * the 1:1 domain, just in _case_ one of their siblings turns out
2864 * not to be able to map all of memory.
2866 if (!pci_is_pcie(pdev)) {
2867 if (!pci_is_root_bus(pdev->bus))
2869 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2871 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2874 if (device_has_rmrr(dev))
2879 * At boot time, we don't yet know if devices will be 64-bit capable.
2880 * Assume that they will — if they turn out not to be, then we can
2881 * take them out of the 1:1 domain later.
2885 * If the device's dma_mask is less than the system's memory
2886 * size then this is not a candidate for identity mapping.
2888 u64 dma_mask = *dev->dma_mask;
2890 if (dev->coherent_dma_mask &&
2891 dev->coherent_dma_mask < dma_mask)
2892 dma_mask = dev->coherent_dma_mask;
2894 return dma_mask >= dma_get_required_mask(dev);
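/*
 * Worked example (hypothetical device, not a statement about any specific
 * hardware): on a host whose RAM tops out above 4GiB,
 * dma_get_required_mask() reports at least DMA_BIT_MASK(33). A device whose
 * effective mask (dma_mask, or a smaller coherent_dma_mask) is only
 * DMA_BIT_MASK(32) then fails the comparison above and is left out of the
 * 1:1 domain, while a 64-bit capable device passes and stays
 * identity-mapped.
 */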
2900 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2904 if (!iommu_should_identity_map(dev, 1))
2907 ret = domain_add_dev_info(si_domain, dev);
2909 pr_info("%s identity mapping for device %s\n",
2910 hw ? "Hardware" : "Software", dev_name(dev));
2911 else if (ret == -ENODEV)
2912 /* device not associated with an iommu */
2919 static int __init iommu_prepare_static_identity_mapping(int hw)
2921 struct pci_dev *pdev = NULL;
2922 struct dmar_drhd_unit *drhd;
2923 struct intel_iommu *iommu;
2928 for_each_pci_dev(pdev) {
2929 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2934 for_each_active_iommu(iommu, drhd)
2935 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2936 struct acpi_device_physical_node *pn;
2937 struct acpi_device *adev;
2939 if (dev->bus != &acpi_bus_type)
2942 adev = to_acpi_device(dev);
2943 mutex_lock(&adev->physical_node_lock);
2944 list_for_each_entry(pn, &adev->physical_node_list, node) {
2945 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2949 mutex_unlock(&adev->physical_node_lock);
2957 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2960 * Start from the sane iommu hardware state.
2961 * If the queued invalidation is already initialized by us
2962 * (for example, while enabling interrupt-remapping) then
2963 * we already have things rolling from a sane state.
2967 * Clear any previous faults.
2969 dmar_fault(-1, iommu);
2971 * Disable queued invalidation if supported and already enabled
2972 * before OS handover.
2974 dmar_disable_qi(iommu);
2977 if (dmar_enable_qi(iommu)) {
2979 * Queued Invalidate not enabled, use Register Based Invalidate
2981 iommu->flush.flush_context = __iommu_flush_context;
2982 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2983 pr_info("%s: Using Register based invalidation\n",
2986 iommu->flush.flush_context = qi_flush_context;
2987 iommu->flush.flush_iotlb = qi_flush_iotlb;
2988 pr_info("%s: Using Queued invalidation\n", iommu->name);
2992 static int copy_context_table(struct intel_iommu *iommu,
2993 struct root_entry *old_re,
2994 struct context_entry **tbl,
2997 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2998 struct context_entry *new_ce = NULL, ce;
2999 struct context_entry *old_ce = NULL;
3000 struct root_entry re;
3001 phys_addr_t old_ce_phys;
3003 tbl_idx = ext ? bus * 2 : bus;
3004 memcpy(&re, old_re, sizeof(re));
3006 for (devfn = 0; devfn < 256; devfn++) {
3007 /* First calculate the correct index */
3008 idx = (ext ? devfn * 2 : devfn) % 256;
3011 /* First save what we may have and clean up */
3013 tbl[tbl_idx] = new_ce;
3014 __iommu_flush_cache(iommu, new_ce,
3024 old_ce_phys = root_entry_lctp(&re);
3026 old_ce_phys = root_entry_uctp(&re);
3029 if (ext && devfn == 0) {
3030 /* No LCTP, try UCTP */
3039 old_ce = memremap(old_ce_phys, PAGE_SIZE,
3044 new_ce = alloc_pgtable_page(iommu->node);
3051 /* Now copy the context entry */
3052 memcpy(&ce, old_ce + idx, sizeof(ce));
3054 if (!__context_present(&ce))
3057 did = context_domain_id(&ce);
3058 if (did >= 0 && did < cap_ndoms(iommu->cap))
3059 set_bit(did, iommu->domain_ids);
3062 * We need a marker for copied context entries. This
3063 * marker needs to work for the old format as well as
3064 * for extended context entries.
3066 * Bit 67 of the context entry is used. In the old
3067 * format this bit is available to software; in the
3068 * extended format it is the PGE bit, but PGE is ignored
3069 * by HW if PASIDs are disabled (and thus still available).
3072 * So disable PASIDs first and then mark the entry
3073 * copied. This means that we don't copy PASID
3074 * translations from the old kernel, but this is fine as
3075 * faults there are not fatal.
3077 context_clear_pasid_enable(&ce);
3078 context_set_copied(&ce);
3083 tbl[tbl_idx + pos] = new_ce;
3085 __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3094 static int copy_translation_tables(struct intel_iommu *iommu)
3096 struct context_entry **ctxt_tbls;
3097 struct root_entry *old_rt;
3098 phys_addr_t old_rt_phys;
3099 int ctxt_table_entries;
3100 unsigned long flags;
3105 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3106 ext = !!(rtaddr_reg & DMA_RTADDR_RTT);
3107 new_ext = !!ecap_ecs(iommu->ecap);
3110 * The RTT bit can only be changed when translation is disabled,
3111 * but disabling translation would open a window for data
3112 * corruption. So bail out and don't copy anything if we would
3113 * have to change the bit.
3118 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3122 old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3126 /* This is too big for the stack - allocate it from slab */
3127 ctxt_table_entries = ext ? 512 : 256;
3129 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
3133 for (bus = 0; bus < 256; bus++) {
3134 ret = copy_context_table(iommu, &old_rt[bus],
3135 ctxt_tbls, bus, ext);
3137 pr_err("%s: Failed to copy context table for bus %d\n",
3143 spin_lock_irqsave(&iommu->lock, flags);
3145 /* Context tables are copied, now write them to the root_entry table */
3146 for (bus = 0; bus < 256; bus++) {
3147 int idx = ext ? bus * 2 : bus;
3150 if (ctxt_tbls[idx]) {
3151 val = virt_to_phys(ctxt_tbls[idx]) | 1;
3152 iommu->root_entry[bus].lo = val;
3155 if (!ext || !ctxt_tbls[idx + 1])
3158 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3159 iommu->root_entry[bus].hi = val;
3162 spin_unlock_irqrestore(&iommu->lock, flags);
3166 __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
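/*
 * Illustrative example (hypothetical bus number): in extended root/context
 * mode each bus gets two copied context tables, one per half of the devfn
 * space, so for bus 3 the loop above uses idx = 3 * 2 = 6 and writes
 *
 *	root_entry[3].lo = virt_to_phys(ctxt_tbls[6]) | 1;	(devfn 0-127)
 *	root_entry[3].hi = virt_to_phys(ctxt_tbls[7]) | 1;	(devfn 128-255)
 *
 * with the low bit acting as the present bit; in legacy mode only .lo is
 * populated, from ctxt_tbls[3].
 */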
3176 static int __init init_dmars(void)
3178 struct dmar_drhd_unit *drhd;
3179 struct dmar_rmrr_unit *rmrr;
3180 bool copied_tables = false;
3182 struct intel_iommu *iommu;
3188 * initialize and program root entry to not present
3191 for_each_drhd_unit(drhd) {
3193 * lock not needed as this is only incremented in the single-
3194 * threaded kernel __init code path; all other accesses are read-only
3197 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
3201 pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
3204 /* Preallocate enough resources for IOMMU hot-addition */
3205 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
3206 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
3208 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
3211 pr_err("Allocating global iommu array failed\n");
3216 for_each_active_iommu(iommu, drhd) {
3218 * Find the max pasid size of all IOMMUs in the system.
3219 * We need to ensure the system pasid table is no bigger
3220 * than the smallest supported.
3222 if (pasid_supported(iommu)) {
3223 u32 temp = 2 << ecap_pss(iommu->ecap);
3225 intel_pasid_max_id = min_t(u32, temp,
3226 intel_pasid_max_id);
3229 g_iommus[iommu->seq_id] = iommu;
3231 intel_iommu_init_qi(iommu);
3233 ret = iommu_init_domains(iommu);
3237 init_translation_status(iommu);
3239 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
3240 iommu_disable_translation(iommu);
3241 clear_translation_pre_enabled(iommu);
3242 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
3248 * we could share the same root & context tables
3249 * among all IOMMUs; we need to split it later.
3251 ret = iommu_alloc_root_entry(iommu);
3255 if (translation_pre_enabled(iommu)) {
3256 pr_info("Translation already enabled - trying to copy translation structures\n");
3258 ret = copy_translation_tables(iommu);
3261 * We found the IOMMU with translation
3262 * enabled - but failed to copy over the
3263 * old root-entry table. Try to proceed
3264 * by disabling translation now and
3265 * allocating a clean root-entry table.
3266 * This might cause DMAR faults, but
3267 * probably the dump will still succeed.
3269 pr_err("Failed to copy translation tables from previous kernel for %s\n",
3271 iommu_disable_translation(iommu);
3272 clear_translation_pre_enabled(iommu);
3274 pr_info("Copied translation tables from previous kernel for %s\n",
3276 copied_tables = true;
3280 if (!ecap_pass_through(iommu->ecap))
3281 hw_pass_through = 0;
3282 #ifdef CONFIG_INTEL_IOMMU_SVM
3283 if (pasid_supported(iommu))
3284 intel_svm_init(iommu);
3289 * Now that qi is enabled on all iommus, set the root entry and flush
3290 * caches. This is required on some Intel X58 chipsets, otherwise the
3291 * flush_context function will loop forever and the boot hangs.
3293 for_each_active_iommu(iommu, drhd) {
3294 iommu_flush_write_buffer(iommu);
3295 iommu_set_root_entry(iommu);
3296 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3297 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3300 if (iommu_pass_through)
3301 iommu_identity_mapping |= IDENTMAP_ALL;
3303 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3304 iommu_identity_mapping |= IDENTMAP_GFX;
3307 check_tylersburg_isoch();
3309 if (iommu_identity_mapping) {
3310 ret = si_domain_init(hw_pass_through);
3317 * If we copied translations from a previous kernel in the kdump
3318 * case, we cannot assign the devices to domains now, as that
3319 * would eliminate the old mappings. So skip this part and defer
3320 * the assignment to device driver initialization time.
3326 * If pass-through is not set or not enabled, set up context entries for
3327 * identity mappings for rmrr, gfx, and isa, possibly falling back to static
3328 * identity mapping if iommu_identity_mapping is set.
3330 if (iommu_identity_mapping) {
3331 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
3333 pr_crit("Failed to setup IOMMU pass-through\n");
3339 * for each dev attached to rmrr
3341 * locate drhd for dev, alloc domain for dev
3342 * allocate free domain
3343 * allocate page table entries for rmrr
3344 * if context not allocated for bus
3345 * allocate and init context
3346 * set present in root table for this bus
3347 * init context with domain, translation etc
3351 pr_info("Setting RMRR:\n");
3352 for_each_rmrr_units(rmrr) {
3353 /* some BIOSes list non-existent devices in the DMAR table. */
3354 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3356 ret = iommu_prepare_rmrr_dev(rmrr, dev);
3358 pr_err("Mapping reserved region failed\n");
3362 iommu_prepare_isa();
3369 * global invalidate context cache
3370 * global invalidate iotlb
3371 * enable translation
3373 for_each_iommu(iommu, drhd) {
3374 if (drhd->ignored) {
3376 * we always have to disable PMRs or DMA may fail on this device
3380 iommu_disable_protect_mem_regions(iommu);
3384 iommu_flush_write_buffer(iommu);
3386 #ifdef CONFIG_INTEL_IOMMU_SVM
3387 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3388 ret = intel_svm_enable_prq(iommu);
3393 ret = dmar_set_interrupt(iommu);
3397 if (!translation_pre_enabled(iommu))
3398 iommu_enable_translation(iommu);
3400 iommu_disable_protect_mem_regions(iommu);
3406 for_each_active_iommu(iommu, drhd) {
3407 disable_dmar_iommu(iommu);
3408 free_dmar_iommu(iommu);
3417 /* This takes a number of _MM_ pages, not VTD pages */
3418 static unsigned long intel_alloc_iova(struct device *dev,
3419 struct dmar_domain *domain,
3420 unsigned long nrpages, uint64_t dma_mask)
3422 unsigned long iova_pfn = 0;
3424 /* Restrict dma_mask to the width that the iommu can handle */
3425 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3426 /* Ensure we reserve the whole size-aligned region */
3427 nrpages = __roundup_pow_of_two(nrpages);
3429 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3431 * First try to allocate an io virtual address in
3432 * DMA_BIT_MASK(32) and if that fails then try allocating from the higher range
3435 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3436 IOVA_PFN(DMA_BIT_MASK(32)), false);
3440 iova_pfn = alloc_iova_fast(&domain->iovad, nrpages,
3441 IOVA_PFN(dma_mask), true);
3442 if (unlikely(!iova_pfn)) {
3443 pr_err("Allocating %ld-page iova for %s failed",
3444 nrpages, dev_name(dev));
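/*
 * Illustrative example (hypothetical request): asking for 3 MM pages is
 * rounded up to a size-aligned region of 4 pages by __roundup_pow_of_two(),
 * and for a 64-bit dma_mask (and no forcedac) the allocator first tries to
 * place the IOVA below 4GiB before retrying with the full mask:
 *
 *	iova_pfn = intel_alloc_iova(dev, domain, 3, DMA_BIT_MASK(64));
 */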
3451 struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3453 struct dmar_domain *domain, *tmp;
3454 struct dmar_rmrr_unit *rmrr;
3455 struct device *i_dev;
3458 domain = find_domain(dev);
3462 domain = find_or_alloc_domain(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
3466 /* We have a new domain - setup possible RMRRs for the device */
3468 for_each_rmrr_units(rmrr) {
3469 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3474 ret = domain_prepare_identity_map(dev, domain,
3478 dev_err(dev, "Mapping reserved region failed\n");
3483 tmp = set_domain_for_dev(dev, domain);
3484 if (!tmp || domain != tmp) {
3485 domain_exit(domain);
3492 pr_err("Allocating domain for %s failed\n", dev_name(dev));
3498 /* Check if the dev needs to go through the non-identity map and unmap process. */
3499 static int iommu_no_mapping(struct device *dev)
3503 if (iommu_dummy(dev))
3506 if (!iommu_identity_mapping)
3509 found = identity_mapping(dev);
3511 if (iommu_should_identity_map(dev, 0))
3515 * The 32-bit DMA device is removed from si_domain and falls back
3516 * to non-identity mapping.
3518 dmar_remove_one_dev_info(si_domain, dev);
3519 pr_info("32bit %s uses non-identity mapping\n",
3525 * In case a 64-bit DMA device is detached from a VM, the device
3526 * is put into si_domain for identity mapping.
3528 if (iommu_should_identity_map(dev, 0)) {
3530 ret = domain_add_dev_info(si_domain, dev);
3532 pr_info("64bit %s uses identity mapping\n",
3542 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3543 size_t size, int dir, u64 dma_mask)
3545 struct dmar_domain *domain;
3546 phys_addr_t start_paddr;
3547 unsigned long iova_pfn;
3550 struct intel_iommu *iommu;
3551 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3553 BUG_ON(dir == DMA_NONE);
3555 if (iommu_no_mapping(dev))
3558 domain = get_valid_domain_for_dev(dev);
3562 iommu = domain_get_iommu(domain);
3563 size = aligned_nrpages(paddr, size);
3565 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3570 * Check if DMAR supports zero-length reads on write-only mappings
3573 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3574 !cap_zlr(iommu->cap))
3575 prot |= DMA_PTE_READ;
3576 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3577 prot |= DMA_PTE_WRITE;
3579 * paddr to (paddr + size) might span a partial page, so we should map
3580 * the whole page. Note: if two parts of one page are mapped separately,
3581 * we might end up with two guest addresses mapping to the same host paddr,
3582 * but this is not a big problem
3584 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova_pfn),
3585 mm_to_dma_pfn(paddr_pfn), size, prot);
3589 start_paddr = (phys_addr_t)iova_pfn << PAGE_SHIFT;
3590 start_paddr += paddr & ~PAGE_MASK;
3595 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3596 pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
3597 dev_name(dev), size, (unsigned long long)paddr, dir);
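/*
 * Worked example (hypothetical values): mapping size 0x2000 starting at
 * paddr 0x1234 touches three 4KiB pages, so aligned_nrpages(0x1234, 0x2000)
 * yields 3 and three PTEs are installed; the returned handle is
 *
 *	start_paddr = ((phys_addr_t)iova_pfn << PAGE_SHIFT) + 0x234
 *
 * i.e. the sub-page offset (paddr & ~PAGE_MASK) is added back so the caller
 * sees the same offset it passed in.
 */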
3601 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3602 unsigned long offset, size_t size,
3603 enum dma_data_direction dir,
3604 unsigned long attrs)
3606 return __intel_map_single(dev, page_to_phys(page) + offset, size,
3607 dir, *dev->dma_mask);
3610 static void intel_unmap(struct device *dev, dma_addr_t dev_addr, size_t size)
3612 struct dmar_domain *domain;
3613 unsigned long start_pfn, last_pfn;
3614 unsigned long nrpages;
3615 unsigned long iova_pfn;
3616 struct intel_iommu *iommu;
3617 struct page *freelist;
3619 if (iommu_no_mapping(dev))
3622 domain = find_domain(dev);
3625 iommu = domain_get_iommu(domain);
3627 iova_pfn = IOVA_PFN(dev_addr);
3629 nrpages = aligned_nrpages(dev_addr, size);
3630 start_pfn = mm_to_dma_pfn(iova_pfn);
3631 last_pfn = start_pfn + nrpages - 1;
3633 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3634 dev_name(dev), start_pfn, last_pfn);
3636 freelist = domain_unmap(domain, start_pfn, last_pfn);
3638 if (intel_iommu_strict) {
3639 iommu_flush_iotlb_psi(iommu, domain, start_pfn,
3640 nrpages, !freelist, 0);
3642 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(nrpages));
3643 dma_free_pagelist(freelist);
3645 queue_iova(&domain->iovad, iova_pfn, nrpages,
3646 (unsigned long)freelist);
3648 * queue up the release of the unmap to save the 1/6th of the
3649 * cpu used up by the iotlb flush operation...
3654 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3655 size_t size, enum dma_data_direction dir,
3656 unsigned long attrs)
3658 intel_unmap(dev, dev_addr, size);
3661 static void *intel_alloc_coherent(struct device *dev, size_t size,
3662 dma_addr_t *dma_handle, gfp_t flags,
3663 unsigned long attrs)
3665 struct page *page = NULL;
3668 size = PAGE_ALIGN(size);
3669 order = get_order(size);
3671 if (!iommu_no_mapping(dev))
3672 flags &= ~(GFP_DMA | GFP_DMA32);
3673 else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3674 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3680 if (gfpflags_allow_blocking(flags)) {
3681 unsigned int count = size >> PAGE_SHIFT;
3683 page = dma_alloc_from_contiguous(dev, count, order,
3684 flags & __GFP_NOWARN);
3685 if (page && iommu_no_mapping(dev) &&
3686 page_to_phys(page) + size > dev->coherent_dma_mask) {
3687 dma_release_from_contiguous(dev, page, count);
3693 page = alloc_pages(flags, order);
3696 memset(page_address(page), 0, size);
3698 *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3700 dev->coherent_dma_mask);
3702 return page_address(page);
3703 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3704 __free_pages(page, order);
3709 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3710 dma_addr_t dma_handle, unsigned long attrs)
3713 struct page *page = virt_to_page(vaddr);
3715 size = PAGE_ALIGN(size);
3716 order = get_order(size);
3718 intel_unmap(dev, dma_handle, size);
3719 if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3720 __free_pages(page, order);
3723 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3724 int nelems, enum dma_data_direction dir,
3725 unsigned long attrs)
3727 dma_addr_t startaddr = sg_dma_address(sglist) & PAGE_MASK;
3728 unsigned long nrpages = 0;
3729 struct scatterlist *sg;
3732 for_each_sg(sglist, sg, nelems, i) {
3733 nrpages += aligned_nrpages(sg_dma_address(sg), sg_dma_len(sg));
3736 intel_unmap(dev, startaddr, nrpages << VTD_PAGE_SHIFT);
3739 static int intel_nontranslate_map_sg(struct device *hddev,
3740 struct scatterlist *sglist, int nelems, int dir)
3743 struct scatterlist *sg;
3745 for_each_sg(sglist, sg, nelems, i) {
3746 BUG_ON(!sg_page(sg));
3747 sg->dma_address = sg_phys(sg);
3748 sg->dma_length = sg->length;
3753 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3754 enum dma_data_direction dir, unsigned long attrs)
3757 struct dmar_domain *domain;
3760 unsigned long iova_pfn;
3762 struct scatterlist *sg;
3763 unsigned long start_vpfn;
3764 struct intel_iommu *iommu;
3766 BUG_ON(dir == DMA_NONE);
3767 if (iommu_no_mapping(dev))
3768 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3770 domain = get_valid_domain_for_dev(dev);
3774 iommu = domain_get_iommu(domain);
3776 for_each_sg(sglist, sg, nelems, i)
3777 size += aligned_nrpages(sg->offset, sg->length);
3779 iova_pfn = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3782 sglist->dma_length = 0;
3787 * Check if DMAR supports zero-length reads on write-only mappings
3790 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3791 !cap_zlr(iommu->cap))
3792 prot |= DMA_PTE_READ;
3793 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3794 prot |= DMA_PTE_WRITE;
3796 start_vpfn = mm_to_dma_pfn(iova_pfn);
3798 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3799 if (unlikely(ret)) {
3800 dma_pte_free_pagetable(domain, start_vpfn,
3801 start_vpfn + size - 1,
3802 agaw_to_level(domain->agaw) + 1);
3803 free_iova_fast(&domain->iovad, iova_pfn, dma_to_mm_pfn(size));
3810 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3815 static const struct dma_map_ops intel_dma_ops = {
3816 .alloc = intel_alloc_coherent,
3817 .free = intel_free_coherent,
3818 .map_sg = intel_map_sg,
3819 .unmap_sg = intel_unmap_sg,
3820 .map_page = intel_map_page,
3821 .unmap_page = intel_unmap_page,
3822 .mapping_error = intel_mapping_error,
3823 .dma_supported = dma_direct_supported,
3826 static inline int iommu_domain_cache_init(void)
3830 iommu_domain_cache = kmem_cache_create("iommu_domain",
3831 sizeof(struct dmar_domain),
3836 if (!iommu_domain_cache) {
3837 pr_err("Couldn't create iommu_domain cache\n");
3844 static inline int iommu_devinfo_cache_init(void)
3848 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3849 sizeof(struct device_domain_info),
3853 if (!iommu_devinfo_cache) {
3854 pr_err("Couldn't create devinfo cache\n");
3861 static int __init iommu_init_mempool(void)
3864 ret = iova_cache_get();
3868 ret = iommu_domain_cache_init();
3872 ret = iommu_devinfo_cache_init();
3876 kmem_cache_destroy(iommu_domain_cache);
3883 static void __init iommu_exit_mempool(void)
3885 kmem_cache_destroy(iommu_devinfo_cache);
3886 kmem_cache_destroy(iommu_domain_cache);
3890 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3892 struct dmar_drhd_unit *drhd;
3896 /* We know that this device on this chipset has its own IOMMU.
3897 * If we find it under a different IOMMU, then the BIOS is lying
3898 * to us. Hope that the IOMMU for this device is actually
3899 * disabled, and it needs no translation...
3901 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3903 /* "can't" happen */
3904 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3907 vtbar &= 0xffff0000;
3909 /* we know that this iommu should be at offset 0xa000 from vtbar */
3910 drhd = dmar_find_matched_drhd_unit(pdev);
3911 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3912 TAINT_FIRMWARE_WORKAROUND,
3913 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3914 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3916 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3918 static void __init init_no_remapping_devices(void)
3920 struct dmar_drhd_unit *drhd;
3924 for_each_drhd_unit(drhd) {
3925 if (!drhd->include_all) {
3926 for_each_active_dev_scope(drhd->devices,
3927 drhd->devices_cnt, i, dev)
3929 /* ignore DMAR unit if no devices exist */
3930 if (i == drhd->devices_cnt)
3935 for_each_active_drhd_unit(drhd) {
3936 if (drhd->include_all)
3939 for_each_active_dev_scope(drhd->devices,
3940 drhd->devices_cnt, i, dev)
3941 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3943 if (i < drhd->devices_cnt)
3946 /* This IOMMU has *only* gfx devices. Either bypass it or
3947 set the gfx_mapped flag, as appropriate */
3949 intel_iommu_gfx_mapped = 1;
3952 for_each_active_dev_scope(drhd->devices,
3953 drhd->devices_cnt, i, dev)
3954 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3959 #ifdef CONFIG_SUSPEND
3960 static int init_iommu_hw(void)
3962 struct dmar_drhd_unit *drhd;
3963 struct intel_iommu *iommu = NULL;
3965 for_each_active_iommu(iommu, drhd)
3967 dmar_reenable_qi(iommu);
3969 for_each_iommu(iommu, drhd) {
3970 if (drhd->ignored) {
3972 * we always have to disable PMRs or DMA may fail on this device
3976 iommu_disable_protect_mem_regions(iommu);
3980 iommu_flush_write_buffer(iommu);
3982 iommu_set_root_entry(iommu);
3984 iommu->flush.flush_context(iommu, 0, 0, 0,
3985 DMA_CCMD_GLOBAL_INVL);
3986 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3987 iommu_enable_translation(iommu);
3988 iommu_disable_protect_mem_regions(iommu);
3994 static void iommu_flush_all(void)
3996 struct dmar_drhd_unit *drhd;
3997 struct intel_iommu *iommu;
3999 for_each_active_iommu(iommu, drhd) {
4000 iommu->flush.flush_context(iommu, 0, 0, 0,
4001 DMA_CCMD_GLOBAL_INVL);
4002 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
4003 DMA_TLB_GLOBAL_FLUSH);
4007 static int iommu_suspend(void)
4009 struct dmar_drhd_unit *drhd;
4010 struct intel_iommu *iommu = NULL;
4013 for_each_active_iommu(iommu, drhd) {
4014 iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
4016 if (!iommu->iommu_state)
4022 for_each_active_iommu(iommu, drhd) {
4023 iommu_disable_translation(iommu);
4025 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4027 iommu->iommu_state[SR_DMAR_FECTL_REG] =
4028 readl(iommu->reg + DMAR_FECTL_REG);
4029 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
4030 readl(iommu->reg + DMAR_FEDATA_REG);
4031 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
4032 readl(iommu->reg + DMAR_FEADDR_REG);
4033 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
4034 readl(iommu->reg + DMAR_FEUADDR_REG);
4036 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4041 for_each_active_iommu(iommu, drhd)
4042 kfree(iommu->iommu_state);
4047 static void iommu_resume(void)
4049 struct dmar_drhd_unit *drhd;
4050 struct intel_iommu *iommu = NULL;
4053 if (init_iommu_hw()) {
4055 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
4057 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
4061 for_each_active_iommu(iommu, drhd) {
4063 raw_spin_lock_irqsave(&iommu->register_lock, flag);
4065 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
4066 iommu->reg + DMAR_FECTL_REG);
4067 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
4068 iommu->reg + DMAR_FEDATA_REG);
4069 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
4070 iommu->reg + DMAR_FEADDR_REG);
4071 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
4072 iommu->reg + DMAR_FEUADDR_REG);
4074 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
4077 for_each_active_iommu(iommu, drhd)
4078 kfree(iommu->iommu_state);
4081 static struct syscore_ops iommu_syscore_ops = {
4082 .resume = iommu_resume,
4083 .suspend = iommu_suspend,
4086 static void __init init_iommu_pm_ops(void)
4088 register_syscore_ops(&iommu_syscore_ops);
4092 static inline void init_iommu_pm_ops(void) {}
4093 #endif /* CONFIG_PM */
4096 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4098 struct acpi_dmar_reserved_memory *rmrr;
4099 int prot = DMA_PTE_READ|DMA_PTE_WRITE;
4100 struct dmar_rmrr_unit *rmrru;
4103 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4107 rmrru->hdr = header;
4108 rmrr = (struct acpi_dmar_reserved_memory *)header;
4109 rmrru->base_address = rmrr->base_address;
4110 rmrru->end_address = rmrr->end_address;
4112 length = rmrr->end_address - rmrr->base_address + 1;
4113 rmrru->resv = iommu_alloc_resv_region(rmrr->base_address, length, prot,
4118 rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4119 ((void *)rmrr) + rmrr->header.length,
4120 &rmrru->devices_cnt);
4121 if (rmrru->devices_cnt && rmrru->devices == NULL)
4124 list_add(&rmrru->list, &dmar_rmrr_units);
4135 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4137 struct dmar_atsr_unit *atsru;
4138 struct acpi_dmar_atsr *tmp;
4140 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4141 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4142 if (atsr->segment != tmp->segment)
4144 if (atsr->header.length != tmp->header.length)
4146 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4153 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4155 struct acpi_dmar_atsr *atsr;
4156 struct dmar_atsr_unit *atsru;
4158 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
4161 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4162 atsru = dmar_find_atsr(atsr);
4166 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
4171 * If memory is allocated from slab by ACPI _DSM method, we need to
4172 * copy the memory content because the memory buffer will be freed on exit.
4175 atsru->hdr = (void *)(atsru + 1);
4176 memcpy(atsru->hdr, hdr, hdr->length);
4177 atsru->include_all = atsr->flags & 0x1;
4178 if (!atsru->include_all) {
4179 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
4180 (void *)atsr + atsr->header.length,
4181 &atsru->devices_cnt);
4182 if (atsru->devices_cnt && atsru->devices == NULL) {
4188 list_add_rcu(&atsru->list, &dmar_atsr_units);
4193 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
4195 dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
4199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4201 struct acpi_dmar_atsr *atsr;
4202 struct dmar_atsr_unit *atsru;
4204 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4205 atsru = dmar_find_atsr(atsr);
4207 list_del_rcu(&atsru->list);
4209 intel_iommu_free_atsr(atsru);
4215 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4219 struct acpi_dmar_atsr *atsr;
4220 struct dmar_atsr_unit *atsru;
4222 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4223 atsru = dmar_find_atsr(atsr);
4227 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
4228 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
4236 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
4239 struct intel_iommu *iommu = dmaru->iommu;
4241 if (g_iommus[iommu->seq_id])
4244 if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
4245 pr_warn("%s: Doesn't support hardware pass through.\n",
4249 if (!ecap_sc_support(iommu->ecap) &&
4250 domain_update_iommu_snooping(iommu)) {
4251 pr_warn("%s: Doesn't support snooping.\n",
4255 sp = domain_update_iommu_superpage(iommu) - 1;
4256 if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
4257 pr_warn("%s: Doesn't support large page.\n",
4263 * Disable translation if already enabled prior to OS handover.
4265 if (iommu->gcmd & DMA_GCMD_TE)
4266 iommu_disable_translation(iommu);
4268 g_iommus[iommu->seq_id] = iommu;
4269 ret = iommu_init_domains(iommu);
4271 ret = iommu_alloc_root_entry(iommu);
4275 #ifdef CONFIG_INTEL_IOMMU_SVM
4276 if (pasid_supported(iommu))
4277 intel_svm_init(iommu);
4280 if (dmaru->ignored) {
4282 * we always have to disable PMRs or DMA may fail on this device
4285 iommu_disable_protect_mem_regions(iommu);
4289 intel_iommu_init_qi(iommu);
4290 iommu_flush_write_buffer(iommu);
4292 #ifdef CONFIG_INTEL_IOMMU_SVM
4293 if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
4294 ret = intel_svm_enable_prq(iommu);
4299 ret = dmar_set_interrupt(iommu);
4303 iommu_set_root_entry(iommu);
4304 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
4305 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
4306 iommu_enable_translation(iommu);
4308 iommu_disable_protect_mem_regions(iommu);
4312 disable_dmar_iommu(iommu);
4314 free_dmar_iommu(iommu);
4318 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4321 struct intel_iommu *iommu = dmaru->iommu;
4323 if (!intel_iommu_enabled)
4329 ret = intel_iommu_add(dmaru);
4331 disable_dmar_iommu(iommu);
4332 free_dmar_iommu(iommu);
4338 static void intel_iommu_free_dmars(void)
4340 struct dmar_rmrr_unit *rmrru, *rmrr_n;
4341 struct dmar_atsr_unit *atsru, *atsr_n;
4343 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4344 list_del(&rmrru->list);
4345 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4350 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4351 list_del(&atsru->list);
4352 intel_iommu_free_atsr(atsru);
4356 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
4359 struct pci_bus *bus;
4360 struct pci_dev *bridge = NULL;
4362 struct acpi_dmar_atsr *atsr;
4363 struct dmar_atsr_unit *atsru;
4365 dev = pci_physfn(dev);
4366 for (bus = dev->bus; bus; bus = bus->parent) {
4368 /* If it's an integrated device, allow ATS */
4371 /* Connected via non-PCIe: no ATS */
4372 if (!pci_is_pcie(bridge) ||
4373 pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
4375 /* If we found the root port, look it up in the ATSR */
4376 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
4381 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4382 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4383 if (atsr->segment != pci_domain_nr(dev->bus))
4386 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
4387 if (tmp == &bridge->dev)
4390 if (atsru->include_all)
4400 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
4403 struct dmar_rmrr_unit *rmrru;
4404 struct dmar_atsr_unit *atsru;
4405 struct acpi_dmar_atsr *atsr;
4406 struct acpi_dmar_reserved_memory *rmrr;
4408 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
4411 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
4412 rmrr = container_of(rmrru->hdr,
4413 struct acpi_dmar_reserved_memory, header);
4414 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4415 ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
4416 ((void *)rmrr) + rmrr->header.length,
4417 rmrr->segment, rmrru->devices,
4418 rmrru->devices_cnt);
4421 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4422 dmar_remove_dev_scope(info, rmrr->segment,
4423 rmrru->devices, rmrru->devices_cnt);
4427 list_for_each_entry(atsru, &dmar_atsr_units, list) {
4428 if (atsru->include_all)
4431 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
4432 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
4433 ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
4434 (void *)atsr + atsr->header.length,
4435 atsr->segment, atsru->devices,
4436 atsru->devices_cnt);
4441 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
4442 if (dmar_remove_dev_scope(info, atsr->segment,
4443 atsru->devices, atsru->devices_cnt))
4452 * Here we only respond to the action of a device being unbound from its driver.
4454 * A newly added device is not attached to its DMAR domain here yet; that will
4455 * happen when the device is mapped to an iova.
4457 static int device_notifier(struct notifier_block *nb,
4458 unsigned long action, void *data)
4460 struct device *dev = data;
4461 struct dmar_domain *domain;
4463 if (iommu_dummy(dev))
4466 if (action != BUS_NOTIFY_REMOVED_DEVICE)
4469 domain = find_domain(dev);
4473 dmar_remove_one_dev_info(domain, dev);
4474 if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4475 domain_exit(domain);
4480 static struct notifier_block device_nb = {
4481 .notifier_call = device_notifier,
4484 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4485 unsigned long val, void *v)
4487 struct memory_notify *mhp = v;
4488 unsigned long long start, end;
4489 unsigned long start_vpfn, last_vpfn;
4492 case MEM_GOING_ONLINE:
4493 start = mhp->start_pfn << PAGE_SHIFT;
4494 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4495 if (iommu_domain_identity_map(si_domain, start, end)) {
4496 pr_warn("Failed to build identity map for [%llx-%llx]\n",
4503 case MEM_CANCEL_ONLINE:
4504 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4505 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4506 while (start_vpfn <= last_vpfn) {
4508 struct dmar_drhd_unit *drhd;
4509 struct intel_iommu *iommu;
4510 struct page *freelist;
4512 iova = find_iova(&si_domain->iovad, start_vpfn);
4514 pr_debug("Failed get IOVA for PFN %lx\n",
4519 iova = split_and_remove_iova(&si_domain->iovad, iova,
4520 start_vpfn, last_vpfn);
4522 pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
4523 start_vpfn, last_vpfn);
4527 freelist = domain_unmap(si_domain, iova->pfn_lo,
4531 for_each_active_iommu(iommu, drhd)
4532 iommu_flush_iotlb_psi(iommu, si_domain,
4533 iova->pfn_lo, iova_size(iova),
4536 dma_free_pagelist(freelist);
4538 start_vpfn = iova->pfn_hi + 1;
4539 free_iova_mem(iova);
4547 static struct notifier_block intel_iommu_memory_nb = {
4548 .notifier_call = intel_iommu_memory_notifier,
4552 static void free_all_cpu_cached_iovas(unsigned int cpu)
4556 for (i = 0; i < g_num_of_iommus; i++) {
4557 struct intel_iommu *iommu = g_iommus[i];
4558 struct dmar_domain *domain;
4564 for (did = 0; did < cap_ndoms(iommu->cap); did++) {
4565 domain = get_iommu_domain(iommu, (u16)did);
4569 free_cpu_cached_iovas(cpu, &domain->iovad);
4574 static int intel_iommu_cpu_dead(unsigned int cpu)
4576 free_all_cpu_cached_iovas(cpu);
4580 static void intel_disable_iommus(void)
4582 struct intel_iommu *iommu = NULL;
4583 struct dmar_drhd_unit *drhd;
4585 for_each_iommu(iommu, drhd)
4586 iommu_disable_translation(iommu);
4589 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
4591 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
4593 return container_of(iommu_dev, struct intel_iommu, iommu);
4596 static ssize_t intel_iommu_show_version(struct device *dev,
4597 struct device_attribute *attr,
4600 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4601 u32 ver = readl(iommu->reg + DMAR_VER_REG);
4602 return sprintf(buf, "%d:%d\n",
4603 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4605 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4607 static ssize_t intel_iommu_show_address(struct device *dev,
4608 struct device_attribute *attr,
4611 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4612 return sprintf(buf, "%llx\n", iommu->reg_phys);
4614 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4616 static ssize_t intel_iommu_show_cap(struct device *dev,
4617 struct device_attribute *attr,
4620 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4621 return sprintf(buf, "%llx\n", iommu->cap);
4623 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4625 static ssize_t intel_iommu_show_ecap(struct device *dev,
4626 struct device_attribute *attr,
4629 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4630 return sprintf(buf, "%llx\n", iommu->ecap);
4632 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4634 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4635 struct device_attribute *attr,
4638 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4639 return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4641 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4643 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4644 struct device_attribute *attr,
4647 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
4648 return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4649 cap_ndoms(iommu->cap)));
4651 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4653 static struct attribute *intel_iommu_attrs[] = {
4654 &dev_attr_version.attr,
4655 &dev_attr_address.attr,
4657 &dev_attr_ecap.attr,
4658 &dev_attr_domains_supported.attr,
4659 &dev_attr_domains_used.attr,
4663 static struct attribute_group intel_iommu_group = {
4664 .name = "intel-iommu",
4665 .attrs = intel_iommu_attrs,
4668 const struct attribute_group *intel_iommu_groups[] = {
4673 int __init intel_iommu_init(void)
4676 struct dmar_drhd_unit *drhd;
4677 struct intel_iommu *iommu;
4679 /* VT-d is required for a TXT/tboot launch, so enforce that */
4680 force_on = tboot_force_iommu();
4682 if (iommu_init_mempool()) {
4684 panic("tboot: Failed to initialize iommu memory\n");
4688 down_write(&dmar_global_lock);
4689 if (dmar_table_init()) {
4691 panic("tboot: Failed to initialize DMAR table\n");
4695 if (dmar_dev_scope_init() < 0) {
4697 panic("tboot: Failed to initialize DMAR device scope\n");
4701 up_write(&dmar_global_lock);
4704 * The bus notifier takes the dmar_global_lock, so lockdep will
4705 * complain later when we register it under the lock.
4707 dmar_register_bus_notifier();
4709 down_write(&dmar_global_lock);
4711 if (no_iommu || dmar_disabled) {
4713 * We exit the function here to ensure the IOMMU's remapping and
4714 * mempool aren't set up, which means that the IOMMU's PMRs
4715 * won't be disabled via the call to init_dmars(). So disable
4716 * them explicitly here. The PMRs were set up by tboot prior to
4717 * calling SENTER, but the kernel is expected to reset/tear them down.
4720 if (intel_iommu_tboot_noforce) {
4721 for_each_iommu(iommu, drhd)
4722 iommu_disable_protect_mem_regions(iommu);
4726 * Make sure the IOMMUs are switched off, even when we
4727 * boot into a kexec kernel and the previous kernel left them enabled
4730 intel_disable_iommus();
4734 if (list_empty(&dmar_rmrr_units))
4735 pr_info("No RMRR found\n");
4737 if (list_empty(&dmar_atsr_units))
4738 pr_info("No ATSR found\n");
4740 if (dmar_init_reserved_ranges()) {
4742 panic("tboot: Failed to reserve iommu ranges\n");
4743 goto out_free_reserved_range;
4746 init_no_remapping_devices();
4751 panic("tboot: Failed to initialize DMARs\n");
4752 pr_err("Initialization failed\n");
4753 goto out_free_reserved_range;
4755 up_write(&dmar_global_lock);
4756 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4758 #if defined(CONFIG_X86) && defined(CONFIG_SWIOTLB)
4761 dma_ops = &intel_dma_ops;
4763 init_iommu_pm_ops();
4765 for_each_active_iommu(iommu, drhd) {
4766 iommu_device_sysfs_add(&iommu->iommu, NULL,
4769 iommu_device_set_ops(&iommu->iommu, &intel_iommu_ops);
4770 iommu_device_register(&iommu->iommu);
4773 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4774 bus_register_notifier(&pci_bus_type, &device_nb);
4775 if (si_domain && !hw_pass_through)
4776 register_memory_notifier(&intel_iommu_memory_nb);
4777 cpuhp_setup_state(CPUHP_IOMMU_INTEL_DEAD, "iommu/intel:dead", NULL,
4778 intel_iommu_cpu_dead);
4779 intel_iommu_enabled = 1;
4780 intel_iommu_debugfs_init();
4784 out_free_reserved_range:
4785 put_iova_domain(&reserved_iova_list);
4787 intel_iommu_free_dmars();
4788 up_write(&dmar_global_lock);
4789 iommu_exit_mempool();
4793 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4795 struct intel_iommu *iommu = opaque;
4797 domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4802 * NB - intel-iommu lacks any sort of reference counting for the users of
4803 * dependent devices. If multiple endpoints have intersecting dependent
4804 * devices, unbinding the driver from any one of them will possibly leave
4805 * the others unable to operate.
4807 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4809 if (!iommu || !dev || !dev_is_pci(dev))
4812 pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4815 static void __dmar_remove_one_dev_info(struct device_domain_info *info)
4817 struct intel_iommu *iommu;
4818 unsigned long flags;
4820 assert_spin_locked(&device_domain_lock);
4825 iommu = info->iommu;
4828 iommu_disable_dev_iotlb(info);
4829 domain_context_clear(iommu, info->dev);
4830 intel_pasid_free_table(info->dev);
4833 unlink_domain_info(info);
4835 spin_lock_irqsave(&iommu->lock, flags);
4836 domain_detach_iommu(info->domain, iommu);
4837 spin_unlock_irqrestore(&iommu->lock, flags);
4839 free_devinfo_mem(info);
4842 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
4845 struct device_domain_info *info;
4846 unsigned long flags;
4848 spin_lock_irqsave(&device_domain_lock, flags);
4849 info = dev->archdata.iommu;
4850 __dmar_remove_one_dev_info(info);
4851 spin_unlock_irqrestore(&device_domain_lock, flags);
4854 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4858 init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN);
4859 domain_reserve_special_ranges(domain);
4861 /* calculate AGAW */
4862 domain->gaw = guest_width;
4863 adjust_width = guestwidth_to_adjustwidth(guest_width);
4864 domain->agaw = width_to_agaw(adjust_width);
4866 domain->iommu_coherency = 0;
4867 domain->iommu_snooping = 0;
4868 domain->iommu_superpage = 0;
4869 domain->max_addr = 0;
4871 /* always allocate the top pgd */
4872 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4875 domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4879 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4881 struct dmar_domain *dmar_domain;
4882 struct iommu_domain *domain;
4884 if (type != IOMMU_DOMAIN_UNMANAGED)
4887 dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4889 pr_err("Can't allocate dmar_domain\n");
4892 if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4893 pr_err("Domain initialization failed\n");
4894 domain_exit(dmar_domain);
4897 domain_update_iommu_cap(dmar_domain);
4899 domain = &dmar_domain->domain;
4900 domain->geometry.aperture_start = 0;
4901 domain->geometry.aperture_end = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4902 domain->geometry.force_aperture = true;
4907 static void intel_iommu_domain_free(struct iommu_domain *domain)
4909 domain_exit(to_dmar_domain(domain));
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement. Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			if (!domain_type_is_vm_or_si(old_domain) &&
			    list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
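/*
 * Illustrative note: if the domain was set up with a deeper page table than
 * this IOMMU can walk (e.g. domain agaw 3 / 5 levels vs. iommu agaw 2 / 4
 * levels), the loop above discards one top level per iteration; the width
 * check before it guarantees nothing mapped lives above the retained levels.
 */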
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct page *freelist = NULL;
	unsigned long start_pfn, last_pfn;
	unsigned int npages;
	int iommu_id, level = 0;

	/* Cope with horrid API which requires us to unmap more than the
	   size argument if it happens to be a large-page mapping. */
	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));

	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
		size = VTD_PAGE_SIZE << level_to_offset_bits(level);

	start_pfn = iova >> VTD_PAGE_SHIFT;
	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;

	freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);

	npages = last_pfn - start_pfn + 1;

	for_each_domain_iommu(iommu_id, dmar_domain)
		iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
				      start_pfn, npages, !freelist, 0);

	dma_free_pagelist(freelist);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return size;
}
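/*
 * Worked example: if the requested IOVA falls inside a 2MiB superpage, the
 * PTE lookup above reports level 2, level_to_offset_bits(2) is 9, and the
 * size is rounded up to VTD_PAGE_SIZE << 9 == 2MiB, so the whole superpage
 * is torn down and flushed even though the caller asked for less.
 */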
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    dma_addr_t iova)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct dma_pte *pte;
	int level = 0;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
static bool intel_iommu_capable(enum iommu_cap cap)
{
	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return domain_update_iommu_snooping(NULL) == 1;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return irq_remapping_enabled == 1;

	return false;
}
static int intel_iommu_add_device(struct device *dev)
{
	struct intel_iommu *iommu;
	struct iommu_group *group;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	iommu_device_link(&iommu->iommu, dev);

	group = iommu_group_get_for_dev(dev);
	if (IS_ERR(group))
		return PTR_ERR(group);

	iommu_group_put(group);
	return 0;
}
static void intel_iommu_remove_device(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return;

	iommu_group_remove_device(dev);

	iommu_device_unlink(&iommu->iommu, dev);
}
static void intel_iommu_get_resv_regions(struct device *device,
					 struct list_head *head)
{
	struct iommu_resv_region *reg;
	struct dmar_rmrr_unit *rmrr;
	struct device *i_dev;
	int i;

	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != device)
				continue;

			list_add_tail(&rmrr->resv->list, head);
		}
	}
	rcu_read_unlock();

	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
				      0, IOMMU_RESV_MSI);
	if (!reg)
		return;
	list_add_tail(&reg->list, head);
}
static void intel_iommu_put_resv_regions(struct device *dev,
					 struct list_head *head)
{
	struct iommu_resv_region *entry, *next;

	list_for_each_entry_safe(entry, next, head, list) {
		if (entry->type == IOMMU_RESV_RESERVED)
			kfree(entry);
	}
}
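/*
 * Illustrative sketch (not built here; 'dev' is a placeholder): consumers
 * reach the two callbacks above through the generic IOMMU API rather than
 * calling them directly, e.g.:
 *
 *	LIST_HEAD(resv_regions);
 *	struct iommu_resv_region *region;
 *
 *	iommu_get_resv_regions(dev, &resv_regions);
 *	list_for_each_entry(region, &resv_regions, list)
 *		dev_info(dev, "resv start %pa size %zu type %d\n",
 *			 &region->start, region->length, region->type);
 *	iommu_put_resv_regions(dev, &resv_regions);
 */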
#ifdef CONFIG_INTEL_IOMMU_SVM
#define MAX_NR_PASID_BITS (20)
static inline unsigned long intel_iommu_get_pts(struct device *dev)
{
	int pts, max_pasid;

	max_pasid = intel_pasid_get_dev_max_id(dev);
	pts = find_first_bit((unsigned long *)&max_pasid, MAX_NR_PASID_BITS);
	if (pts < 5)
		return 0;

	return pts - 5;
}
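/*
 * Worked example (assuming the context-entry PTS field encodes a PASID
 * table of 2^(PTS + 5) entries): a device whose maximum PASID count is
 * 2^15 has bit 15 as the lowest set bit, so pts is 15 and the helper
 * returns 10, i.e. a 32768-entry table.
 */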
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		if (iommu->pasid_state_table)
			context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(info->pasid_table->table) |
			intel_iommu_get_pts(sdev->dev);

		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set, which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
{
	struct intel_iommu *iommu;
	u8 bus, devfn;

	if (iommu_dummy(dev)) {
		dev_warn(dev,
			 "No IOMMU translation for device; cannot enable SVM\n");
		return NULL;
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu) {
		dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
		return NULL;
	}

	return iommu;
}
#endif /* CONFIG_INTEL_IOMMU_SVM */
const struct iommu_ops intel_iommu_ops = {
	.capable		= intel_iommu_capable,
	.domain_alloc		= intel_iommu_domain_alloc,
	.domain_free		= intel_iommu_domain_free,
	.attach_dev		= intel_iommu_attach_device,
	.detach_dev		= intel_iommu_detach_device,
	.map			= intel_iommu_map,
	.unmap			= intel_iommu_unmap,
	.iova_to_phys		= intel_iommu_iova_to_phys,
	.add_device		= intel_iommu_add_device,
	.remove_device		= intel_iommu_remove_device,
	.get_resv_regions	= intel_iommu_get_resv_regions,
	.put_resv_regions	= intel_iommu_put_resv_regions,
	.device_group		= pci_device_group,
	.pgsize_bitmap		= INTEL_IOMMU_PGSIZES,
};
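/*
 * Illustrative sketch (never compiled here): kernel users do not call the
 * intel_iommu_* callbacks directly; they go through the generic IOMMU API,
 * which dispatches through the ops table above.  The hypothetical helper
 * below assumes 'dev' is a device already handled by this driver.
 */
#if 0
static int example_map_one_page(struct device *dev, unsigned long iova,
				phys_addr_t paddr)
{
	struct iommu_domain *dom;
	int ret;

	dom = iommu_domain_alloc(dev->bus);		/* intel_iommu_domain_alloc() */
	if (!dom)
		return -ENOMEM;

	ret = iommu_attach_device(dom, dev);		/* intel_iommu_attach_device() */
	if (ret)
		goto out_free;

	ret = iommu_map(dom, iova, paddr, PAGE_SIZE,	/* intel_iommu_map() */
			IOMMU_READ | IOMMU_WRITE);
	if (!ret)
		iommu_unmap(dom, iova, PAGE_SIZE);	/* intel_iommu_unmap() */

	iommu_detach_device(dom, dev);			/* intel_iommu_detach_device() */
out_free:
	iommu_domain_free(dom);				/* intel_iommu_domain_free() */
	return ret;
}
#endif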
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		pr_info("Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that. We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",