Merge git://git.infradead.org/iommu-2.6
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jun 2011 20:48:50 +0000 (05:48 +0900)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 1 Jun 2011 20:48:50 +0000 (05:48 +0900)
* git://git.infradead.org/iommu-2.6:
  intel-iommu: Fix off-by-one in RMRR setup
  intel-iommu: Add domain check in domain_remove_one_dev_info
  intel-iommu: Remove Host Bridge devices from identity mapping
  intel-iommu: Use coherent DMA mask when requested
  intel-iommu: Dont cache iova above 32bit
  intel-iommu: Speed up processing of the identity_mapping function
  intel-iommu: Check for identity mapping candidate using system dma mask
  intel-iommu: Only unlink device domains from iommu
  intel-iommu: Enable super page (2MiB, 1GiB, etc.) support
  intel-iommu: Flush unmaps at domain_exit
  intel-iommu: Remove obsolete comment from detect_intel_iommu
  intel-iommu: fix VT-d PMR disable for TXT on S3 resume

Documentation/kernel-parameters.txt
drivers/pci/intel-iommu.c

index 5438a2d7907f564b05644a93931f081de19844d3,d005487c1a22ca571aff8852fb0d042a29e058e9..d9a203b058f18cf4c99fb98a298c373ed7b162ca
@@@ -245,7 -245,7 +245,7 @@@ bytes respectively. Such letter suffixe
  
        acpi_sleep=     [HW,ACPI] Sleep options
                        Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig,
 -                                old_ordering, s4_nonvs, sci_force_enable }
 +                                old_ordering, nonvs, sci_force_enable }
                        See Documentation/power/video.txt for information on
                        s3_bios and s3_mode.
                        s3_beep is for debugging; it makes the PC's speaker beep
                        With this option on every unmap_single operation will
                        result in a hardware IOTLB flush operation as opposed
                        to batching them for performance.
+               sp_off [Default Off]
+                       By default, super page will be supported if Intel IOMMU
+                       has the capability. With this option, super page will
+                       not be supported.
        intremap=       [X86-64, Intel-IOMMU]
                        Format: { on (default) | off | nosid }
                        on      enable Interrupt Remapping (default)
                        noexec=on: enable non-executable mappings (default)
                        noexec=off: disable non-executable mappings
  
 +      nosmep          [X86]
 +                      Disable SMEP (Supervisor Mode Execution Protection)
 +                      even if it is supported by processor.
 +
        noexec32        [X86-64]
                        This affects only 32-bit executables.
                        noexec32=on: enable non-executable mappings (default)
  
        nosoftlockup    [KNL] Disable the soft-lockup detector.
  
 -      noswapaccount   [KNL] Disable accounting of swap in memory resource
 -                      controller. (See Documentation/cgroups/memory.txt)
 -
        nosync          [HW,M68K] Disables sync negotiation for all devices.
  
        notsc           [BUGS=X86-32] Disable Time Stamp Counter
                                        bytes of sense data);
                                c = FIX_CAPACITY (decrease the reported
                                        device capacity by one sector);
 +                              d = NO_READ_DISC_INFO (don't use
 +                                      READ_DISC_INFO command);
 +                              e = NO_READ_CAPACITY_16 (don't use
 +                                      READ_CAPACITY_16 command);
                                h = CAPACITY_HEURISTICS (decrease the
                                        reported device capacity by one
                                        sector if the number is odd);
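
The sp_off flag documented above is one of the comma-separated options of the existing intel_iommu= boot parameter (parsed by intel_iommu_setup() in the driver changes below). A minimal usage sketch, assuming a kernel built with Intel IOMMU (DMAR) support; the flags other than sp_off are pre-existing options included only for illustration:

      intel_iommu=on,sp_off          # enable VT-d, but never use 2MiB/1GiB superpages
      intel_iommu=on,strict,sp_off   # additionally flush the IOTLB on every unmap

With sp_off set, domain_update_iommu_superpage() forces iommu_superpage to 0, so every mapping falls back to 4KiB pages regardless of hardware capability.
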
index 6af6b628175b459f03c2f0969f9b2be6d65dba48,a8867bd745e23911d6a0be68741cd0e040b25e07..59f17acf7f68b6d3ec6b4a4798a937f299bef8a6
@@@ -39,7 -39,6 +39,7 @@@
  #include <linux/syscore_ops.h>
  #include <linux/tboot.h>
  #include <linux/dmi.h>
 +#include <linux/pci-ats.h>
  #include <asm/cacheflush.h>
  #include <asm/iommu.h>
  #include "pci.h"
@@@ -47,6 -46,8 +47,8 @@@
  #define ROOT_SIZE             VTD_PAGE_SIZE
  #define CONTEXT_SIZE          VTD_PAGE_SIZE
  
+ #define IS_BRIDGE_HOST_DEVICE(pdev) \
+                           ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
  #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
  #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
  #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
@@@ -116,6 -117,11 +118,11 @@@ static inline unsigned long align_to_le
        return (pfn + level_size(level) - 1) & level_mask(level);
  }
  
+ static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
+ {
+       return  1 << ((lvl - 1) * LEVEL_STRIDE);
+ }
  /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
     are never going to work. */
  static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
@@@ -142,6 -148,12 +149,12 @@@ static struct intel_iommu **g_iommus
  static void __init check_tylersburg_isoch(void);
  static int rwbf_quirk;
  
+ /*
+  * set to 1 to panic kernel if can't successfully enable VT-d
+  * (used when kernel is launched w/ TXT)
+  */
+ static int force_on = 0;
  /*
   * 0: Present
   * 1-11: Reserved
@@@ -338,6 -350,9 +351,9 @@@ struct dmar_domain 
        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
+       int             iommu_superpage;/* Level of superpages supported:
+                                          0 == 4KiB (no superpages), 1 == 2MiB,
+                                          2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
  };
@@@ -387,6 -402,7 +403,7 @@@ int dmar_disabled = 1
  static int dmar_map_gfx = 1;
  static int dmar_forcedac;
  static int intel_iommu_strict;
+ static int intel_iommu_superpage = 1;
  
  #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
  static DEFINE_SPINLOCK(device_domain_lock);
@@@ -417,6 -433,10 +434,10 @@@ static int __init intel_iommu_setup(cha
                        printk(KERN_INFO
                                "Intel-IOMMU: disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
+               } else if (!strncmp(str, "sp_off", 6)) {
+                       printk(KERN_INFO
+                               "Intel-IOMMU: disable supported super page\n");
+                       intel_iommu_superpage = 0;
                }
  
                str += strcspn(str, ",");
@@@ -555,11 -575,32 +576,32 @@@ static void domain_update_iommu_snoopin
        }
  }
  
+ static void domain_update_iommu_superpage(struct dmar_domain *domain)
+ {
+       int i, mask = 0xf;
+       if (!intel_iommu_superpage) {
+               domain->iommu_superpage = 0;
+               return;
+       }
+       domain->iommu_superpage = 4; /* 1TiB */
+       for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
+               mask |= cap_super_page_val(g_iommus[i]->cap);
+               if (!mask) {
+                       break;
+               }
+       }
+       domain->iommu_superpage = fls(mask);
+ }
  /* Some capabilities may be different across iommus */
  static void domain_update_iommu_cap(struct dmar_domain *domain)
  {
        domain_update_iommu_coherency(domain);
        domain_update_iommu_snooping(domain);
+       domain_update_iommu_superpage(domain);
  }
  
  static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
@@@ -689,23 -730,31 +731,31 @@@ out
  }
  
  static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-                                     unsigned long pfn)
+                                     unsigned long pfn, int large_level)
  {
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
-       int offset;
+       int offset, target_level;
  
        BUG_ON(!domain->pgd);
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;
  
+       /* Search pte */
+       if (!large_level)
+               target_level = 1;
+       else
+               target_level = large_level;
        while (level > 0) {
                void *tmp_page;
  
                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
-               if (level == 1)
+               if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE))
+                       break;
+               if (level == target_level)
                        break;
  
                if (!dma_pte_present(pte)) {
        return pte;
  }
  
  /* return address's pte at specific level */
  static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
-                                        int level)
+                                        int level, int *large_page)
  {
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
                if (level == total)
                        return pte;
  
-               if (!dma_pte_present(pte))
+               if (!dma_pte_present(pte)) {
+                       *large_page = total;
                        break;
+               }
+               if (pte->val & DMA_PTE_LARGE_PAGE) {
+                       *large_page = total;
+                       return pte;
+               }
                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
@@@ -763,6 -821,7 +822,7 @@@ static void dma_pte_clear_range(struct 
                                unsigned long last_pfn)
  {
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
+       unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;
  
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
  
        /* we don't need lock here; nobody else touches the iova range */
        do {
-               first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
+               large_page = 1;
+               first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
-                       start_pfn = align_to_level(start_pfn + 1, 2);
+                       start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
-               do { 
+               do {
                        dma_clear_pte(pte);
-                       start_pfn++;
+                       start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
  
@@@ -798,6 -858,7 +859,7 @@@ static void dma_pte_free_pagetable(stru
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;
+       int large_page = 2;
  
        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
                        return;
  
                do {
-                       first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
+                       large_page = level;
+                       first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
+                       if (large_page > level)
+                               level = large_page + 1;
                        if (!pte) {
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
@@@ -1397,6 -1461,7 +1462,7 @@@ static int domain_init(struct dmar_doma
        else
                domain->iommu_snooping = 0;
  
+       domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
        domain->iommu_count = 1;
        domain->nid = iommu->node;
  
@@@ -1417,6 -1482,10 +1483,10 @@@ static void domain_exit(struct dmar_dom
        if (!domain)
                return;
  
+       /* Flush any lazy unmaps that may reference this domain */
+       if (!intel_iommu_strict)
+               flush_unmaps_timeout(0);
        domain_remove_dev_info(domain);
        /* destroy iovas */
        put_iova_domain(&domain->iovad);
@@@ -1648,6 -1717,34 +1718,34 @@@ static inline unsigned long aligned_nrp
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
  }
  
+ /* Return largest possible superpage level for a given mapping */
+ static inline int hardware_largepage_caps(struct dmar_domain *domain,
+                                         unsigned long iov_pfn,
+                                         unsigned long phy_pfn,
+                                         unsigned long pages)
+ {
+       int support, level = 1;
+       unsigned long pfnmerge;
+       support = domain->iommu_superpage;
+       /* To use a large page, the virtual *and* physical addresses
+          must be aligned to 2MiB/1GiB/etc. Lower bits set in either
+          of them will mean we have to use smaller pages. So just
+          merge them and check both at once. */
+       pfnmerge = iov_pfn | phy_pfn;
+       while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
+               pages >>= VTD_STRIDE_SHIFT;
+               if (!pages)
+                       break;
+               pfnmerge >>= VTD_STRIDE_SHIFT;
+               level++;
+               support--;
+       }
+       return level;
+ }
  static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
                            struct scatterlist *sg, unsigned long phys_pfn,
                            unsigned long nr_pages, int prot)
        phys_addr_t uninitialized_var(pteval);
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        unsigned long sg_res;
+       unsigned int largepage_lvl = 0;
+       unsigned long lvl_pages = 0;
  
        BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
  
                pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
        }
  
-       while (nr_pages--) {
+       while (nr_pages > 0) {
                uint64_t tmp;
  
                if (!sg_res) {
                        sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
                        sg->dma_length = sg->length;
                        pteval = page_to_phys(sg_page(sg)) | prot;
+                       phys_pfn = pteval >> VTD_PAGE_SHIFT;
                }
                if (!pte) {
-                       first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
+                       largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
+                       first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
                        if (!pte)
                                return -ENOMEM;
+                       /* It is large page*/
+                       if (largepage_lvl > 1)
+                               pteval |= DMA_PTE_LARGE_PAGE;
+                       else
+                               pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
                }
                /* We don't need lock here, nobody else
                 * touches the iova range
                        }
                        WARN_ON(1);
                }
+               lvl_pages = lvl_to_nr_pages(largepage_lvl);
+               BUG_ON(nr_pages < lvl_pages);
+               BUG_ON(sg_res < lvl_pages);
+               nr_pages -= lvl_pages;
+               iov_pfn += lvl_pages;
+               phys_pfn += lvl_pages;
+               pteval += lvl_pages * VTD_PAGE_SIZE;
+               sg_res -= lvl_pages;
+               /* If the next PTE would be the first in a new page, then we
+                  need to flush the cache on the entries we've just written.
+                  And then we'll need to recalculate 'pte', so clear it and
+                  let it get set again in the if (!pte) block above.
+                  If we're done (!nr_pages) we need to flush the cache too.
+                  Also if we've been setting superpages, we may need to
+                  recalculate 'pte' and switch back to smaller pages for the
+                  end of the mapping, if the trailing size is not enough to
+                  use another superpage (i.e. sg_res < lvl_pages). */
                pte++;
-               if (!nr_pages || first_pte_in_page(pte)) {
+               if (!nr_pages || first_pte_in_page(pte) ||
+                   (largepage_lvl > 1 && sg_res < lvl_pages)) {
                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);
                        pte = NULL;
                }
-               iov_pfn++;
-               pteval += VTD_PAGE_SIZE;
-               sg_res--;
-               if (!sg_res)
+               if (!sg_res && nr_pages)
                        sg = sg_next(sg);
        }
        return 0;
@@@ -2016,7 -2147,7 +2148,7 @@@ static inline int iommu_prepare_rmrr_de
        if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
                return 0;
        return iommu_prepare_identity_map(pdev, rmrr->base_address,
-               rmrr->end_address + 1);
+               rmrr->end_address);
  }
  
  #ifdef CONFIG_DMAR_FLOPPY_WA
@@@ -2030,7 -2161,7 +2162,7 @@@ static inline void iommu_prepare_isa(vo
                return;
  
        printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
-       ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
+       ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
  
        if (ret)
                printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
@@@ -2106,10 -2237,10 +2238,10 @@@ static int identity_mapping(struct pci_
        if (likely(!iommu_identity_mapping))
                return 0;
  
+       info = pdev->dev.archdata.iommu;
+       if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
+               return (info->domain == si_domain);
  
-       list_for_each_entry(info, &si_domain->devices, link)
-               if (info->dev == pdev)
-                       return 1;
        return 0;
  }
  
@@@ -2187,8 -2318,19 +2319,19 @@@ static int iommu_should_identity_map(st
         * Assume that they will -- if they turn out not to be, then we can 
         * take them out of the 1:1 domain later.
         */
-       if (!startup)
-               return pdev->dma_mask > DMA_BIT_MASK(32);
+       if (!startup) {
+               /*
+                * If the device's dma_mask is less than the system's memory
+                * size then this is not a candidate for identity mapping.
+                */
+               u64 dma_mask = pdev->dma_mask;
+               if (pdev->dev.coherent_dma_mask &&
+                   pdev->dev.coherent_dma_mask < dma_mask)
+                       dma_mask = pdev->dev.coherent_dma_mask;
+               return dma_mask >= dma_get_required_mask(&pdev->dev);
+       }
  
        return 1;
  }
@@@ -2203,6 -2345,9 +2346,9 @@@ static int __init iommu_prepare_static_
                return -EFAULT;
  
        for_each_pci_dev(pdev) {
+               /* Skip Host/PCI Bridge devices */
+               if (IS_BRIDGE_HOST_DEVICE(pdev))
+                       continue;
                if (iommu_should_identity_map(pdev, 1)) {
                        printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
                               hw ? "hardware" : "software", pci_name(pdev));
        return 0;
  }
  
- static int __init init_dmars(int force_on)
+ static int __init init_dmars(void)
  {
        struct dmar_drhd_unit *drhd;
        struct dmar_rmrr_unit *rmrr;
@@@ -2592,8 -2737,7 +2738,7 @@@ static dma_addr_t __intel_map_single(st
        iommu = domain_get_iommu(domain);
        size = aligned_nrpages(paddr, size);
  
-       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
-                               pdev->dma_mask);
+       iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
        if (!iova)
                goto error;
  
@@@ -3118,7 -3262,17 +3263,17 @@@ static int init_iommu_hw(void
                if (iommu->qi)
                        dmar_reenable_qi(iommu);
  
-       for_each_active_iommu(iommu, drhd) {
+       for_each_iommu(iommu, drhd) {
+               if (drhd->ignored) {
+                       /*
+                        * we always have to disable PMRs or DMA may fail on
+                        * this device
+                        */
+                       if (force_on)
+                               iommu_disable_protect_mem_regions(iommu);
+                       continue;
+               }
+       
                iommu_flush_write_buffer(iommu);
  
                iommu_set_root_entry(iommu);
                                           DMA_CCMD_GLOBAL_INVL);
                iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
-               iommu_enable_translation(iommu);
+               if (iommu_enable_translation(iommu))
+                       return 1;
                iommu_disable_protect_mem_regions(iommu);
        }
  
@@@ -3194,7 -3349,10 +3350,10 @@@ static void iommu_resume(void
        unsigned long flag;
  
        if (init_iommu_hw()) {
-               WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
+               if (force_on)
+                       panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
+               else
+                       WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
                return;
        }
  
@@@ -3271,7 -3429,6 +3430,6 @@@ static struct notifier_block device_nb 
  int __init intel_iommu_init(void)
  {
        int ret = 0;
-       int force_on = 0;
  
        /* VT-d is required for a TXT/tboot launch, so enforce that */
        force_on = tboot_force_iommu();
  
        init_no_remapping_devices();
  
-       ret = init_dmars(force_on);
+       ret = init_dmars();
        if (ret) {
                if (force_on)
                        panic("tboot: Failed to initialize DMARs\n");
@@@ -3380,8 -3537,8 +3538,8 @@@ static void domain_remove_one_dev_info(
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_safe(entry, tmp, &domain->devices) {
                info = list_entry(entry, struct device_domain_info, link);
-               /* No need to compare PCI domain; it has to be the same */
-               if (info->bus == pdev->bus->number &&
+               if (info->segment == pci_domain_nr(pdev->bus) &&
+                   info->bus == pdev->bus->number &&
                    info->devfn == pdev->devfn) {
                        list_del(&info->link);
                        list_del(&info->global);
                domain_update_iommu_cap(domain);
                spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
  
-               spin_lock_irqsave(&iommu->lock, tmp_flags);
-               clear_bit(domain->id, iommu->domain_ids);
-               iommu->domains[domain->id] = NULL;
-               spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+               if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
+                   !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
+                       spin_lock_irqsave(&iommu->lock, tmp_flags);
+                       clear_bit(domain->id, iommu->domain_ids);
+                       iommu->domains[domain->id] = NULL;
+                       spin_unlock_irqrestore(&iommu->lock, tmp_flags);
+               }
        }
  
        spin_unlock_irqrestore(&device_domain_lock, flags);
@@@ -3505,6 -3665,7 +3666,7 @@@ static int md_domain_init(struct dmar_d
        domain->iommu_count = 0;
        domain->iommu_coherency = 0;
        domain->iommu_snooping = 0;
+       domain->iommu_superpage = 0;
        domain->max_addr = 0;
        domain->nid = -1;
  
@@@ -3720,7 -3881,7 +3882,7 @@@ static phys_addr_t intel_iommu_iova_to_
        struct dma_pte *pte;
        u64 phys = 0;
  
-       pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
+       pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
        if (pte)
                phys = dma_pte_addr(pte);
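
The superpage support above rests on a fixed level-to-size relationship: each page-table level covers 2^LEVEL_STRIDE (512) times as much as the level below it, so a level-1 PTE maps 4KiB, level 2 maps 2MiB, level 3 maps 1GiB, and so on; lvl_to_nr_pages() and hardware_largepage_caps() encode exactly that. Below is a self-contained sketch of the same arithmetic; the constants are assumptions that mirror the driver's 9-bit stride and 4KiB base page rather than values taken from the kernel headers, and pick_level() is an illustrative stand-in for hardware_largepage_caps(), not the driver's function.

#include <stdio.h>

#define LEVEL_STRIDE      9                            /* assumed: 512 entries per table level */
#define VTD_STRIDE_SHIFT  9
#define VTD_STRIDE_MASK   (~0UL << VTD_STRIDE_SHIFT)   /* bits above one stride */
#define VTD_PAGE_SHIFT    12                           /* 4KiB base page */

/* 4KiB pages covered by a single PTE at a given level:
 * level 1 -> 1 (4KiB), level 2 -> 512 (2MiB), level 3 -> 262144 (1GiB). */
static unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << ((lvl - 1) * LEVEL_STRIDE);
}

/* Same loop shape as hardware_largepage_caps(): keep raising the level while
 * both pfns stay aligned to the next stride, enough pages remain to fill one
 * entry at that level, and the hardware reports another supported level. */
static int pick_level(unsigned long iov_pfn, unsigned long phys_pfn,
		      unsigned long pages, int support)
{
	int level = 1;
	unsigned long pfnmerge = iov_pfn | phys_pfn;  /* low bits set in either pfn break alignment */

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}

int main(void)
{
	/* A 2MiB-aligned, 2MiB-long run on hardware advertising one superpage
	 * level resolves to level 2, i.e. a single 2MiB PTE. */
	int lvl = pick_level(0x200, 0x200, 512, 1);

	printf("level %d covers %lu KiB per PTE\n",
	       lvl, lvl_to_nr_pages(lvl) << (VTD_PAGE_SHIFT - 10));
	return 0;
}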