KVM: Use vcpu-specific gva->hva translation when querying host page size

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f92b40d798cab7d4b9f4a451621319e687e29df..e4458c9aec8c8316ea1b565572e434ad444b007a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte)
  * requires a full MMU zap).  The flag is instead explicitly queried when
  * checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_MASK             GENMASK_ULL(18, 0)
+#define MMIO_SPTE_GEN_MASK             GENMASK_ULL(17, 0)
 
 #define MMIO_SPTE_GEN_LOW_START                3
 #define MMIO_SPTE_GEN_LOW_END          11
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 
-#define MMIO_SPTE_GEN_HIGH_START       52
-#define MMIO_SPTE_GEN_HIGH_END         61
+#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_END         62
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+
 static u64 generation_mmio_spte_mask(u64 gen)
 {
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 spte)
 {
        u64 gen;
 
-       spte &= ~shadow_mmio_mask;
-
        gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
        gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
        return gen;
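
For reference, the generation value is scattered into spare SPTE bits purely with shift-and-mask arithmetic. A minimal standalone sketch of that pattern, using a local GENMASK_ULL and the low range 11:3 from the definitions above (the GEN_LOW_* names and the sample value are made up; the high half works the same way from MMIO_SPTE_GEN_HIGH_START):

    /* Illustration only: pack/extract one field with a GENMASK-style mask,
     * the same way generation_mmio_spte_mask() handles the low half. */
    #include <stdio.h>
    #include <stdint.h>

    #define GENMASK_ULL(h, l)  (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

    #define GEN_LOW_START 3
    #define GEN_LOW_END   11
    #define GEN_LOW_MASK  GENMASK_ULL(GEN_LOW_END, GEN_LOW_START)

    int main(void)
    {
            uint64_t gen = 0x155;   /* 9-bit sample value */
            uint64_t spte = 0;

            /* pack: shift into position, mask off anything that doesn't fit */
            spte |= (gen << GEN_LOW_START) & GEN_LOW_MASK;

            /* extract: mask, then shift back down */
            uint64_t low = (spte & GEN_LOW_MASK) >> GEN_LOW_START;

            printf("mask=%#llx packed=%#llx low=%#llx\n",
                   (unsigned long long)GEN_LOW_MASK,
                   (unsigned long long)spte,
                   (unsigned long long)low);
            return 0;
    }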
@@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 static u8 kvm_get_shadow_phys_bits(void)
 {
        /*
-        * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
-        * in CPU detection code, but MKTME treats those reduced bits as
-        * 'keyID' thus they are not reserved bits. Therefore for MKTME
-        * we should still return physical address bits reported by CPUID.
+        * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+        * in CPU detection code, but the processor treats those reduced bits as
+        * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+        * the physical address bits reported by CPUID.
         */
-       if (!boot_cpu_has(X86_FEATURE_TME) ||
-           WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
-               return boot_cpu_data.x86_phys_bits;
+       if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+               return cpuid_eax(0x80000008) & 0xff;
 
-       return cpuid_eax(0x80000008) & 0xff;
+       /*
+        * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+        * custom CPUID.  Proceed with whatever the kernel found since these features
+        * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+        */
+       return boot_cpu_data.x86_phys_bits;
 }
 
 static void kvm_mmu_reset_all_pte_masks(void)
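
The fallback order above is: trust CPUID leaf 0x80000008 when it exists, otherwise use what the kernel already recorded in boot_cpu_data. The CPUID side can be reproduced from userspace; a minimal sketch with the compiler-provided <cpuid.h> helper (illustration only, not the kernel path):

    /* Read MAXPHYADDR the same way kvm_get_shadow_phys_bits() does:
     * CPUID leaf 0x80000008, EAX bits 7:0. Requires GCC or Clang. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* __get_cpuid() returns 0 if the leaf is not supported, which
             * mirrors the extended_cpuid_level >= 0x80000008 check above. */
            if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                    printf("MAXPHYADDR: %u bits\n", eax & 0xff);
            else
                    printf("CPUID 0x80000008 not reported by this CPU\n");
            return 0;
    }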
@@ -1282,12 +1286,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
        return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
 }
 
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        unsigned long page_size;
        int i, ret = 0;
 
-       page_size = kvm_host_page_size(kvm, gfn);
+       page_size = kvm_host_page_size(vcpu, gfn);
 
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
@@ -1324,31 +1328,42 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
-                        bool *force_pt_level)
+                        int *max_levelp)
 {
-       int host_level, level, max_level;
+       int host_level, max_level = *max_levelp;
        struct kvm_memory_slot *slot;
 
-       if (unlikely(*force_pt_level))
+       if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
                return PT_PAGE_TABLE_LEVEL;
 
        slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
-       *force_pt_level = !memslot_valid_for_gpte(slot, true);
-       if (unlikely(*force_pt_level))
+       if (!memslot_valid_for_gpte(slot, true)) {
+               *max_levelp = PT_PAGE_TABLE_LEVEL;
                return PT_PAGE_TABLE_LEVEL;
+       }
 
-       host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
-       if (host_level == PT_PAGE_TABLE_LEVEL)
-               return host_level;
+       max_level = min(max_level, kvm_x86_ops->get_lpage_level());
+       for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+               if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot))
+                       break;
+       }
 
-       max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
+       *max_levelp = max_level;
 
-       for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
-                       break;
+       if (max_level == PT_PAGE_TABLE_LEVEL)
+               return PT_PAGE_TABLE_LEVEL;
 
-       return level - 1;
+       /*
+        * Note, host_mapping_level() does *not* handle transparent huge pages.
+        * As suggested by "mapping", it reflects the page size established by
+        * the associated vma, if there is one, i.e. host_mapping_level() will
+        * return a huge page level if and only if a vma exists and the backing
+        * implementation for the vma uses huge pages, e.g. hugetlbfs and dax.
+        * So, do not propagate host_mapping_level() to max_level as KVM can
+        * still promote the guest mapping to a huge page in the THP case.
+        */
+       host_level = host_mapping_level(vcpu, large_gfn);
+       return min(host_level, max_level);
 }
 
 /*
@@ -1410,7 +1425,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
-               rmap_head->val = (unsigned long)desc->sptes[0];
+               rmap_head->val = 0;
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
@@ -1525,7 +1540,7 @@ struct rmap_iterator {
 /*
  * Iteration must be started by this function.  This should also be used after
  * removing/dropping sptes from the rmap link because in such cases the
- * information in the itererator may not be valid.
+ * information in the iterator may not be valid.
  *
  * Returns sptep if found, NULL otherwise.
  */
@@ -2899,6 +2914,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
        return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 }
 
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+       LIST_HEAD(invalid_list);
+
+       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+               return 0;
+
+       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+                       break;
+
+               ++vcpu->kvm->stat.mmu_recycled;
+       }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+       if (!kvm_mmu_available_pages(vcpu->kvm))
+               return -ENOSPC;
+       return 0;
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
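
make_mmu_pages_available() is a classic low/high watermark scheme: do nothing while free pages stay above KVM_MIN_FREE_MMU_PAGES, otherwise recycle old shadow pages until KVM_REFILL_PAGES are free. A simplified standalone model of that hysteresis, with made-up constants and plain counters standing in for the KVM accounting:

    /* Simplified model of the watermark logic; no KVM types involved. */
    #include <stdio.h>

    #define MIN_FREE_PAGES  5       /* stands in for KVM_MIN_FREE_MMU_PAGES */
    #define REFILL_PAGES    25      /* stands in for KVM_REFILL_PAGES */

    static unsigned int available = 2;      /* pretend we are under pressure */
    static unsigned int zappable  = 40;     /* shadow pages we could recycle */

    static int zap_oldest(void)
    {
            if (!zappable)
                    return 0;
            zappable--;
            available++;            /* freeing a shadow page frees a slot */
            return 1;
    }

    static int make_pages_available(void)
    {
            if (available >= MIN_FREE_PAGES)
                    return 0;
            while (available < REFILL_PAGES && zap_oldest())
                    ;
            return available ? 0 : -1;      /* -ENOSPC in the real code */
    }

    int main(void)
    {
            int ret = make_pages_available();

            printf("ret=%d available=%u\n", ret, available);
            return 0;
    }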
@@ -3294,6 +3329,35 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+                                       gfn_t gfn, kvm_pfn_t *pfnp,
+                                       int *levelp)
+{
+       kvm_pfn_t pfn = *pfnp;
+       int level = *levelp;
+
+       /*
+        * Check if it's a transparent hugepage. If this would be an
+        * hugetlbfs page, level wouldn't be set to
+        * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+        * here.
+        */
+       if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+           !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
+           kvm_is_transparent_hugepage(pfn)) {
+               unsigned long mask;
+
+               /*
+                * mmu_notifier_retry() was successful and mmu_lock is held, so
+                * the pmd can't be split from under us.
+                */
+               *levelp = level = PT_DIRECTORY_LEVEL;
+               mask = KVM_PAGES_PER_HPAGE(level) - 1;
+               VM_BUG_ON((gfn & mask) != (pfn & mask));
+               *pfnp = pfn & ~mask;
+       }
+}
+
 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
                                       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
 {
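
transparent_hugepage_adjust() relies on gfn and pfn sharing the same offset within the huge-page region, then rounds the pfn down to the huge-page head. The mask arithmetic in isolation, assuming 4KiB base pages and a 512-page (2MiB) huge page, with made-up frame numbers:

    /* Illustration of the mask arithmetic used above: for a 2MiB mapping,
     * gfn and pfn must agree in their low 9 bits, and the huge-page head
     * pfn is obtained by clearing those bits. */
    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    #define PAGES_PER_2M    512ULL          /* 2MiB / 4KiB */

    int main(void)
    {
            uint64_t gfn  = 0x12345;        /* guest frame number */
            uint64_t pfn  = 0xabd45;        /* host frame inside a THP */
            uint64_t mask = PAGES_PER_2M - 1;

            /* Same invariant as the VM_BUG_ON() above: guest and host frames
             * must have the same offset within the huge page. */
            assert((gfn & mask) == (pfn & mask));

            printf("head pfn = %#llx, offset = %#llx\n",
                   (unsigned long long)(pfn & ~mask),
                   (unsigned long long)(pfn & mask));
            return 0;
    }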
@@ -3318,8 +3382,9 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
-                       int map_writable, int level, kvm_pfn_t pfn,
-                       bool prefault, bool lpage_disallowed)
+                       int map_writable, int level, int max_level,
+                       kvm_pfn_t pfn, bool prefault,
+                       bool account_disallowed_nx_lpage)
 {
        struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
@@ -3327,9 +3392,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
        gfn_t gfn = gpa >> PAGE_SHIFT;
        gfn_t base_gfn = gfn;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+       if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
 
+       if (likely(max_level > PT_PAGE_TABLE_LEVEL))
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+
        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        for_each_shadow_entry(vcpu, gpa, it) {
                /*
@@ -3348,7 +3416,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                                              it.level - 1, true, ACC_ALL);
 
                        link_shadow_page(vcpu, it.sptep, sp);
-                       if (lpage_disallowed)
+                       if (account_disallowed_nx_lpage)
                                account_huge_nx_page(vcpu->kvm, sp);
                }
        }
@@ -3384,45 +3452,6 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
        return -EFAULT;
 }
 
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t gfn, kvm_pfn_t *pfnp,
-                                       int *levelp)
-{
-       kvm_pfn_t pfn = *pfnp;
-       int level = *levelp;
-
-       /*
-        * Check if it's a transparent hugepage. If this would be an
-        * hugetlbfs page, level wouldn't be set to
-        * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
-        * here.
-        */
-       if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-           !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
-           PageTransCompoundMap(pfn_to_page(pfn)) &&
-           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
-               unsigned long mask;
-               /*
-                * mmu_notifier_retry was successful and we hold the
-                * mmu_lock here, so the pmd can't become splitting
-                * from under us, and in turn
-                * __split_huge_page_refcount() can't run from under
-                * us and we can safely transfer the refcount from
-                * PG_tail to PG_head as we switch the pfn to tail to
-                * head.
-                */
-               *levelp = level = PT_DIRECTORY_LEVEL;
-               mask = KVM_PAGES_PER_HPAGE(level) - 1;
-               VM_BUG_ON((gfn & mask) != (pfn & mask));
-               if (pfn & mask) {
-                       kvm_release_pfn_clean(pfn);
-                       pfn &= ~mask;
-                       kvm_get_pfn(pfn);
-                       *pfnp = pfn;
-               }
-       }
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
 {
@@ -3528,7 +3557,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
  * - true: let the vcpu to access on the same address again.
  * - false: let the real page fault path to fix it.
  */
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
                            u32 error_code)
 {
        struct kvm_shadow_walk_iterator iterator;
@@ -3537,9 +3566,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        u64 spte = 0ull;
        uint retry_count = 0;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return false;
-
        if (!page_fault_can_be_fast(error_code))
                return false;
 
@@ -3548,7 +3574,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        do {
                u64 new_spte;
 
-               for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+               for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
                        if (!is_shadow_present_pte(spte) ||
                            iterator.level < level)
                                break;
@@ -3626,71 +3652,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 
        } while (true);
 
-       trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+       trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
                              spte, fault_handled);
        walk_shadow_page_lockless_end(vcpu);
 
        return fault_handled;
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
-                        gfn_t gfn, bool prefault)
-{
-       int r;
-       int level;
-       bool force_pt_level;
-       kvm_pfn_t pfn;
-       unsigned long mmu_seq;
-       bool map_writable, write = error_code & PFERR_WRITE_MASK;
-       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
-                               is_nx_huge_page_enabled();
-
-       force_pt_level = lpage_disallowed;
-       level = mapping_level(vcpu, gfn, &force_pt_level);
-       if (likely(!force_pt_level)) {
-               /*
-                * This path builds a PAE pagetable - so we can map
-                * 2mb pages at maximum. Therefore check if the level
-                * is larger than that.
-                */
-               if (level > PT_DIRECTORY_LEVEL)
-                       level = PT_DIRECTORY_LEVEL;
-
-               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-       }
-
-       if (fast_page_fault(vcpu, v, level, error_code))
-               return RET_PF_RETRY;
-
-       mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       smp_rmb();
-
-       if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
-               return RET_PF_RETRY;
-
-       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
-               return r;
-
-       r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
-               goto out_unlock;
-       if (make_mmu_pages_available(vcpu) < 0)
-               goto out_unlock;
-       if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, v, write, map_writable, level, pfn,
-                        prefault, false);
-out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_release_pfn_clean(pfn);
-       return r;
-}
-
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
                               struct list_head *invalid_list)
 {
@@ -3981,7 +3949,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                  u32 access, struct x86_exception *exception)
 {
        if (exception)
@@ -3989,7 +3957,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
        return vaddr;
 }
 
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                         u32 access,
                                         struct x86_exception *exception)
 {
@@ -4001,20 +3969,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 static bool
 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
 {
-       int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
+       int bit7 = (pte >> 7) & 1;
 
-       return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
-               ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
+       return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
 {
-       return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
-}
-
-static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
-{
-       return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+       return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
 }
 
 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
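
The split above leaves two independent lookups: a reserved-bits mask selected by PTE bit 7 and level, and a 64-entry bad_mt_xwr bitmap indexed by the PTE's low six bits. A self-contained sketch of that table-lookup style, with placeholder mask values rather than real paging/EPT rules:

    /* Illustration only: the lookup shape of __is_rsvd_bits_set() and
     * __is_bad_mt_xwr(), with made-up mask values. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n) (1ULL << (n))

    struct rsvd_check {
            uint64_t rsvd_bits_mask[2][5];  /* [pte bit 7][level - 1] */
            uint64_t bad_mt_xwr;            /* bitmap over pte bits 5:0 */
    };

    static bool is_rsvd_bits_set(const struct rsvd_check *rc, uint64_t pte, int level)
    {
            int bit7 = (pte >> 7) & 1;

            return pte & rc->rsvd_bits_mask[bit7][level - 1];
    }

    static bool is_bad_mt_xwr(const struct rsvd_check *rc, uint64_t pte)
    {
            return rc->bad_mt_xwr & BIT_ULL(pte & 0x3f);
    }

    int main(void)
    {
            struct rsvd_check rc = { 0 };
            uint64_t pte = 0x0010000000000002ULL;
            bool reserved;

            for (int i = 0; i < 5; i++) {
                    /* pretend bits 62:52 are reserved at every level */
                    rc.rsvd_bits_mask[0][i] = 0x7ff0000000000000ULL;
                    rc.rsvd_bits_mask[1][i] = 0x7ff0000000000000ULL;
            }
            /* pretend the low-bits combination 0x02 is an illegal memtype/XWR */
            rc.bad_mt_xwr = BIT_ULL(0x02);

            /* Bitwise-OR aggregation, as in walk_shadow_page_get_mmio_spte(),
             * evaluates both checks without a conditional branch. */
            reserved = is_bad_mt_xwr(&rc, pte) | is_rsvd_bits_set(&rc, pte, 1);
            printf("reserved/bad = %d\n", reserved);
            return 0;
    }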
@@ -4038,11 +4000,11 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
        struct kvm_shadow_walk_iterator iterator;
        u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+       struct rsvd_bits_validate *rsvd_check;
        int root, leaf;
        bool reserved = false;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               goto exit;
+       rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
 
        walk_shadow_page_lockless_begin(vcpu);
 
@@ -4058,8 +4020,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                if (!is_shadow_present_pte(spte))
                        break;
 
-               reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
-                                                   iterator.level);
+               /*
+                * Use a bitwise-OR instead of a logical-OR to aggregate the
+                * reserved bit and EPT's invalid memtype/XWR checks to avoid
+                * adding a Jcc in the loop.
+                */
+               reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
+                           __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
        }
 
        walk_shadow_page_lockless_end(vcpu);
@@ -4073,7 +4040,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                        root--;
                }
        }
-exit:
+
        *sptep = spte;
        return reserved;
 }
@@ -4137,9 +4104,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return;
-
        walk_shadow_page_lockless_begin(vcpu);
        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
                clear_sp_write_flooding_count(iterator.sptep);
@@ -4149,29 +4113,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        walk_shadow_page_lockless_end(vcpu);
 }
 
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                               u32 error_code, bool prefault)
-{
-       gfn_t gfn = gva >> PAGE_SHIFT;
-       int r;
-
-       pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
-
-       if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return RET_PF_EMULATE;
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
-
-       return nonpaging_map(vcpu, gva & PAGE_MASK,
-                            error_code, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                  gfn_t gfn)
 {
        struct kvm_arch_async_pf arch;
 
@@ -4180,11 +4123,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        arch.direct_map = vcpu->arch.mmu->direct_map;
        arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
 
-       return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+       return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+                                 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+                        bool *writable)
 {
        struct kvm_memory_slot *slot;
        bool async;
@@ -4204,12 +4149,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                return false; /* *pfn has correct page already */
 
        if (!prefault && kvm_can_do_async_pf(vcpu)) {
-               trace_kvm_try_async_get_page(gva, gfn);
+               trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
-                       trace_kvm_async_pf_doublefault(gva, gfn);
+                       trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
                        return true;
-               } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
                        return true;
        }
 
@@ -4217,11 +4162,81 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        return false;
 }
 
+static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+                            bool prefault, int max_level, bool is_tdp)
+{
+       bool write = error_code & PFERR_WRITE_MASK;
+       bool exec = error_code & PFERR_FETCH_MASK;
+       bool lpage_disallowed = exec && is_nx_huge_page_enabled();
+       bool map_writable;
+
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       unsigned long mmu_seq;
+       kvm_pfn_t pfn;
+       int level, r;
+
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return RET_PF_EMULATE;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       if (lpage_disallowed)
+               max_level = PT_PAGE_TABLE_LEVEL;
+
+       level = mapping_level(vcpu, gfn, &max_level);
+       if (level > PT_PAGE_TABLE_LEVEL)
+               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
+       if (fast_page_fault(vcpu, gpa, level, error_code))
+               return RET_PF_RETRY;
+
+       mmu_seq = vcpu->kvm->mmu_notifier_seq;
+       smp_rmb();
+
+       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+               return RET_PF_RETRY;
+
+       if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+               return r;
+
+       r = RET_PF_RETRY;
+       spin_lock(&vcpu->kvm->mmu_lock);
+       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+               goto out_unlock;
+       if (make_mmu_pages_available(vcpu) < 0)
+               goto out_unlock;
+       r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn,
+                        prefault, is_tdp && lpage_disallowed);
+
+out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+                               u32 error_code, bool prefault)
+{
+       pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+       /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
+       return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
+                                PT_DIRECTORY_LEVEL, false);
+}
+
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len)
 {
        int r = 1;
 
+#ifndef CONFIG_X86_64
+       /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+       if (WARN_ON_ONCE(fault_address >> 32))
+               return -EFAULT;
+#endif
+
        vcpu->arch.l1tf_flush_l1d = true;
        switch (vcpu->arch.apf.host_apf_reason) {
        default:
@@ -4249,76 +4264,23 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 }
 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 
-static bool
-check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
-{
-       int page_num = KVM_PAGES_PER_HPAGE(level);
-
-       gfn &= ~(page_num - 1);
-
-       return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                          bool prefault)
 {
-       kvm_pfn_t pfn;
-       int r;
-       int level;
-       bool force_pt_level;
-       gfn_t gfn = gpa >> PAGE_SHIFT;
-       unsigned long mmu_seq;
-       int write = error_code & PFERR_WRITE_MASK;
-       bool map_writable;
-       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
-                               is_nx_huge_page_enabled();
+       int max_level;
 
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
+       for (max_level = PT_MAX_HUGEPAGE_LEVEL;
+            max_level > PT_PAGE_TABLE_LEVEL;
+            max_level--) {
+               int page_num = KVM_PAGES_PER_HPAGE(max_level);
+               gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
 
-       if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return RET_PF_EMULATE;
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       force_pt_level =
-               lpage_disallowed ||
-               !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
-       level = mapping_level(vcpu, gfn, &force_pt_level);
-       if (likely(!force_pt_level)) {
-               if (level > PT_DIRECTORY_LEVEL &&
-                   !check_hugepage_cache_consistency(vcpu, gfn, level))
-                       level = PT_DIRECTORY_LEVEL;
-               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+               if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+                       break;
        }
 
-       if (fast_page_fault(vcpu, gpa, level, error_code))
-               return RET_PF_RETRY;
-
-       mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       smp_rmb();
-
-       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
-               return RET_PF_RETRY;
-
-       if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
-               return r;
-
-       r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
-               goto out_unlock;
-       if (make_mmu_pages_available(vcpu) < 0)
-               goto out_unlock;
-       if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
-                        prefault, lpage_disallowed);
-out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_release_pfn_clean(pfn);
-       return r;
+       return direct_page_fault(vcpu, gpa, error_code, prefault,
+                                max_level, true);
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
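
The new loop starts at the largest supported page size and steps down until kvm_mtrr_check_gfn_range_consistency() passes for the aligned range. The per-level alignment it computes looks like this in isolation (x86 sizes of 4KiB/2MiB/1GiB for levels 1-3 assumed; the sample GPA is arbitrary):

    /* For a sample GPA, print the base gfn and page count that would be
     * checked at each candidate huge-page level. */
    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            uint64_t gpa = 0x7654321000ULL;
            uint64_t gfn = gpa >> PAGE_SHIFT;

            for (int level = 3; level > 1; level--) {
                    /* 1GiB covers 2^18 base pages, 2MiB covers 2^9 */
                    uint64_t page_num = 1ULL << ((level - 1) * 9);
                    uint64_t base = gfn & ~(page_num - 1);

                    printf("level %d: base gfn %#llx, %llu pages\n", level,
                           (unsigned long long)base,
                           (unsigned long long)page_num);
            }
            return 0;
    }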
@@ -5496,47 +5458,30 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
-{
-       LIST_HEAD(invalid_list);
-
-       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
-               return 0;
-
-       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
-               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
-                       break;
-
-               ++vcpu->kvm->stat.mmu_recycled;
-       }
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
-       if (!kvm_mmu_available_pages(vcpu->kvm))
-               return -ENOSPC;
-       return 0;
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
        int r, emulation_type = 0;
        bool direct = vcpu->arch.mmu->direct_map;
 
+       if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+               return RET_PF_RETRY;
+
        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
        if (vcpu->arch.mmu->direct_map) {
                vcpu->arch.gpa_available = true;
-               vcpu->arch.gpa_val = cr2;
+               vcpu->arch.gpa_val = cr2_or_gpa;
        }
 
        r = RET_PF_INVALID;
        if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
                if (r == RET_PF_EMULATE)
                        goto emulate;
        }
 
        if (r == RET_PF_INVALID) {
-               r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+               r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
                                               lower_32_bits(error_code),
                                               false);
                WARN_ON(r == RET_PF_INVALID);
@@ -5556,7 +5501,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         */
        if (vcpu->arch.mmu->direct_map &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
                return 1;
        }
 
@@ -5571,7 +5516,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         * explicitly shadowing L1's page tables, i.e. unprotecting something
         * for L1 isn't going to magically fix whatever issue cause L2 to fail.
         */
-       if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+       if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
                emulation_type = EMULTYPE_ALLOW_RETRY;
 emulate:
        /*
@@ -5586,7 +5531,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
                        return 1;
        }
 
-       return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+       return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
                                       insn_len);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -6016,7 +5961,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                 */
                if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
                    !kvm_is_zone_device_pfn(pfn) &&
-                   PageTransCompoundMap(pfn_to_page(pfn))) {
+                   kvm_is_transparent_hugepage(pfn)) {
                        pte_list_remove(rmap_head, sptep);
 
                        if (kvm_available_flush_tlb_with_range())
@@ -6249,7 +6194,7 @@ static void kvm_set_mmio_spte_mask(void)
         * If reserved bit is not supported, clear the present bit to disable
         * mmio page fault.
         */
-       if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+       if (shadow_phys_bits == 52)
                mask &= ~1ull;
 
        kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);