KVM: Use vcpu-specific gva->hva translation when querying host page size

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6f92b40d798cab7d4b9f4a451621319e687e29df..e4458c9aec8c8316ea1b565572e434ad444b007a 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -418,22 +418,24 @@ static inline bool is_access_track_spte(u64 spte)
  * requires a full MMU zap).  The flag is instead explicitly queried when
  * checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_MASK             GENMASK_ULL(18, 0)
+#define MMIO_SPTE_GEN_MASK             GENMASK_ULL(17, 0)
 
 #define MMIO_SPTE_GEN_LOW_START                3
 #define MMIO_SPTE_GEN_LOW_END          11
 #define MMIO_SPTE_GEN_LOW_MASK         GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
                                                    MMIO_SPTE_GEN_LOW_START)
 
-#define MMIO_SPTE_GEN_HIGH_START       52
-#define MMIO_SPTE_GEN_HIGH_END         61
+#define MMIO_SPTE_GEN_HIGH_START       PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_END         62
 #define MMIO_SPTE_GEN_HIGH_MASK                GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
                                                    MMIO_SPTE_GEN_HIGH_START)
+
 static u64 generation_mmio_spte_mask(u64 gen)
 {
        u64 mask;
 
        WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+       BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
 
        mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
        mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
@@ -444,8 +446,6 @@ static u64 get_mmio_spte_generation(u64 spte)
 {
        u64 gen;
 
-       spte &= ~shadow_mmio_mask;
-
        gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
        gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
        return gen;
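
For reference, the generation value is scattered into spare SPTE bits purely with shift-and-mask arithmetic. A minimal standalone sketch of that pattern, using a local GENMASK_ULL and the low range 11:3 from the definitions above (the GEN_LOW_* names and the sample value are made up; the high half works the same way from MMIO_SPTE_GEN_HIGH_START):

    /* Illustration only: pack/extract one field with a GENMASK-style mask,
     * the same way generation_mmio_spte_mask() handles the low half. */
    #include <stdio.h>
    #include <stdint.h>

    #define GENMASK_ULL(h, l)  (((~0ULL) << (l)) & (~0ULL >> (63 - (h))))

    #define GEN_LOW_START 3
    #define GEN_LOW_END   11
    #define GEN_LOW_MASK  GENMASK_ULL(GEN_LOW_END, GEN_LOW_START)

    int main(void)
    {
            uint64_t gen = 0x155;   /* 9-bit sample value */
            uint64_t spte = 0;

            /* pack: shift into position, mask off anything that doesn't fit */
            spte |= (gen << GEN_LOW_START) & GEN_LOW_MASK;

            /* extract: mask, then shift back down */
            uint64_t low = (spte & GEN_LOW_MASK) >> GEN_LOW_START;

            printf("mask=%#llx packed=%#llx low=%#llx\n",
                   (unsigned long long)GEN_LOW_MASK,
                   (unsigned long long)spte,
                   (unsigned long long)low);
            return 0;
    }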
@@ -538,16 +538,20 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 static u8 kvm_get_shadow_phys_bits(void)
 {
        /*
-        * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
-        * in CPU detection code, but MKTME treats those reduced bits as
-        * 'keyID' thus they are not reserved bits. Therefore for MKTME
-        * we should still return physical address bits reported by CPUID.
+        * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+        * in CPU detection code, but the processor treats those reduced bits as
+        * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+        * the physical address bits reported by CPUID.
         */
-       if (!boot_cpu_has(X86_FEATURE_TME) ||
-           WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
-               return boot_cpu_data.x86_phys_bits;
+       if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+               return cpuid_eax(0x80000008) & 0xff;
 
-       return cpuid_eax(0x80000008) & 0xff;
+       /*
+        * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+        * custom CPUID.  Proceed with whatever the kernel found since these features
+        * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+        */
+       return boot_cpu_data.x86_phys_bits;
 }
 
 static void kvm_mmu_reset_all_pte_masks(void)
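
The fallback order above is: trust CPUID leaf 0x80000008 when it exists, otherwise use what the kernel already recorded in boot_cpu_data. The CPUID side can be reproduced from userspace; a minimal sketch with the compiler-provided <cpuid.h> helper (illustration only, not the kernel path):

    /* Read MAXPHYADDR the same way kvm_get_shadow_phys_bits() does:
     * CPUID leaf 0x80000008, EAX bits 7:0. Requires GCC or Clang. */
    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            /* __get_cpuid() returns 0 if the leaf is not supported, which
             * mirrors the extended_cpuid_level >= 0x80000008 check above. */
            if (__get_cpuid(0x80000008, &eax, &ebx, &ecx, &edx))
                    printf("MAXPHYADDR: %u bits\n", eax & 0xff);
            else
                    printf("CPUID 0x80000008 not reported by this CPU\n");
            return 0;
    }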
@@ -1282,12 +1286,12 @@ static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
        return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
 }
 
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        unsigned long page_size;
        int i, ret = 0;
 
-       page_size = kvm_host_page_size(kvm, gfn);
+       page_size = kvm_host_page_size(vcpu, gfn);
 
        for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                if (page_size >= KVM_HPAGE_SIZE(i))
@@ -1324,31 +1328,42 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 }
 
 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
-                        bool *force_pt_level)
+                        int *max_levelp)
 {
-       int host_level, level, max_level;
+       int host_level, max_level = *max_levelp;
        struct kvm_memory_slot *slot;
 
-       if (unlikely(*force_pt_level))
+       if (unlikely(max_level == PT_PAGE_TABLE_LEVEL))
                return PT_PAGE_TABLE_LEVEL;
 
        slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
-       *force_pt_level = !memslot_valid_for_gpte(slot, true);
-       if (unlikely(*force_pt_level))
+       if (!memslot_valid_for_gpte(slot, true)) {
+               *max_levelp = PT_PAGE_TABLE_LEVEL;
                return PT_PAGE_TABLE_LEVEL;
+       }
 
-       host_level = host_mapping_level(vcpu->kvm, large_gfn);
-
-       if (host_level == PT_PAGE_TABLE_LEVEL)
-               return host_level;
+       max_level = min(max_level, kvm_x86_ops->get_lpage_level());
+       for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) {
+               if (!__mmu_gfn_lpage_is_disallowed(large_gfn, max_level, slot))
+                       break;
+       }
 
-       max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
+       *max_levelp = max_level;
 
-       for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-               if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
-                       break;
+       if (max_level == PT_PAGE_TABLE_LEVEL)
+               return PT_PAGE_TABLE_LEVEL;
 
-       return level - 1;
+       /*
+        * Note, host_mapping_level() does *not* handle transparent huge pages.
+        * As suggested by "mapping", it reflects the page size established by
+        * the associated vma, if there is one, i.e. host_mapping_level() will
+        * return a huge page level if and only if a vma exists and the backing
+        * implementation for the vma uses huge pages, e.g. hugetlbfs and dax.
+        * So, do not propagate host_mapping_level() to max_level as KVM can
+        * still promote the guest mapping to a huge page in the THP case.
+        */
+       host_level = host_mapping_level(vcpu, large_gfn);
+       return min(host_level, max_level);
 }
 
 /*
@@ -1410,7 +1425,7 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
-               rmap_head->val = (unsigned long)desc->sptes[0];
+               rmap_head->val = 0;
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
@@ -1525,7 +1540,7 @@ struct rmap_iterator {
 /*
  * Iteration must be started by this function.  This should also be used after
  * removing/dropping sptes from the rmap link because in such cases the
- * information in the itererator may not be valid.
+ * information in the iterator may not be valid.
  *
  * Returns sptep if found, NULL otherwise.
  */
@@ -2899,6 +2914,26 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
        return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 }
 
+static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
+{
+       LIST_HEAD(invalid_list);
+
+       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
+               return 0;
+
+       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
+               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
+                       break;
+
+               ++vcpu->kvm->stat.mmu_recycled;
+       }
+       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+       if (!kvm_mmu_available_pages(vcpu->kvm))
+               return -ENOSPC;
+       return 0;
+}
+
 /*
  * Changing the number of mmu pages allocated to the vm
  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
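
make_mmu_pages_available() is a classic low/high watermark scheme: do nothing while free pages stay above KVM_MIN_FREE_MMU_PAGES, otherwise recycle old shadow pages until KVM_REFILL_PAGES are free. A simplified standalone model of that hysteresis, with made-up constants and plain counters standing in for the KVM accounting:

    /* Simplified model of the watermark logic; no KVM types involved. */
    #include <stdio.h>

    #define MIN_FREE_PAGES  5       /* stands in for KVM_MIN_FREE_MMU_PAGES */
    #define REFILL_PAGES    25      /* stands in for KVM_REFILL_PAGES */

    static unsigned int available = 2;      /* pretend we are under pressure */
    static unsigned int zappable  = 40;     /* shadow pages we could recycle */

    static int zap_oldest(void)
    {
            if (!zappable)
                    return 0;
            zappable--;
            available++;            /* freeing a shadow page frees a slot */
            return 1;
    }

    static int make_pages_available(void)
    {
            if (available >= MIN_FREE_PAGES)
                    return 0;
            while (available < REFILL_PAGES && zap_oldest())
                    ;
            return available ? 0 : -1;      /* -ENOSPC in the real code */
    }

    int main(void)
    {
            int ret = make_pages_available();

            printf("ret=%d available=%u\n", ret, available);
            return 0;
    }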
@@ -3294,6 +3329,35 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
        __direct_pte_prefetch(vcpu, sp, sptep);
 }
 
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+                                       gfn_t gfn, kvm_pfn_t *pfnp,
+                                       int *levelp)
+{
+       kvm_pfn_t pfn = *pfnp;
+       int level = *levelp;
+
+       /*
+        * Check if it's a transparent hugepage. If this would be an
+        * hugetlbfs page, level wouldn't be set to
+        * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+        * here.
+        */
+       if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
+           !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
+           kvm_is_transparent_hugepage(pfn)) {
+               unsigned long mask;
+
+               /*
+                * mmu_notifier_retry() was successful and mmu_lock is held, so
+                * the pmd can't be split from under us.
+                */
+               *levelp = level = PT_DIRECTORY_LEVEL;
+               mask = KVM_PAGES_PER_HPAGE(level) - 1;
+               VM_BUG_ON((gfn & mask) != (pfn & mask));
+               *pfnp = pfn & ~mask;
+       }
+}
+
 static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
                                       gfn_t gfn, kvm_pfn_t *pfnp, int *levelp)
 {
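
transparent_hugepage_adjust() relies on gfn and pfn sharing the same offset within the huge-page region, then rounds the pfn down to the huge-page head. The mask arithmetic in isolation, assuming 4KiB base pages and a 512-page (2MiB) huge page, with made-up frame numbers:

    /* Illustration of the mask arithmetic used above: for a 2MiB mapping,
     * gfn and pfn must agree in their low 9 bits, and the huge-page head
     * pfn is obtained by clearing those bits. */
    #include <assert.h>
    #include <stdio.h>
    #include <stdint.h>

    #define PAGES_PER_2M    512ULL          /* 2MiB / 4KiB */

    int main(void)
    {
            uint64_t gfn  = 0x12345;        /* guest frame number */
            uint64_t pfn  = 0xabd45;        /* host frame inside a THP */
            uint64_t mask = PAGES_PER_2M - 1;

            /* Same invariant as the VM_BUG_ON() above: guest and host frames
             * must have the same offset within the huge page. */
            assert((gfn & mask) == (pfn & mask));

            printf("head pfn = %#llx, offset = %#llx\n",
                   (unsigned long long)(pfn & ~mask),
                   (unsigned long long)(pfn & mask));
            return 0;
    }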
@@ -3318,8 +3382,9 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it,
 }
 
 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
-                       int map_writable, int level, kvm_pfn_t pfn,
-                       bool prefault, bool lpage_disallowed)
+                       int map_writable, int level, int max_level,
+                       kvm_pfn_t pfn, bool prefault,
+                       bool account_disallowed_nx_lpage)
 {
        struct kvm_shadow_walk_iterator it;
        struct kvm_mmu_page *sp;
@@ -3327,9 +3392,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
        gfn_t gfn = gpa >> PAGE_SHIFT;
        gfn_t base_gfn = gfn;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
+       if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
                return RET_PF_RETRY;
 
+       if (likely(max_level > PT_PAGE_TABLE_LEVEL))
+               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
+
        trace_kvm_mmu_spte_requested(gpa, level, pfn);
        for_each_shadow_entry(vcpu, gpa, it) {
                /*
@@ -3348,7 +3416,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
                                              it.level - 1, true, ACC_ALL);
 
                        link_shadow_page(vcpu, it.sptep, sp);
-                       if (lpage_disallowed)
+                       if (account_disallowed_nx_lpage)
                                account_huge_nx_page(vcpu->kvm, sp);
                }
        }
@@ -3384,45 +3452,6 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
        return -EFAULT;
 }
 
-static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
-                                       gfn_t gfn, kvm_pfn_t *pfnp,
-                                       int *levelp)
-{
-       kvm_pfn_t pfn = *pfnp;
-       int level = *levelp;
-
-       /*
-        * Check if it's a transparent hugepage. If this would be an
-        * hugetlbfs page, level wouldn't be set to
-        * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
-        * here.
-        */
-       if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
-           !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL &&
-           PageTransCompoundMap(pfn_to_page(pfn)) &&
-           !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
-               unsigned long mask;
-               /*
-                * mmu_notifier_retry was successful and we hold the
-                * mmu_lock here, so the pmd can't become splitting
-                * from under us, and in turn
-                * __split_huge_page_refcount() can't run from under
-                * us and we can safely transfer the refcount from
-                * PG_tail to PG_head as we switch the pfn to tail to
-                * head.
-                */
-               *levelp = level = PT_DIRECTORY_LEVEL;
-               mask = KVM_PAGES_PER_HPAGE(level) - 1;
-               VM_BUG_ON((gfn & mask) != (pfn & mask));
-               if (pfn & mask) {
-                       kvm_release_pfn_clean(pfn);
-                       pfn &= ~mask;
-                       kvm_get_pfn(pfn);
-                       *pfnp = pfn;
-               }
-       }
-}
-
 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                kvm_pfn_t pfn, unsigned access, int *ret_val)
 {
@@ -3528,7 +3557,7 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte)
  * - true: let the vcpu to access on the same address again.
  * - false: let the real page fault path to fix it.
  */
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
                            u32 error_code)
 {
        struct kvm_shadow_walk_iterator iterator;
@@ -3537,9 +3566,6 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        u64 spte = 0ull;
        uint retry_count = 0;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return false;
-
        if (!page_fault_can_be_fast(error_code))
                return false;
 
@@ -3548,7 +3574,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
        do {
                u64 new_spte;
 
-               for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+               for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
                        if (!is_shadow_present_pte(spte) ||
                            iterator.level < level)
                                break;
@@ -3626,71 +3652,13 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
 
        } while (true);
 
-       trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+       trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
                              spte, fault_handled);
        walk_shadow_page_lockless_end(vcpu);
 
        return fault_handled;
 }
 
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
-
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
-                        gfn_t gfn, bool prefault)
-{
-       int r;
-       int level;
-       bool force_pt_level;
-       kvm_pfn_t pfn;
-       unsigned long mmu_seq;
-       bool map_writable, write = error_code & PFERR_WRITE_MASK;
-       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
-                               is_nx_huge_page_enabled();
-
-       force_pt_level = lpage_disallowed;
-       level = mapping_level(vcpu, gfn, &force_pt_level);
-       if (likely(!force_pt_level)) {
-               /*
-                * This path builds a PAE pagetable - so we can map
-                * 2mb pages at maximum. Therefore check if the level
-                * is larger than that.
-                */
-               if (level > PT_DIRECTORY_LEVEL)
-                       level = PT_DIRECTORY_LEVEL;
-
-               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-       }
-
-       if (fast_page_fault(vcpu, v, level, error_code))
-               return RET_PF_RETRY;
-
-       mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       smp_rmb();
-
-       if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
-               return RET_PF_RETRY;
-
-       if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
-               return r;
-
-       r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
-               goto out_unlock;
-       if (make_mmu_pages_available(vcpu) < 0)
-               goto out_unlock;
-       if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, v, write, map_writable, level, pfn,
-                        prefault, false);
-out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_release_pfn_clean(pfn);
-       return r;
-}
-
 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
                               struct list_head *invalid_list)
 {
@@ -3981,7 +3949,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
 
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                  u32 access, struct x86_exception *exception)
 {
        if (exception)
@@ -3989,7 +3957,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
        return vaddr;
 }
 
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
                                         u32 access,
                                         struct x86_exception *exception)
 {
@@ -4001,20 +3969,14 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
 static bool
 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
 {
-       int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
+       int bit7 = (pte >> 7) & 1;
 
-       return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
-               ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
+       return pte & rsvd_check->rsvd_bits_mask[bit7][level-1];
 }
 
-static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
+static bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check, u64 pte)
 {
-       return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
-}
-
-static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
-{
-       return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
+       return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
 }
 
 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
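
The split above leaves two independent lookups: a reserved-bits mask selected by PTE bit 7 and level, and a 64-entry bad_mt_xwr bitmap indexed by the PTE's low six bits. A self-contained sketch of that table-lookup style, with placeholder mask values rather than real paging/EPT rules:

    /* Illustration only: the lookup shape of __is_rsvd_bits_set() and
     * __is_bad_mt_xwr(), with made-up mask values. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define BIT_ULL(n) (1ULL << (n))

    struct rsvd_check {
            uint64_t rsvd_bits_mask[2][5];  /* [pte bit 7][level - 1] */
            uint64_t bad_mt_xwr;            /* bitmap over pte bits 5:0 */
    };

    static bool is_rsvd_bits_set(const struct rsvd_check *rc, uint64_t pte, int level)
    {
            int bit7 = (pte >> 7) & 1;

            return pte & rc->rsvd_bits_mask[bit7][level - 1];
    }

    static bool is_bad_mt_xwr(const struct rsvd_check *rc, uint64_t pte)
    {
            return rc->bad_mt_xwr & BIT_ULL(pte & 0x3f);
    }

    int main(void)
    {
            struct rsvd_check rc = { 0 };
            uint64_t pte = 0x0010000000000002ULL;
            bool reserved;

            for (int i = 0; i < 5; i++) {
                    /* pretend bits 62:52 are reserved at every level */
                    rc.rsvd_bits_mask[0][i] = 0x7ff0000000000000ULL;
                    rc.rsvd_bits_mask[1][i] = 0x7ff0000000000000ULL;
            }
            /* pretend the low-bits combination 0x02 is an illegal memtype/XWR */
            rc.bad_mt_xwr = BIT_ULL(0x02);

            /* Bitwise-OR aggregation, as in walk_shadow_page_get_mmio_spte(),
             * evaluates both checks without a conditional branch. */
            reserved = is_bad_mt_xwr(&rc, pte) | is_rsvd_bits_set(&rc, pte, 1);
            printf("reserved/bad = %d\n", reserved);
            return 0;
    }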
@@ -4038,11 +4000,11 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
        struct kvm_shadow_walk_iterator iterator;
        u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
+       struct rsvd_bits_validate *rsvd_check;
        int root, leaf;
        bool reserved = false;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               goto exit;
+       rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
 
        walk_shadow_page_lockless_begin(vcpu);
 
@@ -4058,8 +4020,13 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                if (!is_shadow_present_pte(spte))
                        break;
 
-               reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
-                                                   iterator.level);
+               /*
+                * Use a bitwise-OR instead of a logical-OR to aggregate the
+                * reserved bit and EPT's invalid memtype/XWR checks to avoid
+                * adding a Jcc in the loop.
+                */
+               reserved |= __is_bad_mt_xwr(rsvd_check, spte) |
+                           __is_rsvd_bits_set(rsvd_check, spte, iterator.level);
        }
 
        walk_shadow_page_lockless_end(vcpu);
@@ -4073,7 +4040,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
                        root--;
                }
        }
-exit:
+
        *sptep = spte;
        return reserved;
 }
@@ -4137,9 +4104,6 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
 
-       if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
-               return;
-
        walk_shadow_page_lockless_begin(vcpu);
        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
                clear_sp_write_flooding_count(iterator.sptep);
@@ -4149,29 +4113,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
        walk_shadow_page_lockless_end(vcpu);
 }
 
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
-                               u32 error_code, bool prefault)
-{
-       gfn_t gfn = gva >> PAGE_SHIFT;
-       int r;
-
-       pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
-
-       if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return RET_PF_EMULATE;
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
-
-
-       return nonpaging_map(vcpu, gva & PAGE_MASK,
-                            error_code, gfn, prefault);
-}
-
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+                                  gfn_t gfn)
 {
        struct kvm_arch_async_pf arch;
 
@@ -4180,11 +4123,13 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        arch.direct_map = vcpu->arch.mmu->direct_map;
        arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
 
-       return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+       return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+                                 kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-                        gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+                        gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+                        bool *writable)
 {
        struct kvm_memory_slot *slot;
        bool async;
@@ -4204,12 +4149,12 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                return false; /* *pfn has correct page already */
 
        if (!prefault && kvm_can_do_async_pf(vcpu)) {
-               trace_kvm_try_async_get_page(gva, gfn);
+               trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
                if (kvm_find_async_pf_gfn(vcpu, gfn)) {
-                       trace_kvm_async_pf_doublefault(gva, gfn);
+                       trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
                        kvm_make_request(KVM_REQ_APF_HALT, vcpu);
                        return true;
-               } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+               } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
                        return true;
        }
 
@@ -4217,11 +4162,81 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
        return false;
 }
 
+static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
+                            bool prefault, int max_level, bool is_tdp)
+{
+       bool write = error_code & PFERR_WRITE_MASK;
+       bool exec = error_code & PFERR_FETCH_MASK;
+       bool lpage_disallowed = exec && is_nx_huge_page_enabled();
+       bool map_writable;
+
+       gfn_t gfn = gpa >> PAGE_SHIFT;
+       unsigned long mmu_seq;
+       kvm_pfn_t pfn;
+       int level, r;
+
+       if (page_fault_handle_page_track(vcpu, error_code, gfn))
+               return RET_PF_EMULATE;
+
+       r = mmu_topup_memory_caches(vcpu);
+       if (r)
+               return r;
+
+       if (lpage_disallowed)
+               max_level = PT_PAGE_TABLE_LEVEL;
+
+       level = mapping_level(vcpu, gfn, &max_level);
+       if (level > PT_PAGE_TABLE_LEVEL)
+               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+
+       if (fast_page_fault(vcpu, gpa, level, error_code))
+               return RET_PF_RETRY;
+
+       mmu_seq = vcpu->kvm->mmu_notifier_seq;
+       smp_rmb();
+
+       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+               return RET_PF_RETRY;
+
+       if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+               return r;
+
+       r = RET_PF_RETRY;
+       spin_lock(&vcpu->kvm->mmu_lock);
+       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+               goto out_unlock;
+       if (make_mmu_pages_available(vcpu) < 0)
+               goto out_unlock;
+       r = __direct_map(vcpu, gpa, write, map_writable, level, max_level, pfn,
+                        prefault, is_tdp && lpage_disallowed);
+
+out_unlock:
+       spin_unlock(&vcpu->kvm->mmu_lock);
+       kvm_release_pfn_clean(pfn);
+       return r;
+}
+
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
+                               u32 error_code, bool prefault)
+{
+       pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
+
+       /* This path builds a PAE pagetable, we can map 2mb pages at maximum. */
+       return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault,
+                                PT_DIRECTORY_LEVEL, false);
+}
+
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
                                u64 fault_address, char *insn, int insn_len)
 {
        int r = 1;
 
+#ifndef CONFIG_X86_64
+       /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+       if (WARN_ON_ONCE(fault_address >> 32))
+               return -EFAULT;
+#endif
+
        vcpu->arch.l1tf_flush_l1d = true;
        switch (vcpu->arch.apf.host_apf_reason) {
        default:
@@ -4249,76 +4264,23 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 }
 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
 
-static bool
-check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
-{
-       int page_num = KVM_PAGES_PER_HPAGE(level);
-
-       gfn &= ~(page_num - 1);
-
-       return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
-}
-
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
                          bool prefault)
 {
-       kvm_pfn_t pfn;
-       int r;
-       int level;
-       bool force_pt_level;
-       gfn_t gfn = gpa >> PAGE_SHIFT;
-       unsigned long mmu_seq;
-       int write = error_code & PFERR_WRITE_MASK;
-       bool map_writable;
-       bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
-                               is_nx_huge_page_enabled();
+       int max_level;
 
-       MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
+       for (max_level = PT_MAX_HUGEPAGE_LEVEL;
+            max_level > PT_PAGE_TABLE_LEVEL;
+            max_level--) {
+               int page_num = KVM_PAGES_PER_HPAGE(max_level);
+               gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1);
 
-       if (page_fault_handle_page_track(vcpu, error_code, gfn))
-               return RET_PF_EMULATE;
-
-       r = mmu_topup_memory_caches(vcpu);
-       if (r)
-               return r;
-
-       force_pt_level =
-               lpage_disallowed ||
-               !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL);
-       level = mapping_level(vcpu, gfn, &force_pt_level);
-       if (likely(!force_pt_level)) {
-               if (level > PT_DIRECTORY_LEVEL &&
-                   !check_hugepage_cache_consistency(vcpu, gfn, level))
-                       level = PT_DIRECTORY_LEVEL;
-               gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+               if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
+                       break;
        }
 
-       if (fast_page_fault(vcpu, gpa, level, error_code))
-               return RET_PF_RETRY;
-
-       mmu_seq = vcpu->kvm->mmu_notifier_seq;
-       smp_rmb();
-
-       if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
-               return RET_PF_RETRY;
-
-       if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
-               return r;
-
-       r = RET_PF_RETRY;
-       spin_lock(&vcpu->kvm->mmu_lock);
-       if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
-               goto out_unlock;
-       if (make_mmu_pages_available(vcpu) < 0)
-               goto out_unlock;
-       if (likely(!force_pt_level))
-               transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
-       r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
-                        prefault, lpage_disallowed);
-out_unlock:
-       spin_unlock(&vcpu->kvm->mmu_lock);
-       kvm_release_pfn_clean(pfn);
-       return r;
+       return direct_page_fault(vcpu, gpa, error_code, prefault,
+                                max_level, true);
 }
 
 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
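
The new loop starts at the largest supported page size and steps down until kvm_mtrr_check_gfn_range_consistency() passes for the aligned range. The per-level alignment it computes looks like this in isolation (x86 sizes of 4KiB/2MiB/1GiB for levels 1-3 assumed; the sample GPA is arbitrary):

    /* For a sample GPA, print the base gfn and page count that would be
     * checked at each candidate huge-page level. */
    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
            uint64_t gpa = 0x7654321000ULL;
            uint64_t gfn = gpa >> PAGE_SHIFT;

            for (int level = 3; level > 1; level--) {
                    /* 1GiB covers 2^18 base pages, 2MiB covers 2^9 */
                    uint64_t page_num = 1ULL << ((level - 1) * 9);
                    uint64_t base = gfn & ~(page_num - 1);

                    printf("level %d: base gfn %#llx, %llu pages\n", level,
                           (unsigned long long)base,
                           (unsigned long long)page_num);
            }
            return 0;
    }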
@@ -5496,47 +5458,30 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
-static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
-{
-       LIST_HEAD(invalid_list);
-
-       if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
-               return 0;
-
-       while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
-               if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
-                       break;
-
-               ++vcpu->kvm->stat.mmu_recycled;
-       }
-       kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
-
-       if (!kvm_mmu_available_pages(vcpu->kvm))
-               return -ENOSPC;
-       return 0;
-}
-
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
                       void *insn, int insn_len)
 {
        int r, emulation_type = 0;
        bool direct = vcpu->arch.mmu->direct_map;
 
+       if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa)))
+               return RET_PF_RETRY;
+
        /* With shadow page tables, fault_address contains a GVA or nGPA.  */
        if (vcpu->arch.mmu->direct_map) {
                vcpu->arch.gpa_available = true;
-               vcpu->arch.gpa_val = cr2;
+               vcpu->arch.gpa_val = cr2_or_gpa;
        }
 
        r = RET_PF_INVALID;
        if (unlikely(error_code & PFERR_RSVD_MASK)) {
-               r = handle_mmio_page_fault(vcpu, cr2, direct);
+               r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
                if (r == RET_PF_EMULATE)
                        goto emulate;
        }
 
        if (r == RET_PF_INVALID) {
-               r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+               r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
                                               lower_32_bits(error_code),
                                               false);
                WARN_ON(r == RET_PF_INVALID);
@@ -5556,7 +5501,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         */
        if (vcpu->arch.mmu->direct_map &&
            (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
-               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+               kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
                return 1;
        }
 
@@ -5571,7 +5516,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
         * explicitly shadowing L1's page tables, i.e. unprotecting something
         * for L1 isn't going to magically fix whatever issue cause L2 to fail.
         */
-       if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+       if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
                emulation_type = EMULTYPE_ALLOW_RETRY;
 emulate:
        /*
@@ -5586,7 +5531,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
                        return 1;
        }
 
-       return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+       return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
                                       insn_len);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -6016,7 +5961,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
                 */
                if (sp->role.direct && !kvm_is_reserved_pfn(pfn) &&
                    !kvm_is_zone_device_pfn(pfn) &&
-                   PageTransCompoundMap(pfn_to_page(pfn))) {
+                   kvm_is_transparent_hugepage(pfn)) {
                        pte_list_remove(rmap_head, sptep);
 
                        if (kvm_available_flush_tlb_with_range())
@@ -6249,7 +6194,7 @@ static void kvm_set_mmio_spte_mask(void)
         * If reserved bit is not supported, clear the present bit to disable
         * mmio page fault.
         */
-       if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
+       if (shadow_phys_bits == 52)
                mask &= ~1ull;
 
        kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);