arch/x86/kvm/mmu.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Kernel-based Virtual Machine driver for Linux
   4  *
   5  * This module enables machines with Intel VT-x extensions to run virtual
   6  * machines without emulation or binary translation.
   7  *
   8  * MMU support
   9  *
  10  * Copyright (C) 2006 Qumranet, Inc.
  11  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  12  *
  13  * Authors:
  14  *   Yaniv Kamay  <yaniv@qumranet.com>
  15  *   Avi Kivity   <avi@qumranet.com>
  16  */
  17
  18 #include "irq.h"
  19 #include "mmu.h"
  20 #include "x86.h"
  21 #include "kvm_cache_regs.h"
  22 #include "cpuid.h"
  23
  24 #include <linux/kvm_host.h>
  25 #include <linux/types.h>
  26 #include <linux/string.h>
  27 #include <linux/mm.h>
  28 #include <linux/highmem.h>
  29 #include <linux/moduleparam.h>
  30 #include <linux/export.h>
  31 #include <linux/swap.h>
  32 #include <linux/hugetlb.h>
  33 #include <linux/compiler.h>
  34 #include <linux/srcu.h>
  35 #include <linux/slab.h>
  36 #include <linux/sched/signal.h>
  37 #include <linux/uaccess.h>
  38 #include <linux/hash.h>
  39 #include <linux/kern_levels.h>
  40
  41 #include <asm/page.h>
  42 #include <asm/pat.h>
  43 #include <asm/cmpxchg.h>
  44 #include <asm/e820/api.h>
  45 #include <asm/io.h>
  46 #include <asm/vmx.h>
  47 #include <asm/kvm_page_track.h>
  48 #include "trace.h"
  49
  50 /*
  51  * When setting this variable to true it enables Two-Dimensional-Paging
  52  * where the hardware walks 2 page tables:
  53  * 1. the guest-virtual to guest-physical
  54  * 2. while doing 1. it walks guest-physical to host-physical
  55  * If the hardware supports that we don't need to do shadow paging.
  56  */
  57 bool tdp_enabled = false;
  58
  59 enum {
             /*
              * Audit point identifiers.  The enumerators take the implicit
              * values 0..5 in declaration order.
              * NOTE(review): no user of these constants is visible in this
              * part of the file; presumably consumed by the MMU audit code.
              */
  60         AUDIT_PRE_PAGE_FAULT,
  61         AUDIT_POST_PAGE_FAULT,
  62         AUDIT_PRE_PTE_WRITE,
  63         AUDIT_POST_PTE_WRITE,
  64         AUDIT_PRE_SYNC,
  65         AUDIT_POST_SYNC
  66 };
  67
  68 #undef MMU_DEBUG
     /*
      * MMU_DEBUG is unconditionally undefined on the line above, so the
      * #else branch below is the one that is compiled unless that line is
      * edited: pgprintk()/rmap_printk() expand to nothing and MMU_WARN_ON()
      * does not evaluate its argument at all.
      */
  69
  70 #ifdef MMU_DEBUG
     /* Debug build: printing is gated at run time by the "dbg" module parameter. */
  71 static bool dbg = 0;
  72 module_param(dbg, bool, 0644);
  73
  74 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
  75 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
  76 #define MMU_WARN_ON(x) WARN_ON(x)
  77 #else
     /* Non-debug build: all three helpers are empty statements. */
  78 #define pgprintk(x...) do { } while (0)
  79 #define rmap_printk(x...) do { } while (0)
  80 #define MMU_WARN_ON(x) do { } while (0)
  81 #endif
  82
  83 #define PTE_PREFETCH_NUM                8
  84
  85 #define PT_FIRST_AVAIL_BITS_SHIFT 10
  86 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
  87
  88 #define PT64_LEVEL_BITS 9
  89
  90 #define PT64_LEVEL_SHIFT(level) \
  91                 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
  92
  93 #define PT64_INDEX(address, level)\
  94         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
  95
  96
  97 #define PT32_LEVEL_BITS 10
  98
  99 #define PT32_LEVEL_SHIFT(level) \
 100                 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
 101
 102 #define PT32_LVL_OFFSET_MASK(level) \
 103         (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 104                                                 * PT32_LEVEL_BITS))) - 1))
 105
 106 #define PT32_INDEX(address, level)\
 107         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 108
 109
 110 #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
     /* Page-aligned part of the run-time physical_mask. */
 111 #define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
 112 #else
     /* Page-aligned part of a fixed 52-bit address: bits PAGE_SHIFT..51. */
 113 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
 114 #endif
     /* Bits of PT64_BASE_ADDR_MASK at or above the start of @level. */
 115 #define PT64_LVL_ADDR_MASK(level) \
 116         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 117                                                 * PT64_LEVEL_BITS))) - 1))
     /* Complement of the above within PT64_BASE_ADDR_MASK: bits below @level. */
 118 #define PT64_LVL_OFFSET_MASK(level) \
 119         (PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
 120                                                 * PT64_LEVEL_BITS))) - 1))
 121
     /* 32-bit variants; these are built from PAGE_MASK and PT32_LEVEL_BITS. */
 122 #define PT32_BASE_ADDR_MASK PAGE_MASK
 123 #define PT32_DIR_BASE_ADDR_MASK \
 124         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
 125 #define PT32_LVL_ADDR_MASK(level) \
 126         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 127                                             * PT32_LEVEL_BITS))) - 1))
 128
     /*
      * Union of the present/writable bits and the run-time user, x, nx and
      * me shadow masks.  Not a constant: it reads the shadow_* variables at
      * each use.
      */
 129 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
 130                         | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
 131
     /* Access bits; write and user reuse the PT_* bit values, exec is bit 0. */
 132 #define ACC_EXEC_MASK    1
 133 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 134 #define ACC_USER_MASK    PT_USER_MASK
 135 #define ACC_ALL          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
 136
 137 /* The mask for the R/X bits in EPT PTEs */
 138 #define PT64_EPT_READABLE_MASK                  0x1ull
 139 #define PT64_EPT_EXECUTABLE_MASK                0x4ull
 140
 141 #include <trace/events/kvm.h>
 142
     /*
      * Two flag bits starting at PT_FIRST_AVAIL_BITS_SHIFT.  Both must be
      * set for spte_can_locklessly_be_made_writable() to return true.
      */
 143 #define SPTE_HOST_WRITEABLE     (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
 144 #define SPTE_MMU_WRITEABLE      (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
 145
     /* Shadow page tables are indexed with the 64-bit index macro. */
 146 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
 147
 148 /* make pte_list_desc fit well in cache line */
 149 #define PTE_LIST_EXT 3
 150
 151 /*
 152  * Return values of handle_mmio_page_fault and mmu.page_fault:
 153  * RET_PF_RETRY: let CPU fault again on the address.
 154  * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
 155  *
 156  * For handle_mmio_page_fault only:
 157  * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
 158  */
 159 enum {
 160         RET_PF_RETRY = 0,
 161         RET_PF_EMULATE = 1,
 162         RET_PF_INVALID = 2,
 163 };
 164
 165 struct pte_list_desc {
             /* Inline storage for up to PTE_LIST_EXT spte pointers. */
 166         u64 *sptes[PTE_LIST_EXT];
             /* NOTE(review): presumably the next descriptor in a chain; the
              * code that links descriptors is not visible here -- confirm. */
 167         struct pte_list_desc *more;
 168 };
 169
 170 struct kvm_shadow_walk_iterator {
             /*
              * State carried by the for_each_shadow_entry*() macros, which pass
              * this structure to shadow_walk_init()/okay()/next().
              * NOTE(review): the walk helpers are not visible in this part of
              * the file, so the field notes below are assumptions to confirm.
              */
 171         u64 addr;               /* presumably the address being translated */
 172         hpa_t shadow_addr;      /* presumably the current table's host address */
 173         u64 *sptep;             /* current entry; read by the lockless walk macro */
 174         int level;              /* presumably the current paging level */
 175         unsigned index;         /* presumably the entry index within the table */
 176 };
 177
 178 static const union kvm_mmu_page_role mmu_base_role_mask = {
             /*
              * Constant role value with exactly the fields below set to 1.
              * NOTE(review): judging by the name this is used as a mask over
              * kvm_mmu_page_role; its users are not visible here -- confirm.
              */
 179         .cr0_wp = 1,
 180         .gpte_is_8_bytes = 1,
 181         .nxe = 1,
 182         .smep_andnot_wp = 1,
 183         .smap_andnot_wp = 1,
 184         .smm = 1,
 185         .guest_mode = 1,
 186         .ad_disabled = 1,
 187 };
 188
 189 #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
 190         for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
 191                                          (_root), (_addr));                \
 192              shadow_walk_okay(&(_walker));                                 \
 193              shadow_walk_next(&(_walker)))
 194
 195 #define for_each_shadow_entry(_vcpu, _addr, _walker)            \
 196         for (shadow_walk_init(&(_walker), _vcpu, _addr);        \
 197              shadow_walk_okay(&(_walker));                      \
 198              shadow_walk_next(&(_walker)))
 199
 200 #define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)     \
 201         for (shadow_walk_init(&(_walker), _vcpu, _addr);                \
 202              shadow_walk_okay(&(_walker)) &&                            \
 203                 ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });  \
 204              __shadow_walk_next(&(_walker), spte))
 205
 206 static struct kmem_cache *pte_list_desc_cache;
 207 static struct kmem_cache *mmu_page_header_cache;
 208 static struct percpu_counter kvm_total_used_mmu_pages;
 209
     /*
      * Shadow PTE bit masks.  The user/accessed/dirty/nx/x/present/acc_track/me
      * masks are assigned in kvm_mmu_set_mask_ptes(); the three mmio fields are
      * assigned in kvm_mmu_set_mmio_spte_mask().  kvm_mmu_reset_all_pte_masks()
      * zeroes most of them (it does not touch shadow_mmio_value,
      * shadow_mmio_access_mask or shadow_me_mask).
      */
 210 static u64 __read_mostly shadow_nx_mask;
 211 static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 212 static u64 __read_mostly shadow_user_mask;
 213 static u64 __read_mostly shadow_accessed_mask;
 214 static u64 __read_mostly shadow_dirty_mask;
 215 static u64 __read_mostly shadow_mmio_mask;
 216 static u64 __read_mostly shadow_mmio_value;
 217 static u64 __read_mostly shadow_mmio_access_mask;
 218 static u64 __read_mostly shadow_present_mask;
 219 static u64 __read_mostly shadow_me_mask;
 220
 221 /*
 222  * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
 223  * Non-present SPTEs with shadow_acc_track_value set are in place for access
 224  * tracking.
 225  */
 226 static u64 __read_mostly shadow_acc_track_mask;
 227 static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
 228
 229 /*
 230  * The mask/shift to use for saving the original R/X bits when marking the PTE
 231  * as not-present for access tracking purposes. We do not save the W bit as the
 232  * PTEs being access tracked also need to be dirty tracked, so the W bit will be
 233  * restored only when a write is attempted to the page.
 234  */
 235 static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
 236                                                     PT64_EPT_EXECUTABLE_MASK;
 237 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
 238
 239 /*
 240  * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
 241  * to guard against L1TF attacks.
 242  */
 243 static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
 244
 245 /*
 246  * The number of high-order 1 bits to use in the mask above.
 247  */
 248 static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
 249
 250 /*
 251  * In some cases, we need to preserve the GFN of a non-present or reserved
 252  * SPTE when we usurp the upper five bits of the physical address space to
 253  * defend against L1TF, e.g. for MMIO SPTEs.  To preserve the GFN, we'll
 254  * shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
 255  * left into the reserved bits, i.e. the GFN in the SPTE will be split into
 256  * high and low parts.  This mask covers the lower bits of the GFN.
 257  */
 258 static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
 259
 260 /*
 261  * The number of non-reserved physical address bits irrespective of features
 262  * that repurpose legal bits, e.g. MKTME.
 263  */
 264 static u8 __read_mostly shadow_phys_bits;
 265
 266 static void mmu_spte_set(u64 *sptep, u64 spte);
 267 static bool is_executable_pte(u64 spte);
 268 static union kvm_mmu_page_role
 269 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 270
 271 #define CREATE_TRACE_POINTS
 272 #include "mmutrace.h"
 273
 274
 275 static inline bool kvm_available_flush_tlb_with_range(void)
 276 {
 277         return kvm_x86_ops->tlb_remote_flush_with_range;
 278 }
 279
 280 static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm,
 281                 struct kvm_tlb_range *range)
 282 {
 283         int ret = -ENOTSUPP;
 284
 285         if (range && kvm_x86_ops->tlb_remote_flush_with_range)
 286                 ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range);
 287
 288         if (ret)
 289                 kvm_flush_remote_tlbs(kvm);
 290 }
 291
 292 static void kvm_flush_remote_tlbs_with_address(struct kvm *kvm,
 293                 u64 start_gfn, u64 pages)
 294 {
 295         struct kvm_tlb_range range;
 296
 297         range.start_gfn = start_gfn;
 298         range.pages = pages;
 299
 300         kvm_flush_remote_tlbs_with_range(kvm, &range);
 301 }
 302
 303 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
 304 {
 305         BUG_ON((u64)(unsigned)access_mask != access_mask);
 306         BUG_ON((mmio_mask & mmio_value) != mmio_value);
 307         shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
 308         shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
 309         shadow_mmio_access_mask = access_mask;
 310 }
 311 EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
 312
 313 static bool is_mmio_spte(u64 spte)
 314 {
 315         return (spte & shadow_mmio_mask) == shadow_mmio_value;
 316 }
 317
 318 static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
 319 {
 320         return sp->role.ad_disabled;
 321 }
 322
 323 static inline bool spte_ad_enabled(u64 spte)
 324 {
 325         MMU_WARN_ON(is_mmio_spte(spte));
 326         return !(spte & shadow_acc_track_value);
 327 }
 328
 329 static inline u64 spte_shadow_accessed_mask(u64 spte)
 330 {
 331         MMU_WARN_ON(is_mmio_spte(spte));
 332         return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
 333 }
 334
 335 static inline u64 spte_shadow_dirty_mask(u64 spte)
 336 {
 337         MMU_WARN_ON(is_mmio_spte(spte));
 338         return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
 339 }
 340
 341 static inline bool is_access_track_spte(u64 spte)
 342 {
 343         return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
 344 }
 345
 346 /*
 347  * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
 348  * the memslots generation and is derived as follows:
 349  *
 350  * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
 351  * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
 352  *
 353  * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
 354  * the MMIO generation number, as doing so would require stealing a bit from
 355  * the "real" generation number and thus effectively halve the maximum number
 356  * of MMIO generations that can be handled before encountering a wrap (which
 357  * requires a full MMU zap).  The flag is instead explicitly queried when
 358  * checking for MMIO spte cache hits.
 359  */
 360 #define MMIO_SPTE_GEN_MASK              GENMASK_ULL(18, 0)
 361
 362 #define MMIO_SPTE_GEN_LOW_START         3
 363 #define MMIO_SPTE_GEN_LOW_END           11
 364 #define MMIO_SPTE_GEN_LOW_MASK          GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
 365                                                     MMIO_SPTE_GEN_LOW_START)
 366
 367 #define MMIO_SPTE_GEN_HIGH_START        52
 368 #define MMIO_SPTE_GEN_HIGH_END          61
 369 #define MMIO_SPTE_GEN_HIGH_MASK         GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
 370                                                     MMIO_SPTE_GEN_HIGH_START)
 371 static u64 generation_mmio_spte_mask(u64 gen)
 372 {
 373         u64 mask;
 374
 375         WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 376
 377         mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
 378         mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 379         return mask;
 380 }
 381
 382 static u64 get_mmio_spte_generation(u64 spte)
 383 {
 384         u64 gen;
 385
 386         spte &= ~shadow_mmio_mask;
 387
 388         gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
 389         gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
 390         return gen;
 391 }
 392
 393 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 394                            unsigned access)
 395 {
 396         u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
 397         u64 mask = generation_mmio_spte_mask(gen);
 398         u64 gpa = gfn << PAGE_SHIFT;
 399
 400         access &= shadow_mmio_access_mask;
 401         mask |= shadow_mmio_value | access;
 402         mask |= gpa | shadow_nonpresent_or_rsvd_mask;
 403         mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 404                 << shadow_nonpresent_or_rsvd_mask_len;
 405
 406         trace_mark_mmio_spte(sptep, gfn, access, gen);
 407         mmu_spte_set(sptep, mask);
 408 }
 409
 410 static gfn_t get_mmio_spte_gfn(u64 spte)
 411 {
 412         u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
 413
 414         gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
 415                & shadow_nonpresent_or_rsvd_mask;
 416
 417         return gpa >> PAGE_SHIFT;
 418 }
 419
 420 static unsigned get_mmio_spte_access(u64 spte)
 421 {
 422         return spte & shadow_mmio_access_mask;
 423 }
 424
 425 static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 426                           kvm_pfn_t pfn, unsigned access)
 427 {
 428         if (unlikely(is_noslot_pfn(pfn))) {
 429                 mark_mmio_spte(vcpu, sptep, gfn, access);
 430                 return true;
 431         }
 432
 433         return false;
 434 }
 435
 436 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 437 {
 438         u64 kvm_gen, spte_gen, gen;
 439
 440         gen = kvm_vcpu_memslots(vcpu)->generation;
 441         if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
 442                 return false;
 443
 444         kvm_gen = gen & MMIO_SPTE_GEN_MASK;
 445         spte_gen = get_mmio_spte_generation(spte);
 446
 447         trace_check_mmio_spte(spte, kvm_gen, spte_gen);
 448         return likely(kvm_gen == spte_gen);
 449 }
 450
 451 /*
 452  * Sets the shadow PTE masks used by the MMU.
 453  *
 454  * Assumptions:
 455  *  - Setting either @accessed_mask or @dirty_mask requires setting both
 456  *  - At least one of @accessed_mask or @acc_track_mask must be set
 457  */
 458 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 459                 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
 460                 u64 acc_track_mask, u64 me_mask)
 461 {
 462         BUG_ON(!dirty_mask != !accessed_mask);
 463         BUG_ON(!accessed_mask && !acc_track_mask);
 464         BUG_ON(acc_track_mask & shadow_acc_track_value);
 465
 466         shadow_user_mask = user_mask;
 467         shadow_accessed_mask = accessed_mask;
 468         shadow_dirty_mask = dirty_mask;
 469         shadow_nx_mask = nx_mask;
 470         shadow_x_mask = x_mask;
 471         shadow_present_mask = p_mask;
 472         shadow_acc_track_mask = acc_track_mask;
 473         shadow_me_mask = me_mask;
 474 }
 475 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 476
 477 static u8 kvm_get_shadow_phys_bits(void)
 478 {
 479         /*
 480          * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
 481          * in CPU detection code, but MKTME treats those reduced bits as
 482          * 'keyID' thus they are not reserved bits. Therefore for MKTME
 483          * we should still return physical address bits reported by CPUID.
 484          */
 485         if (!boot_cpu_has(X86_FEATURE_TME) ||
 486             WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
 487                 return boot_cpu_data.x86_phys_bits;
 488
 489         return cpuid_eax(0x80000008) & 0xff;
 490 }
 491
 492 static void kvm_mmu_reset_all_pte_masks(void)
 493 {
 494         u8 low_phys_bits;
 495
 496         shadow_user_mask = 0;
 497         shadow_accessed_mask = 0;
 498         shadow_dirty_mask = 0;
 499         shadow_nx_mask = 0;
 500         shadow_x_mask = 0;
 501         shadow_mmio_mask = 0;
 502         shadow_present_mask = 0;
 503         shadow_acc_track_mask = 0;
 504
 505         shadow_phys_bits = kvm_get_shadow_phys_bits();
 506
 507         /*
 508          * If the CPU has 46 or less physical address bits, then set an
 509          * appropriate mask to guard against L1TF attacks. Otherwise, it is
 510          * assumed that the CPU is not vulnerable to L1TF.
 511          *
 512          * Some Intel CPUs address the L1 cache using more PA bits than are
 513          * reported by CPUID. Use the PA width of the L1 cache when possible
 514          * to achieve more effective mitigation, e.g. if system RAM overlaps
 515          * the most significant bits of legal physical address space.
 516          */
 517         shadow_nonpresent_or_rsvd_mask = 0;
 518         low_phys_bits = boot_cpu_data.x86_cache_bits;
 519         if (boot_cpu_data.x86_cache_bits <
 520             52 - shadow_nonpresent_or_rsvd_mask_len) {
 521                 shadow_nonpresent_or_rsvd_mask =
 522                         rsvd_bits(boot_cpu_data.x86_cache_bits -
 523                                   shadow_nonpresent_or_rsvd_mask_len,
 524                                   boot_cpu_data.x86_cache_bits - 1);
 525                 low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
 526         } else
 527                 WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
 528
 529         shadow_nonpresent_or_rsvd_lower_gfn_mask =
 530                 GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
 531 }
 532
 533 static int is_cpuid_PSE36(void)
 534 {
 535         return 1;
 536 }
 537
 538 static int is_nx(struct kvm_vcpu *vcpu)
 539 {
 540         return vcpu->arch.efer & EFER_NX;
 541 }
 542
 543 static int is_shadow_present_pte(u64 pte)
 544 {
 545         return (pte != 0) && !is_mmio_spte(pte);
 546 }
 547
 548 static int is_large_pte(u64 pte)
 549 {
 550         return pte & PT_PAGE_SIZE_MASK;
 551 }
 552
 553 static int is_last_spte(u64 pte, int level)
 554 {
 555         if (level == PT_PAGE_TABLE_LEVEL)
 556                 return 1;
 557         if (is_large_pte(pte))
 558                 return 1;
 559         return 0;
 560 }
 561
 562 static bool is_executable_pte(u64 spte)
 563 {
 564         return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
 565 }
 566
 567 static kvm_pfn_t spte_to_pfn(u64 pte)
 568 {
 569         return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
 570 }
 571
 572 static gfn_t pse36_gfn_delta(u32 gpte)
 573 {
 574         int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
 575
 576         return (gpte & PT32_DIR_PSE36_MASK) << shift;
 577 }
 578
 579 #ifdef CONFIG_X86_64
 580 static void __set_spte(u64 *sptep, u64 spte)
 581 {
 582         WRITE_ONCE(*sptep, spte);
 583 }
 584
 585 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 586 {
 587         WRITE_ONCE(*sptep, spte);
 588 }
 589
 590 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 591 {
 592         return xchg(sptep, spte);
 593 }
 594
 595 static u64 __get_spte_lockless(u64 *sptep)
 596 {
 597         return READ_ONCE(*sptep);
 598 }
 599 #else
 600 union split_spte {
             /*
              * View of a 64-bit SPTE as two 32-bit halves.  The 32-bit build's
              * accessors below store and load the halves separately, with
              * barriers between the two accesses.
              */
 601         struct {
 602                 u32 spte_low;
 603                 u32 spte_high;
 604         };
 605         u64 spte;
 606 };
 607
 608 static void count_spte_clear(u64 *sptep, u64 spte)
 609 {
 610         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 611
 612         if (is_shadow_present_pte(spte))
 613                 return;
 614
 615         /* Ensure the spte is completely set before we increase the count */
 616         smp_wmb();
 617         sp->clear_spte_count++;
 618 }
 619
 620 static void __set_spte(u64 *sptep, u64 spte)
 621 {
 622         union split_spte *ssptep, sspte;
 623
 624         ssptep = (union split_spte *)sptep;
 625         sspte = (union split_spte)spte;
 626
 627         ssptep->spte_high = sspte.spte_high;
 628
 629         /*
 630          * If we map the spte from nonpresent to present, We should store
 631          * the high bits firstly, then set present bit, so cpu can not
 632          * fetch this spte while we are setting the spte.
 633          */
 634         smp_wmb();
 635
 636         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 637 }
 638
 639 static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 640 {
 641         union split_spte *ssptep, sspte;
 642
 643         ssptep = (union split_spte *)sptep;
 644         sspte = (union split_spte)spte;
 645
 646         WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
 647
 648         /*
 649          * If we map the spte from present to nonpresent, we should clear
 650          * present bit firstly to avoid vcpu fetch the old high bits.
 651          */
 652         smp_wmb();
 653
 654         ssptep->spte_high = sspte.spte_high;
 655         count_spte_clear(sptep, spte);
 656 }
 657
 658 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 659 {
 660         union split_spte *ssptep, sspte, orig;
 661
 662         ssptep = (union split_spte *)sptep;
 663         sspte = (union split_spte)spte;
 664
 665         /* xchg acts as a barrier before the setting of the high bits */
 666         orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 667         orig.spte_high = ssptep->spte_high;
 668         ssptep->spte_high = sspte.spte_high;
 669         count_spte_clear(sptep, spte);
 670
 671         return orig.spte;
 672 }
 673
 674 /*
 675  * The idea using the light way get the spte on x86_32 guest is from
 676  * gup_get_pte (mm/gup.c).
 677  *
 678  * An spte tlb flush may be pending, because kvm_set_pte_rmapp
 679  * coalesces them and we are running out of the MMU lock.  Therefore
 680  * we need to protect against in-progress updates of the spte.
 681  *
 682  * Reading the spte while an update is in progress may get the old value
 683  * for the high part of the spte.  The race is fine for a present->non-present
 684  * change (because the high part of the spte is ignored for non-present spte),
 685  * but for a present->present change we must reread the spte.
 686  *
 687  * All such changes are done in two steps (present->non-present and
 688  * non-present->present), hence it is enough to count the number of
 689  * present->non-present updates: if it changed while reading the spte,
 690  * we might have hit the race.  This is done using clear_spte_count.
 691  */
 692 static u64 __get_spte_lockless(u64 *sptep)
 693 {
 694         struct kvm_mmu_page *sp =  page_header(__pa(sptep));
 695         union split_spte spte, *orig = (union split_spte *)sptep;
 696         int count;
 697
 698 retry:
 699         count = sp->clear_spte_count;
 700         smp_rmb();
 701
 702         spte.spte_low = orig->spte_low;
 703         smp_rmb();
 704
 705         spte.spte_high = orig->spte_high;
 706         smp_rmb();
 707
 708         if (unlikely(spte.spte_low != orig->spte_low ||
 709               count != sp->clear_spte_count))
 710                 goto retry;
 711
 712         return spte.spte;
 713 }
 714 #endif
 715
 716 static bool spte_can_locklessly_be_made_writable(u64 spte)
 717 {
 718         return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
 719                 (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
 720 }
 721
 722 static bool spte_has_volatile_bits(u64 spte)
 723 {
 724         if (!is_shadow_present_pte(spte))
 725                 return false;
 726
 727         /*
 728          * Always atomically update spte if it can be updated
 729          * out of mmu-lock, it can ensure dirty bit is not lost,
 730          * also, it can help us to get a stable is_writable_pte()
 731          * to ensure tlb flush is not missed.
 732          */
 733         if (spte_can_locklessly_be_made_writable(spte) ||
 734             is_access_track_spte(spte))
 735                 return true;
 736
 737         if (spte_ad_enabled(spte)) {
 738                 if ((spte & shadow_accessed_mask) == 0 ||
 739                     (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
 740                         return true;
 741         }
 742
 743         return false;
 744 }
 745
 746 static bool is_accessed_spte(u64 spte)
 747 {
 748         u64 accessed_mask = spte_shadow_accessed_mask(spte);
 749
 750         return accessed_mask ? spte & accessed_mask
 751                              : !is_access_track_spte(spte);
 752 }
 753
 754 static bool is_dirty_spte(u64 spte)
 755 {
 756         u64 dirty_mask = spte_shadow_dirty_mask(spte);
 757
 758         return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
 759 }
 760
 761 /* Rules for using mmu_spte_set:
 762  * Set the sptep from nonpresent to present.
 763  * Note: the sptep being assigned *must* be either not present
 764  * or in a state where the hardware will not attempt to update
 765  * the spte.
 766  */
 767 static void mmu_spte_set(u64 *sptep, u64 new_spte)
 768 {
 769         WARN_ON(is_shadow_present_pte(*sptep));
 770         __set_spte(sptep, new_spte);
 771 }
 772
 773 /*
 774  * Update the SPTE (excluding the PFN), but do not track changes in its
 775  * accessed/dirty status.
 776  */
 777 static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
 778 {
 779         u64 old_spte = *sptep;
 780
 781         WARN_ON(!is_shadow_present_pte(new_spte));
 782
 783         if (!is_shadow_present_pte(old_spte)) {
 784                 mmu_spte_set(sptep, new_spte);
 785                 return old_spte;
 786         }
 787
 788         if (!spte_has_volatile_bits(old_spte))
 789                 __update_clear_spte_fast(sptep, new_spte);
 790         else
 791                 old_spte = __update_clear_spte_slow(sptep, new_spte);
 792
 793         WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
 794
 795         return old_spte;
 796 }
 797
 798 /* Rules for using mmu_spte_update:
 799  * Update the state bits, it means the mapped pfn is not changed.
 800  *
 801  * Whenever we overwrite a writable spte with a read-only one we
 802  * should flush remote TLBs. Otherwise rmap_write_protect
 803  * will find a read-only spte, even though the writable spte
 804  * might be cached on a CPU's TLB, the return value indicates this
 805  * case.
 806  *
 807  * Returns true if the TLB needs to be flushed
 808  */
 809 static bool mmu_spte_update(u64 *sptep, u64 new_spte)
 810 {
 811         bool flush = false;
 812         u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
 813
 814         if (!is_shadow_present_pte(old_spte))
 815                 return false;
 816
 817         /*
 818          * For the spte updated out of mmu-lock is safe, since
 819          * we always atomically update it, see the comments in
 820          * spte_has_volatile_bits().
 821          */
 822         if (spte_can_locklessly_be_made_writable(old_spte) &&
 823               !is_writable_pte(new_spte))
 824                 flush = true;
 825
 826         /*
 827          * Flush TLB when accessed/dirty states are changed in the page tables,
 828          * to guarantee consistency between TLB and page tables.
 829          */
 830
 831         if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
 832                 flush = true;
 833                 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
 834         }
 835
 836         if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
 837                 flush = true;
 838                 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
 839         }
 840
 841         return flush;
 842 }
 843
 844 /*
 845  * Rules for using mmu_spte_clear_track_bits:
 846  * It sets the sptep from present to nonpresent, and track the
 847  * state bits, it is used to clear the last level sptep.
 848  * Returns non-zero if the PTE was previously valid.
 849  */
 850 static int mmu_spte_clear_track_bits(u64 *sptep)
 851 {
 852         kvm_pfn_t pfn;
 853         u64 old_spte = *sptep;
 854
 855         if (!spte_has_volatile_bits(old_spte))
 856                 __update_clear_spte_fast(sptep, 0ull);
 857         else
 858                 old_spte = __update_clear_spte_slow(sptep, 0ull);
 859
 860         if (!is_shadow_present_pte(old_spte))
 861                 return 0;
 862
 863         pfn = spte_to_pfn(old_spte);
 864
 865         /*
 866          * KVM does not hold the refcount of the page used by
 867          * kvm mmu, before reclaiming the page, we should
 868          * unmap it from mmu first.
 869          */
 870         WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
 871
 872         if (is_accessed_spte(old_spte))
 873                 kvm_set_pfn_accessed(pfn);
 874
 875         if (is_dirty_spte(old_spte))
 876                 kvm_set_pfn_dirty(pfn);
 877
 878         return 1;
 879 }
 880
 881 /*
 882  * Rules for using mmu_spte_clear_no_track:
 883  * Directly clear spte without caring the state bits of sptep,
 884  * it is used to set the upper level spte.
 885  */
 886 static void mmu_spte_clear_no_track(u64 *sptep)
 887 {
 888         __update_clear_spte_fast(sptep, 0ull);
 889 }
 890
 891 static u64 mmu_spte_get_lockless(u64 *sptep)
 892 {
 893         return __get_spte_lockless(sptep);
 894 }
 895
 896 static u64 mark_spte_for_access_track(u64 spte)
 897 {
 898         if (spte_ad_enabled(spte))
 899                 return spte & ~shadow_accessed_mask;
 900
 901         if (is_access_track_spte(spte))
 902                 return spte;
 903
 904         /*
 905          * Making an Access Tracking PTE will result in removal of write access
 906          * from the PTE. So, verify that we will be able to restore the write
 907          * access in the fast page fault path later on.
 908          */
 909         WARN_ONCE((spte & PT_WRITABLE_MASK) &&
 910                   !spte_can_locklessly_be_made_writable(spte),
 911                   "kvm: Writable SPTE is not locklessly dirty-trackable\n");
 912
 913         WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
 914                           shadow_acc_track_saved_bits_shift),
 915                   "kvm: Access Tracking saved bit locations are not zero\n");
 916
 917         spte |= (spte & shadow_acc_track_saved_bits_mask) <<
 918                 shadow_acc_track_saved_bits_shift;
 919         spte &= ~shadow_acc_track_mask;
 920
 921         return spte;
 922 }
 923
 924 /* Restore an acc-track PTE back to a regular PTE */
 925 static u64 restore_acc_track_spte(u64 spte)
 926 {
 927         u64 new_spte = spte;
 928         u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
 929                          & shadow_acc_track_saved_bits_mask;
 930
 931         WARN_ON_ONCE(spte_ad_enabled(spte));
 932         WARN_ON_ONCE(!is_access_track_spte(spte));
 933
 934         new_spte &= ~shadow_acc_track_mask;
 935         new_spte &= ~(shadow_acc_track_saved_bits_mask <<
 936                       shadow_acc_track_saved_bits_shift);
 937         new_spte |= saved_bits;
 938
 939         return new_spte;
 940 }
 941
 942 /* Returns the Accessed status of the PTE and resets it at the same time. */
 943 static bool mmu_spte_age(u64 *sptep)
 944 {
 945         u64 spte = mmu_spte_get_lockless(sptep);
 946
 947         if (!is_accessed_spte(spte))
 948                 return false;
 949
 950         if (spte_ad_enabled(spte)) {
 951                 clear_bit((ffs(shadow_accessed_mask) - 1),
 952                           (unsigned long *)sptep);
 953         } else {
 954                 /*
 955                  * Capture the dirty status of the page, so that it doesn't get
 956                  * lost when the SPTE is marked for access tracking.
 957                  */
 958                 if (is_writable_pte(spte))
 959                         kvm_set_pfn_dirty(spte_to_pfn(spte));
 960
 961                 spte = mark_spte_for_access_track(spte);
 962                 mmu_spte_update_no_track(sptep, spte);
 963         }
 964
 965         return true;
 966 }
 967
 968 static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
 969 {
 970         /*
 971          * Prevent page table teardown by making any free-er wait during
 972          * kvm_flush_remote_tlbs() IPI to all active vcpus.
 973          */
 974         local_irq_disable();
 975
 976         /*
 977          * Make sure a following spte read is not reordered ahead of the write
 978          * to vcpu->mode.
 979          */
 980         smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
 981 }
 982
 983 static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
 984 {
 985         /*
 986          * Make sure the write to vcpu->mode is not reordered in front of
 987          * reads to sptes.  If it does, kvm_mmu_commit_zap_page() can see us
 988          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
 989          */
 990         smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
 991         local_irq_enable();
 992 }
 993
 994 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 995                                   struct kmem_cache *base_cache, int min)
 996 {
 997         void *obj;
 998
 999         if (cache->nobjs >= min)
1000                 return 0;
1001         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1002                 obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
1003                 if (!obj)
1004                         return cache->nobjs >= min ? 0 : -ENOMEM;
1005                 cache->objects[cache->nobjs++] = obj;
1006         }
1007         return 0;
1008 }
1009
1010 static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
1011 {
1012         return cache->nobjs;
1013 }
1014
1015 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
1016                                   struct kmem_cache *cache)
1017 {
1018         while (mc->nobjs)
1019                 kmem_cache_free(cache, mc->objects[--mc->nobjs]);
1020 }
1021
1022 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
1023                                        int min)
1024 {
1025         void *page;
1026
1027         if (cache->nobjs >= min)
1028                 return 0;
1029         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
1030                 page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
1031                 if (!page)
1032                         return cache->nobjs >= min ? 0 : -ENOMEM;
1033                 cache->objects[cache->nobjs++] = page;
1034         }
1035         return 0;
1036 }
1037
1038 static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
1039 {
1040         while (mc->nobjs)
1041                 free_page((unsigned long)mc->objects[--mc->nobjs]);
1042 }
1043
1044 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
1045 {
1046         int r;
1047
1048         r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1049                                    pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
1050         if (r)
1051                 goto out;
1052         r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
1053         if (r)
1054                 goto out;
1055         r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
1056                                    mmu_page_header_cache, 4);
1057 out:
1058         return r;
1059 }
1060
1061 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
1062 {
1063         mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
1064                                 pte_list_desc_cache);
1065         mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
1066         mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
1067                                 mmu_page_header_cache);
1068 }
1069
1070 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
1071 {
1072         void *p;
1073
1074         BUG_ON(!mc->nobjs);
1075         p = mc->objects[--mc->nobjs];
1076         return p;
1077 }
1078
1079 static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu)
1080 {
1081         return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
1082 }
1083
1084 static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
1085 {
1086         kmem_cache_free(pte_list_desc_cache, pte_list_desc);
1087 }
1088
1089 static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
1090 {
1091         if (!sp->role.direct)
1092                 return sp->gfns[index];
1093
1094         return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
1095 }
1096
1097 static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
1098 {
1099         if (!sp->role.direct) {
1100                 sp->gfns[index] = gfn;
1101                 return;
1102         }
1103
1104         if (WARN_ON(gfn != kvm_mmu_page_get_gfn(sp, index)))
1105                 pr_err_ratelimited("gfn mismatch under direct page %llx "
1106                                    "(expected %llx, got %llx)\n",
1107                                    sp->gfn,
1108                                    kvm_mmu_page_get_gfn(sp, index), gfn);
1109 }
1110
1111 /*
1112  * Return the pointer to the large page information for a given gfn,
1113  * handling slots that are not large page aligned.
1114  */
1115 static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
1116                                               struct kvm_memory_slot *slot,
1117                                               int level)
1118 {
1119         unsigned long idx;
1120
1121         idx = gfn_to_index(gfn, slot->base_gfn, level);
1122         return &slot->arch.lpage_info[level - 2][idx];
1123 }
1124
1125 static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
1126                                             gfn_t gfn, int count)
1127 {
1128         struct kvm_lpage_info *linfo;
1129         int i;
1130
1131         for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1132                 linfo = lpage_info_slot(gfn, slot, i);
1133                 linfo->disallow_lpage += count;
1134                 WARN_ON(linfo->disallow_lpage < 0);
1135         }
1136 }
1137
1138 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1139 {
1140         update_gfn_disallow_lpage_count(slot, gfn, 1);
1141 }
1142
1143 void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
1144 {
1145         update_gfn_disallow_lpage_count(slot, gfn, -1);
1146 }
1147
1148 static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1149 {
1150         struct kvm_memslots *slots;
1151         struct kvm_memory_slot *slot;
1152         gfn_t gfn;
1153
1154         kvm->arch.indirect_shadow_pages++;
1155         gfn = sp->gfn;
1156         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1157         slot = __gfn_to_memslot(slots, gfn);
1158
1159         /* the non-leaf shadow pages are keeping readonly. */
1160         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1161                 return kvm_slot_page_track_add_page(kvm, slot, gfn,
1162                                                     KVM_PAGE_TRACK_WRITE);
1163
1164         kvm_mmu_gfn_disallow_lpage(slot, gfn);
1165 }
1166
1167 static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
1168 {
1169         struct kvm_memslots *slots;
1170         struct kvm_memory_slot *slot;
1171         gfn_t gfn;
1172
1173         kvm->arch.indirect_shadow_pages--;
1174         gfn = sp->gfn;
1175         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1176         slot = __gfn_to_memslot(slots, gfn);
1177         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
1178                 return kvm_slot_page_track_remove_page(kvm, slot, gfn,
1179                                                        KVM_PAGE_TRACK_WRITE);
1180
1181         kvm_mmu_gfn_allow_lpage(slot, gfn);
1182 }
1183
1184 static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
1185                                           struct kvm_memory_slot *slot)
1186 {
1187         struct kvm_lpage_info *linfo;
1188
1189         if (slot) {
1190                 linfo = lpage_info_slot(gfn, slot, level);
1191                 return !!linfo->disallow_lpage;
1192         }
1193
1194         return true;
1195 }
1196
1197 static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
1198                                         int level)
1199 {
1200         struct kvm_memory_slot *slot;
1201
1202         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1203         return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
1204 }
1205
1206 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
1207 {
1208         unsigned long page_size;
1209         int i, ret = 0;
1210
1211         page_size = kvm_host_page_size(kvm, gfn);
1212
1213         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1214                 if (page_size >= KVM_HPAGE_SIZE(i))
1215                         ret = i;
1216                 else
1217                         break;
1218         }
1219
1220         return ret;
1221 }
1222
1223 static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
1224                                           bool no_dirty_log)
1225 {
1226         if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
1227                 return false;
1228         if (no_dirty_log && slot->dirty_bitmap)
1229                 return false;
1230
1231         return true;
1232 }
1233
1234 static struct kvm_memory_slot *
1235 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
1236                             bool no_dirty_log)
1237 {
1238         struct kvm_memory_slot *slot;
1239
1240         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1241         if (!memslot_valid_for_gpte(slot, no_dirty_log))
1242                 slot = NULL;
1243
1244         return slot;
1245 }
1246
1247 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
1248                          bool *force_pt_level)
1249 {
1250         int host_level, level, max_level;
1251         struct kvm_memory_slot *slot;
1252
1253         if (unlikely(*force_pt_level))
1254                 return PT_PAGE_TABLE_LEVEL;
1255
1256         slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
1257         *force_pt_level = !memslot_valid_for_gpte(slot, true);
1258         if (unlikely(*force_pt_level))
1259                 return PT_PAGE_TABLE_LEVEL;
1260
1261         host_level = host_mapping_level(vcpu->kvm, large_gfn);
1262
1263         if (host_level == PT_PAGE_TABLE_LEVEL)
1264                 return host_level;
1265
1266         max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
1267
1268         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
1269                 if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
1270                         break;
1271
1272         return level - 1;
1273 }
1274
1275 /*
1276  * About rmap_head encoding:
1277  *
1278  * If the bit zero of rmap_head->val is clear, then it points to the only spte
1279  * in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
1280  * pte_list_desc containing more mappings.
1281  */
1282
1283 /*
1284  * Returns the number of pointers in the rmap chain, not counting the new one.
1285  */
1286 static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
1287                         struct kvm_rmap_head *rmap_head)
1288 {
1289         struct pte_list_desc *desc;
1290         int i, count = 0;
1291
1292         if (!rmap_head->val) {
1293                 rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
1294                 rmap_head->val = (unsigned long)spte;
1295         } else if (!(rmap_head->val & 1)) {
1296                 rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
1297                 desc = mmu_alloc_pte_list_desc(vcpu);
1298                 desc->sptes[0] = (u64 *)rmap_head->val;
1299                 desc->sptes[1] = spte;
1300                 rmap_head->val = (unsigned long)desc | 1;
1301                 ++count;
1302         } else {
1303                 rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
1304                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1305                 while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
1306                         desc = desc->more;
1307                         count += PTE_LIST_EXT;
1308                 }
1309                 if (desc->sptes[PTE_LIST_EXT-1]) {
1310                         desc->more = mmu_alloc_pte_list_desc(vcpu);
1311                         desc = desc->more;
1312                 }
1313                 for (i = 0; desc->sptes[i]; ++i)
1314                         ++count;
1315                 desc->sptes[i] = spte;
1316         }
1317         return count;
1318 }
1319
1320 static void
1321 pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
1322                            struct pte_list_desc *desc, int i,
1323                            struct pte_list_desc *prev_desc)
1324 {
1325         int j;
1326
1327         for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
1328                 ;
1329         desc->sptes[i] = desc->sptes[j];
1330         desc->sptes[j] = NULL;
1331         if (j != 0)
1332                 return;
1333         if (!prev_desc && !desc->more)
1334                 rmap_head->val = (unsigned long)desc->sptes[0];
1335         else
1336                 if (prev_desc)
1337                         prev_desc->more = desc->more;
1338                 else
1339                         rmap_head->val = (unsigned long)desc->more | 1;
1340         mmu_free_pte_list_desc(desc);
1341 }
1342
1343 static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head)
1344 {
1345         struct pte_list_desc *desc;
1346         struct pte_list_desc *prev_desc;
1347         int i;
1348
1349         if (!rmap_head->val) {
1350                 pr_err("%s: %p 0->BUG\n", __func__, spte);
1351                 BUG();
1352         } else if (!(rmap_head->val & 1)) {
1353                 rmap_printk("%s:  %p 1->0\n", __func__, spte);
1354                 if ((u64 *)rmap_head->val != spte) {
1355                         pr_err("%s:  %p 1->BUG\n", __func__, spte);
1356                         BUG();
1357                 }
1358                 rmap_head->val = 0;
1359         } else {
1360                 rmap_printk("%s:  %p many->many\n", __func__, spte);
1361                 desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1362                 prev_desc = NULL;
1363                 while (desc) {
1364                         for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
1365                                 if (desc->sptes[i] == spte) {
1366                                         pte_list_desc_remove_entry(rmap_head,
1367                                                         desc, i, prev_desc);
1368                                         return;
1369                                 }
1370                         }
1371                         prev_desc = desc;
1372                         desc = desc->more;
1373                 }
1374                 pr_err("%s: %p many->many\n", __func__, spte);
1375                 BUG();
1376         }
1377 }
1378
1379 static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep)
1380 {
1381         mmu_spte_clear_track_bits(sptep);
1382         __pte_list_remove(sptep, rmap_head);
1383 }
1384
1385 static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
1386                                            struct kvm_memory_slot *slot)
1387 {
1388         unsigned long idx;
1389
1390         idx = gfn_to_index(gfn, slot->base_gfn, level);
1391         return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
1392 }
1393
1394 static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn,
1395                                          struct kvm_mmu_page *sp)
1396 {
1397         struct kvm_memslots *slots;
1398         struct kvm_memory_slot *slot;
1399
1400         slots = kvm_memslots_for_spte_role(kvm, sp->role);
1401         slot = __gfn_to_memslot(slots, gfn);
1402         return __gfn_to_rmap(gfn, sp->role.level, slot);
1403 }
1404
1405 static bool rmap_can_add(struct kvm_vcpu *vcpu)
1406 {
1407         struct kvm_mmu_memory_cache *cache;
1408
1409         cache = &vcpu->arch.mmu_pte_list_desc_cache;
1410         return mmu_memory_cache_free_objects(cache);
1411 }
1412
1413 static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
1414 {
1415         struct kvm_mmu_page *sp;
1416         struct kvm_rmap_head *rmap_head;
1417
1418         sp = page_header(__pa(spte));
1419         kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
1420         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
1421         return pte_list_add(vcpu, spte, rmap_head);
1422 }
1423
1424 static void rmap_remove(struct kvm *kvm, u64 *spte)
1425 {
1426         struct kvm_mmu_page *sp;
1427         gfn_t gfn;
1428         struct kvm_rmap_head *rmap_head;
1429
1430         sp = page_header(__pa(spte));
1431         gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
1432         rmap_head = gfn_to_rmap(kvm, gfn, sp);
1433         __pte_list_remove(spte, rmap_head);
1434 }
1435
/*
 * Cursor used by rmap_get_first()/rmap_get_next() to walk the sptes linked
 * from one rmap head.  The fields are private to those helpers and must not
 * be relied upon elsewhere.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* current descriptor, NULL if none */
	int pos;			/* slot of the current sptep in desc */
};
1445
1446 /*
1447  * Iteration must be started by this function.  This should also be used after
1448  * removing/dropping sptes from the rmap link because in such cases the
1449  * information in the itererator may not be valid.
1450  *
1451  * Returns sptep if found, NULL otherwise.
1452  */
1453 static u64 *rmap_get_first(struct kvm_rmap_head *rmap_head,
1454                            struct rmap_iterator *iter)
1455 {
1456         u64 *sptep;
1457
1458         if (!rmap_head->val)
1459                 return NULL;
1460
1461         if (!(rmap_head->val & 1)) {
1462                 iter->desc = NULL;
1463                 sptep = (u64 *)rmap_head->val;
1464                 goto out;
1465         }
1466
1467         iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
1468         iter->pos = 0;
1469         sptep = iter->desc->sptes[iter->pos];
1470 out:
1471         BUG_ON(!is_shadow_present_pte(*sptep));
1472         return sptep;
1473 }
1474
1475 /*
1476  * Must be used with a valid iterator: e.g. after rmap_get_first().
1477  *
1478  * Returns sptep if found, NULL otherwise.
1479  */
1480 static u64 *rmap_get_next(struct rmap_iterator *iter)
1481 {
1482         u64 *sptep;
1483
1484         if (iter->desc) {
1485                 if (iter->pos < PTE_LIST_EXT - 1) {
1486                         ++iter->pos;
1487                         sptep = iter->desc->sptes[iter->pos];
1488                         if (sptep)
1489                                 goto out;
1490                 }
1491
1492                 iter->desc = iter->desc->more;
1493
1494                 if (iter->desc) {
1495                         iter->pos = 0;
1496                         /* desc->sptes[0] cannot be NULL */
1497                         sptep = iter->desc->sptes[iter->pos];
1498                         goto out;
1499                 }
1500         }
1501
1502         return NULL;
1503 out:
1504         BUG_ON(!is_shadow_present_pte(*sptep));
1505         return sptep;
1506 }
1507
/*
 * Visit every sptep on the rmap chain at _head_, using _it_ as the cursor.
 * The loop ends when rmap_get_first()/rmap_get_next() return NULL.
 */
#define for_each_rmap_spte(_head_, _it_, _sptep_)			\
	for (_sptep_ = rmap_get_first(_head_, _it_);			\
	     _sptep_;							\
	     _sptep_ = rmap_get_next(_it_))
1511
1512 static void drop_spte(struct kvm *kvm, u64 *sptep)
1513 {
1514         if (mmu_spte_clear_track_bits(sptep))
1515                 rmap_remove(kvm, sptep);
1516 }
1517
1518
1519 static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
1520 {
1521         if (is_large_pte(*sptep)) {
1522                 WARN_ON(page_header(__pa(sptep))->role.level ==
1523                         PT_PAGE_TABLE_LEVEL);
1524                 drop_spte(kvm, sptep);
1525                 --kvm->stat.lpages;
1526                 return true;
1527         }
1528
1529         return false;
1530 }
1531
1532 static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
1533 {
1534         if (__drop_large_spte(vcpu->kvm, sptep)) {
1535                 struct kvm_mmu_page *sp = page_header(__pa(sptep));
1536
1537                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
1538                         KVM_PAGES_PER_HPAGE(sp->role.level));
1539         }
1540 }
1541
1542 /*
1543  * Write-protect on the specified @sptep, @pt_protect indicates whether
1544  * spte write-protection is caused by protecting shadow page table.
1545  *
1546  * Note: write protection is difference between dirty logging and spte
1547  * protection:
1548  * - for dirty logging, the spte can be set to writable at anytime if
1549  *   its dirty bitmap is properly set.
1550  * - for spte protection, the spte can be writable only after unsync-ing
1551  *   shadow page.
1552  *
1553  * Return true if tlb need be flushed.
1554  */
1555 static bool spte_write_protect(u64 *sptep, bool pt_protect)
1556 {
1557         u64 spte = *sptep;
1558
1559         if (!is_writable_pte(spte) &&
1560               !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
1561                 return false;
1562
1563         rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
1564
1565         if (pt_protect)
1566                 spte &= ~SPTE_MMU_WRITEABLE;
1567         spte = spte & ~PT_WRITABLE_MASK;
1568
1569         return mmu_spte_update(sptep, spte);
1570 }
1571
1572 static bool __rmap_write_protect(struct kvm *kvm,
1573                                  struct kvm_rmap_head *rmap_head,
1574                                  bool pt_protect)
1575 {
1576         u64 *sptep;
1577         struct rmap_iterator iter;
1578         bool flush = false;
1579
1580         for_each_rmap_spte(rmap_head, &iter, sptep)
1581                 flush |= spte_write_protect(sptep, pt_protect);
1582
1583         return flush;
1584 }
1585
1586 static bool spte_clear_dirty(u64 *sptep)
1587 {
1588         u64 spte = *sptep;
1589
1590         rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
1591
1592         spte &= ~shadow_dirty_mask;
1593
1594         return mmu_spte_update(sptep, spte);
1595 }
1596
1597 static bool wrprot_ad_disabled_spte(u64 *sptep)
1598 {
1599         bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
1600                                                (unsigned long *)sptep);
1601         if (was_writable)
1602                 kvm_set_pfn_dirty(spte_to_pfn(*sptep));
1603
1604         return was_writable;
1605 }
1606
1607 /*
1608  * Gets the GFN ready for another round of dirty logging by clearing the
1609  *      - D bit on ad-enabled SPTEs, and
1610  *      - W bit on ad-disabled SPTEs.
1611  * Returns true iff any D or W bits were cleared.
1612  */
1613 static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1614 {
1615         u64 *sptep;
1616         struct rmap_iterator iter;
1617         bool flush = false;
1618
1619         for_each_rmap_spte(rmap_head, &iter, sptep)
1620                 if (spte_ad_enabled(*sptep))
1621                         flush |= spte_clear_dirty(sptep);
1622                 else
1623                         flush |= wrprot_ad_disabled_spte(sptep);
1624
1625         return flush;
1626 }
1627
1628 static bool spte_set_dirty(u64 *sptep)
1629 {
1630         u64 spte = *sptep;
1631
1632         rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
1633
1634         spte |= shadow_dirty_mask;
1635
1636         return mmu_spte_update(sptep, spte);
1637 }
1638
1639 static bool __rmap_set_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1640 {
1641         u64 *sptep;
1642         struct rmap_iterator iter;
1643         bool flush = false;
1644
1645         for_each_rmap_spte(rmap_head, &iter, sptep)
1646                 if (spte_ad_enabled(*sptep))
1647                         flush |= spte_set_dirty(sptep);
1648
1649         return flush;
1650 }
1651
1652 /**
1653  * kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1654  * @kvm: kvm instance
1655  * @slot: slot to protect
1656  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1657  * @mask: indicates which pages we should protect
1658  *
1659  * Used when we do not need to care about huge page mappings: e.g. during dirty
1660  * logging we do not have any such mappings.
1661  */
1662 static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1663                                      struct kvm_memory_slot *slot,
1664                                      gfn_t gfn_offset, unsigned long mask)
1665 {
1666         struct kvm_rmap_head *rmap_head;
1667
1668         while (mask) {
1669                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1670                                           PT_PAGE_TABLE_LEVEL, slot);
1671                 __rmap_write_protect(kvm, rmap_head, false);
1672
1673                 /* clear the first set bit */
1674                 mask &= mask - 1;
1675         }
1676 }
1677
1678 /**
1679  * kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1680  * protect the page if the D-bit isn't supported.
1681  * @kvm: kvm instance
1682  * @slot: slot to clear D-bit
1683  * @gfn_offset: start of the BITS_PER_LONG pages we care about
1684  * @mask: indicates which pages we should clear D-bit
1685  *
1686  * Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1687  */
1688 void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1689                                      struct kvm_memory_slot *slot,
1690                                      gfn_t gfn_offset, unsigned long mask)
1691 {
1692         struct kvm_rmap_head *rmap_head;
1693
1694         while (mask) {
1695                 rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
1696                                           PT_PAGE_TABLE_LEVEL, slot);
1697                 __rmap_clear_dirty(kvm, rmap_head);
1698
1699                 /* clear the first set bit */
1700                 mask &= mask - 1;
1701         }
1702 }
1703 EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
1704
1705 /**
1706  * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1707  * PT level pages.
1708  *
1709  * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1710  * enable dirty logging for them.
1711  *
1712  * Used when we do not need to care about huge page mappings: e.g. during dirty
1713  * logging we do not have any such mappings.
1714  */
1715 void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1716                                 struct kvm_memory_slot *slot,
1717                                 gfn_t gfn_offset, unsigned long mask)
1718 {
1719         if (kvm_x86_ops->enable_log_dirty_pt_masked)
1720                 kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
1721                                 mask);
1722         else
1723                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1724 }
1725
1726 /**
1727  * kvm_arch_write_log_dirty - emulate dirty page logging
1728  * @vcpu: Guest mode vcpu
1729  *
1730  * Emulate arch specific page modification logging for the
1731  * nested hypervisor
1732  */
1733 int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
1734 {
1735         if (kvm_x86_ops->write_log_dirty)
1736                 return kvm_x86_ops->write_log_dirty(vcpu);
1737
1738         return 0;
1739 }
1740
1741 bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1742                                     struct kvm_memory_slot *slot, u64 gfn)
1743 {
1744         struct kvm_rmap_head *rmap_head;
1745         int i;
1746         bool write_protected = false;
1747
1748         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
1749                 rmap_head = __gfn_to_rmap(gfn, i, slot);
1750                 write_protected |= __rmap_write_protect(kvm, rmap_head, true);
1751         }
1752
1753         return write_protected;
1754 }
1755
1756 static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
1757 {
1758         struct kvm_memory_slot *slot;
1759
1760         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1761         return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
1762 }
1763
1764 static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
1765 {
1766         u64 *sptep;
1767         struct rmap_iterator iter;
1768         bool flush = false;
1769
1770         while ((sptep = rmap_get_first(rmap_head, &iter))) {
1771                 rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
1772
1773                 pte_list_remove(rmap_head, sptep);
1774                 flush = true;
1775         }
1776
1777         return flush;
1778 }
1779
1780 static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1781                            struct kvm_memory_slot *slot, gfn_t gfn, int level,
1782                            unsigned long data)
1783 {
1784         return kvm_zap_rmapp(kvm, rmap_head);
1785 }
1786
1787 static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1788                              struct kvm_memory_slot *slot, gfn_t gfn, int level,
1789                              unsigned long data)
1790 {
1791         u64 *sptep;
1792         struct rmap_iterator iter;
1793         int need_flush = 0;
1794         u64 new_spte;
1795         pte_t *ptep = (pte_t *)data;
1796         kvm_pfn_t new_pfn;
1797
1798         WARN_ON(pte_huge(*ptep));
1799         new_pfn = pte_pfn(*ptep);
1800
1801 restart:
1802         for_each_rmap_spte(rmap_head, &iter, sptep) {
1803                 rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
1804                             sptep, *sptep, gfn, level);
1805
1806                 need_flush = 1;
1807
1808                 if (pte_write(*ptep)) {
1809                         pte_list_remove(rmap_head, sptep);
1810                         goto restart;
1811                 } else {
1812                         new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
1813                         new_spte |= (u64)new_pfn << PAGE_SHIFT;
1814
1815                         new_spte &= ~PT_WRITABLE_MASK;
1816                         new_spte &= ~SPTE_HOST_WRITEABLE;
1817
1818                         new_spte = mark_spte_for_access_track(new_spte);
1819
1820                         mmu_spte_clear_track_bits(sptep);
1821                         mmu_spte_set(sptep, new_spte);
1822                 }
1823         }
1824
1825         if (need_flush && kvm_available_flush_tlb_with_range()) {
1826                 kvm_flush_remote_tlbs_with_address(kvm, gfn, 1);
1827                 return 0;
1828         }
1829
1830         return need_flush;
1831 }
1832
1833 struct slot_rmap_walk_iterator {
1834         /* input fields. */
1835         struct kvm_memory_slot *slot;
1836         gfn_t start_gfn;
1837         gfn_t end_gfn;
1838         int start_level;
1839         int end_level;
1840
1841         /* output fields. */
1842         gfn_t gfn;
1843         struct kvm_rmap_head *rmap;
1844         int level;
1845
1846         /* private field. */
1847         struct kvm_rmap_head *end_rmap;
1848 };
1849
1850 static void
1851 rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
1852 {
1853         iterator->level = level;
1854         iterator->gfn = iterator->start_gfn;
1855         iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
1856         iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
1857                                            iterator->slot);
1858 }
1859
1860 static void
1861 slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1862                     struct kvm_memory_slot *slot, int start_level,
1863                     int end_level, gfn_t start_gfn, gfn_t end_gfn)
1864 {
1865         iterator->slot = slot;
1866         iterator->start_level = start_level;
1867         iterator->end_level = end_level;
1868         iterator->start_gfn = start_gfn;
1869         iterator->end_gfn = end_gfn;
1870
1871         rmap_walk_init_level(iterator, iterator->start_level);
1872 }
1873
1874 static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1875 {
1876         return !!iterator->rmap;
1877 }
1878
1879 static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1880 {
1881         if (++iterator->rmap <= iterator->end_rmap) {
1882                 iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
1883                 return;
1884         }
1885
1886         if (++iterator->level > iterator->end_level) {
1887                 iterator->rmap = NULL;
1888                 return;
1889         }
1890
1891         rmap_walk_init_level(iterator, iterator->level);
1892 }
1893
/*
 * Visit each rmap head of @_slot_ covering gfns @_start_gfn..@_end_gfn
 * (inclusive) at levels @_start_level_..@_end_level_ (inclusive).
 * @_iter_ is a pointer to a struct slot_rmap_walk_iterator; its gfn, rmap
 * and level fields describe the current position inside the loop body.
 */
#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
				 _start_gfn, _end_gfn, _iter_)		\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,	\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))
1900
1901 static int kvm_handle_hva_range(struct kvm *kvm,
1902                                 unsigned long start,
1903                                 unsigned long end,
1904                                 unsigned long data,
1905                                 int (*handler)(struct kvm *kvm,
1906                                                struct kvm_rmap_head *rmap_head,
1907                                                struct kvm_memory_slot *slot,
1908                                                gfn_t gfn,
1909                                                int level,
1910                                                unsigned long data))
1911 {
1912         struct kvm_memslots *slots;
1913         struct kvm_memory_slot *memslot;
1914         struct slot_rmap_walk_iterator iterator;
1915         int ret = 0;
1916         int i;
1917
1918         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
1919                 slots = __kvm_memslots(kvm, i);
1920                 kvm_for_each_memslot(memslot, slots) {
1921                         unsigned long hva_start, hva_end;
1922                         gfn_t gfn_start, gfn_end;
1923
1924                         hva_start = max(start, memslot->userspace_addr);
1925                         hva_end = min(end, memslot->userspace_addr +
1926                                       (memslot->npages << PAGE_SHIFT));
1927                         if (hva_start >= hva_end)
1928                                 continue;
1929                         /*
1930                          * {gfn(page) | page intersects with [hva_start, hva_end)} =
1931                          * {gfn_start, gfn_start+1, ..., gfn_end-1}.
1932                          */
1933                         gfn_start = hva_to_gfn_memslot(hva_start, memslot);
1934                         gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
1935
1936                         for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
1937                                                  PT_MAX_HUGEPAGE_LEVEL,
1938                                                  gfn_start, gfn_end - 1,
1939                                                  &iterator)
1940                                 ret |= handler(kvm, iterator.rmap, memslot,
1941                                                iterator.gfn, iterator.level, data);
1942                 }
1943         }
1944
1945         return ret;
1946 }
1947
1948 static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
1949                           unsigned long data,
1950                           int (*handler)(struct kvm *kvm,
1951                                          struct kvm_rmap_head *rmap_head,
1952                                          struct kvm_memory_slot *slot,
1953                                          gfn_t gfn, int level,
1954                                          unsigned long data))
1955 {
1956         return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
1957 }
1958
1959 int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
1960 {
1961         return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
1962 }
1963
1964 int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
1965 {
1966         return kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
1967 }
1968
1969 static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1970                          struct kvm_memory_slot *slot, gfn_t gfn, int level,
1971                          unsigned long data)
1972 {
1973         u64 *sptep;
1974         struct rmap_iterator uninitialized_var(iter);
1975         int young = 0;
1976
1977         for_each_rmap_spte(rmap_head, &iter, sptep)
1978                 young |= mmu_spte_age(sptep);
1979
1980         trace_kvm_age_page(gfn, level, slot, young);
1981         return young;
1982 }
1983
1984 static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
1985                               struct kvm_memory_slot *slot, gfn_t gfn,
1986                               int level, unsigned long data)
1987 {
1988         u64 *sptep;
1989         struct rmap_iterator iter;
1990
1991         for_each_rmap_spte(rmap_head, &iter, sptep)
1992                 if (is_accessed_spte(*sptep))
1993                         return 1;
1994         return 0;
1995 }
1996
1997 #define RMAP_RECYCLE_THRESHOLD 1000
1998
1999 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
2000 {
2001         struct kvm_rmap_head *rmap_head;
2002         struct kvm_mmu_page *sp;
2003
2004         sp = page_header(__pa(spte));
2005
2006         rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
2007
2008         kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
2009         kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn,
2010                         KVM_PAGES_PER_HPAGE(sp->role.level));
2011 }
2012
2013 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
2014 {
2015         return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
2016 }
2017
2018 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
2019 {
2020         return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
2021 }
2022
#ifdef MMU_DEBUG
/*
 * Debug helper: scan the page-sized table @spt and return 1 if no entry
 * is shadow-present.  The first present entry found is logged and 0 is
 * returned.
 */
static int is_empty_shadow_page(u64 *spt)
{
	unsigned long i;

	for (i = 0; i < PAGE_SIZE / sizeof(u64); i++) {
		u64 *pos = &spt[i];

		if (!is_shadow_present_pte(*pos))
			continue;

		printk(KERN_ERR "%s: %p %llx\n", __func__,
		       pos, *pos);
		return 0;
	}

	return 1;
}
#endif
2038
2039 /*
2040  * This value is the sum of all of the kvm instances's
2041  * kvm->arch.n_used_mmu_pages values.  We need a global,
2042  * aggregate version in order to make the slab shrinker
2043  * faster
2044  */
2045 static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
2046 {
2047         kvm->arch.n_used_mmu_pages += nr;
2048         percpu_counter_add(&kvm_total_used_mmu_pages, nr);
2049 }
2050
2051 static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
2052 {
2053         MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
2054         hlist_del(&sp->hash_link);
2055         list_del(&sp->link);
2056         free_page((unsigned long)sp->spt);
2057         if (!sp->role.direct)
2058                 free_page((unsigned long)sp->gfns);
2059         kmem_cache_free(mmu_page_header_cache, sp);
2060 }
2061
2062 static unsigned kvm_page_table_hashfn(gfn_t gfn)
2063 {
2064         return hash_64(gfn, KVM_MMU_HASH_SHIFT);
2065 }
2066
2067 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
2068                                     struct kvm_mmu_page *sp, u64 *parent_pte)
2069 {
2070         if (!parent_pte)
2071                 return;
2072
2073         pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
2074 }
2075
2076 static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
2077                                        u64 *parent_pte)
2078 {
2079         __pte_list_remove(parent_pte, &sp->parent_ptes);
2080 }
2081
2082 static void drop_parent_pte(struct kvm_mmu_page *sp,
2083                             u64 *parent_pte)
2084 {
2085         mmu_page_remove_parent_pte(sp, parent_pte);
2086         mmu_spte_clear_no_track(parent_pte);
2087 }
2088
2089 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct)
2090 {
2091         struct kvm_mmu_page *sp;
2092
2093         sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
2094         sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2095         if (!direct)
2096                 sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
2097         set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
2098
2099         /*
2100          * active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2101          * depends on valid pages being added to the head of the list.  See
2102          * comments in kvm_zap_obsolete_pages().
2103          */
2104         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
2105         list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
2106         kvm_mod_used_mmu_pages(vcpu->kvm, +1);
2107         return sp;
2108 }
2109
2110 static void mark_unsync(u64 *spte);
2111 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
2112 {
2113         u64 *sptep;
2114         struct rmap_iterator iter;
2115
2116         for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
2117                 mark_unsync(sptep);
2118         }
2119 }
2120
2121 static void mark_unsync(u64 *spte)
2122 {
2123         struct kvm_mmu_page *sp;
2124         unsigned int index;
2125
2126         sp = page_header(__pa(spte));
2127         index = spte - sp->spt;
2128         if (__test_and_set_bit(index, sp->unsync_child_bitmap))
2129                 return;
2130         if (sp->unsync_children++)
2131                 return;
2132         kvm_mmu_mark_parents_unsync(sp);
2133 }
2134
/* Non-paging stub for the sync_page hook: does nothing and returns 0. */
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
			       struct kvm_mmu_page *sp)
{
	const int ret = 0;

	return ret;
}
2140
2141 static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
2142 {
2143 }
2144
2145 static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
2146                                  struct kvm_mmu_page *sp, u64 *spte,
2147                                  const void *pte)
2148 {
2149         WARN_ON(1);
2150 }
2151
/* Capacity of the page vector filled by the unsync walk. */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size vector of shadow pages gathered by mmu_unsync_walk().  Each
 * entry pairs a page with the index passed to mmu_pages_add().
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;	/* number of entries in use */
};
2161
2162 static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
2163                          int idx)
2164 {
2165         int i;
2166
2167         if (sp->unsync)
2168                 for (i=0; i < pvec->nr; i++)
2169                         if (pvec->page[i].sp == sp)
2170                                 return 0;
2171
2172         pvec->page[pvec->nr].sp = sp;
2173         pvec->page[pvec->nr].idx = idx;
2174         pvec->nr++;
2175         return (pvec->nr == KVM_PAGE_ARRAY_NR);
2176 }
2177
2178 static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
2179 {
2180         --sp->unsync_children;
2181         WARN_ON((int)sp->unsync_children < 0);
2182         __clear_bit(idx, sp->unsync_child_bitmap);
2183 }
2184
2185 static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
2186                            struct kvm_mmu_pages *pvec)
2187 {
2188         int i, ret, nr_unsync_leaf = 0;
2189
2190         for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
2191                 struct kvm_mmu_page *child;
2192                 u64 ent = sp->spt[i];
2193
2194                 if (!is_shadow_present_pte(ent) || is_large_pte(ent)) {
2195                         clear_unsync_child_bit(sp, i);
2196                         continue;
2197                 }
2198
2199                 child = page_header(ent & PT64_BASE_ADDR_MASK);
2200
2201                 if (child->unsync_children) {
2202                         if (mmu_pages_add(pvec, child, i))
2203                                 return -ENOSPC;
2204
2205                         ret = __mmu_unsync_walk(child, pvec);
2206                         if (!ret) {
2207                                 clear_unsync_child_bit(sp, i);
2208                                 continue;
2209                         } else if (ret > 0) {
2210                                 nr_unsync_leaf += ret;
2211                         } else
2212                                 return ret;
2213                 } else if (child->unsync) {
2214                         nr_unsync_leaf++;
2215                         if (mmu_pages_add(pvec, child, i))
2216                                 return -ENOSPC;
2217                 } else
2218                         clear_unsync_child_bit(sp, i);
2219         }
2220
2221         return nr_unsync_leaf;
2222 }
2223
2224 #define INVALID_INDEX (-1)
2225
2226 static int mmu_unsync_walk(struct kvm_mmu_page *sp,
2227                            struct kvm_mmu_pages *pvec)
2228 {
2229         pvec->nr = 0;
2230         if (!sp->unsync_children)
2231                 return 0;
2232
2233         mmu_pages_add(pvec, sp, INVALID_INDEX);
2234         return __mmu_unsync_walk(sp, pvec);
2235 }
2236
2237 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
2238 {
2239         WARN_ON(!sp->unsync);
2240         trace_kvm_mmu_sync_page(sp);
2241         sp->unsync = 0;
2242         --kvm->stat.mmu_unsync;
2243 }
2244
2245 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2246                                      struct list_head *invalid_list);
2247 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2248                                     struct list_head *invalid_list);
2249
2250
2251 #define for_each_valid_sp(_kvm, _sp, _gfn)                              \
2252         hlist_for_each_entry(_sp,                                       \
2253           &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
2254                 if (is_obsolete_sp((_kvm), (_sp))) {                    \
2255                 } else
2256
2257 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)                 \
2258         for_each_valid_sp(_kvm, _sp, _gfn)                              \
2259                 if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
2260
2261 static inline bool is_ept_sp(struct kvm_mmu_page *sp)
2262 {
2263         return sp->role.cr0_wp && sp->role.smap_andnot_wp;
2264 }
2265
2266 /* @sp->gfn should be write-protected at the call site */
2267 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2268                             struct list_head *invalid_list)
2269 {
2270         if ((!is_ept_sp(sp) && sp->role.gpte_is_8_bytes != !!is_pae(vcpu)) ||
2271             vcpu->arch.mmu->sync_page(vcpu, sp) == 0) {
2272                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
2273                 return false;
2274         }
2275
2276         return true;
2277 }
2278
2279 static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
2280                                         struct list_head *invalid_list,
2281                                         bool remote_flush)
2282 {
2283         if (!remote_flush && list_empty(invalid_list))
2284                 return false;
2285
2286         if (!list_empty(invalid_list))
2287                 kvm_mmu_commit_zap_page(kvm, invalid_list);
2288         else
2289                 kvm_flush_remote_tlbs(kvm);
2290         return true;
2291 }
2292
2293 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
2294                                  struct list_head *invalid_list,
2295                                  bool remote_flush, bool local_flush)
2296 {
2297         if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
2298                 return;
2299
2300         if (local_flush)
2301                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2302 }
2303
#ifdef CONFIG_KVM_MMU_AUDIT
#include "mmu_audit.c"
#else
/* Empty stand-ins used when CONFIG_KVM_MMU_AUDIT is not set. */
static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point)
{
}

static void mmu_audit_disable(void)
{
}
#endif
2310
2311 static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
2312 {
2313         return sp->role.invalid ||
2314                unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2315 }
2316
2317 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
2318                          struct list_head *invalid_list)
2319 {
2320         kvm_unlink_unsync_page(vcpu->kvm, sp);
2321         return __kvm_sync_page(vcpu, sp, invalid_list);
2322 }
2323
2324 /* @gfn should be write-protected at the call site */
2325 static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
2326                            struct list_head *invalid_list)
2327 {
2328         struct kvm_mmu_page *s;
2329         bool ret = false;
2330
2331         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
2332                 if (!s->unsync)
2333                         continue;
2334
2335                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
2336                 ret |= kvm_sync_page(vcpu, s, invalid_list);
2337         }
2338
2339         return ret;
2340 }
2341
2342 struct mmu_page_path {
2343         struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2344         unsigned int idx[PT64_ROOT_MAX_LEVEL];
2345 };
2346
/*
 * Iterate over the entries of @pvec (a struct kvm_mmu_pages lvalue) at
 * which mmu_pages_first() / mmu_pages_next() stop, i.e. pages at
 * PT_PAGE_TABLE_LEVEL.  @sp receives the page, @i its position in the
 * vector, and @parents (a struct mmu_page_path lvalue) is updated with
 * the intermediate pages skipped on the way.
 *
 * All arguments are parenthesised so that arbitrary lvalue expressions
 * can be passed safely.
 */
#define for_each_sp(pvec, sp, parents, i)				\
		for ((i) = mmu_pages_first(&(pvec), &(parents));	\
			(i) < (pvec).nr &&				\
			({ (sp) = (pvec).page[(i)].sp; 1; });		\
			(i) = mmu_pages_next(&(pvec), &(parents), (i)))
2351
2352 static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2353                           struct mmu_page_path *parents,
2354                           int i)
2355 {
2356         int n;
2357
2358         for (n = i+1; n < pvec->nr; n++) {
2359                 struct kvm_mmu_page *sp = pvec->page[n].sp;
2360                 unsigned idx = pvec->page[n].idx;
2361                 int level = sp->role.level;
2362
2363                 parents->idx[level-1] = idx;
2364                 if (level == PT_PAGE_TABLE_LEVEL)
2365                         break;
2366
2367                 parents->parent[level-2] = sp;
2368         }
2369
2370         return n;
2371 }
2372
2373 static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2374                            struct mmu_page_path *parents)
2375 {
2376         struct kvm_mmu_page *sp;
2377         int level;
2378
2379         if (pvec->nr == 0)
2380                 return 0;
2381
2382         WARN_ON(pvec->page[0].idx != INVALID_INDEX);
2383
2384         sp = pvec->page[0].sp;
2385         level = sp->role.level;
2386         WARN_ON(level == PT_PAGE_TABLE_LEVEL);
2387
2388         parents->parent[level-2] = sp;
2389
2390         /* Also set up a sentinel.  Further entries in pvec are all
2391          * children of sp, so this element is never overwritten.
2392          */
2393         parents->parent[level-1] = NULL;
2394         return mmu_pages_next(pvec, parents, 0);
2395 }
2396
2397 static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2398 {
2399         struct kvm_mmu_page *sp;
2400         unsigned int level = 0;
2401
2402         do {
2403                 unsigned int idx = parents->idx[level];
2404                 sp = parents->parent[level];
2405                 if (!sp)
2406                         return;
2407
2408                 WARN_ON(idx == INVALID_INDEX);
2409                 clear_unsync_child_bit(sp, idx);
2410                 level++;
2411         } while (!sp->unsync_children);
2412 }
2413
2414 static void mmu_sync_children(struct kvm_vcpu *vcpu,
2415                               struct kvm_mmu_page *parent)
2416 {
2417         int i;
2418         struct kvm_mmu_page *sp;
2419         struct mmu_page_path parents;
2420         struct kvm_mmu_pages pages;
2421         LIST_HEAD(invalid_list);
2422         bool flush = false;
2423
2424         while (mmu_unsync_walk(parent, &pages)) {
2425                 bool protected = false;
2426
2427                 for_each_sp(pages, sp, parents, i)
2428                         protected |= rmap_write_protect(vcpu, sp->gfn);
2429
2430                 if (protected) {
2431                         kvm_flush_remote_tlbs(vcpu->kvm);
2432                         flush = false;
2433                 }
2434
2435                 for_each_sp(pages, sp, parents, i) {
2436                         flush |= kvm_sync_page(vcpu, sp, &invalid_list);
2437                         mmu_pages_clear_parents(&parents);
2438                 }
2439                 if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
2440                         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2441                         cond_resched_lock(&vcpu->kvm->mmu_lock);
2442                         flush = false;
2443                 }
2444         }
2445
2446         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2447 }
2448
2449 static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2450 {
2451         atomic_set(&sp->write_flooding_count,  0);
2452 }
2453
2454 static void clear_sp_write_flooding_count(u64 *spte)
2455 {
2456         struct kvm_mmu_page *sp =  page_header(__pa(spte));
2457
2458         __clear_sp_write_flooding_count(sp);
2459 }
2460
2461 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
2462                                              gfn_t gfn,
2463                                              gva_t gaddr,
2464                                              unsigned level,
2465                                              int direct,
2466                                              unsigned access)
2467 {
2468         union kvm_mmu_page_role role;
2469         unsigned quadrant;
2470         struct kvm_mmu_page *sp;
2471         bool need_sync = false;
2472         bool flush = false;
2473         int collisions = 0;
2474         LIST_HEAD(invalid_list);
2475
2476         role = vcpu->arch.mmu->mmu_role.base;
2477         role.level = level;
2478         role.direct = direct;
2479         if (role.direct)
2480                 role.gpte_is_8_bytes = true;
2481         role.access = access;
2482         if (!vcpu->arch.mmu->direct_map
2483             && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) {
2484                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
2485                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
2486                 role.quadrant = quadrant;
2487         }
2488         for_each_valid_sp(vcpu->kvm, sp, gfn) {
2489                 if (sp->gfn != gfn) {
2490                         collisions++;
2491                         continue;
2492                 }
2493
2494                 if (!need_sync && sp->unsync)
2495                         need_sync = true;
2496
2497                 if (sp->role.word != role.word)
2498                         continue;
2499
2500                 if (sp->unsync) {
2501                         /* The page is good, but __kvm_sync_page might still end
2502                          * up zapping it.  If so, break in order to rebuild it.
2503                          */
2504                         if (!__kvm_sync_page(vcpu, sp, &invalid_list))
2505                                 break;
2506
2507                         WARN_ON(!list_empty(&invalid_list));
2508                         kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2509                 }
2510
2511                 if (sp->unsync_children)
2512                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2513
2514                 __clear_sp_write_flooding_count(sp);
2515                 trace_kvm_mmu_get_page(sp, false);
2516                 goto out;
2517         }
2518
2519         ++vcpu->kvm->stat.mmu_cache_miss;
2520
2521         sp = kvm_mmu_alloc_page(vcpu, direct);
2522
2523         sp->gfn = gfn;
2524         sp->role = role;
2525         hlist_add_head(&sp->hash_link,
2526                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
2527         if (!direct) {
2528                 /*
2529                  * we should do write protection before syncing pages
2530                  * otherwise the content of the synced shadow page may
2531                  * be inconsistent with guest page table.
2532                  */
2533                 account_shadowed(vcpu->kvm, sp);
2534                 if (level == PT_PAGE_TABLE_LEVEL &&
2535                       rmap_write_protect(vcpu, gfn))
2536                         kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
2537
2538                 if (level > PT_PAGE_TABLE_LEVEL && need_sync)
2539                         flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
2540         }
2541         clear_page(sp->spt);
2542         trace_kvm_mmu_get_page(sp, true);
2543
2544         kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
2545 out:
2546         if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
2547                 vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
2548         return sp;
2549 }
2550
2551 static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2552                                         struct kvm_vcpu *vcpu, hpa_t root,
2553                                         u64 addr)
2554 {
2555         iterator->addr = addr;
2556         iterator->shadow_addr = root;
2557         iterator->level = vcpu->arch.mmu->shadow_root_level;
2558
2559         if (iterator->level == PT64_ROOT_4LEVEL &&
2560             vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
2561             !vcpu->arch.mmu->direct_map)
2562                 --iterator->level;
2563
2564         if (iterator->level == PT32E_ROOT_LEVEL) {
2565                 /*
2566                  * prev_root is currently only used for 64-bit hosts. So only
2567                  * the active root_hpa is valid here.
2568                  */
2569                 BUG_ON(root != vcpu->arch.mmu->root_hpa);
2570
2571                 iterator->shadow_addr
2572                         = vcpu->arch.mmu->pae_root[(addr >> 30) & 3];
2573                 iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
2574                 --iterator->level;
2575                 if (!iterator->shadow_addr)
2576                         iterator->level = 0;
2577         }
2578 }
2579
2580 static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2581                              struct kvm_vcpu *vcpu, u64 addr)
2582 {
2583         shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu->root_hpa,
2584                                     addr);
2585 }
2586
2587 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2588 {
2589         if (iterator->level < PT_PAGE_TABLE_LEVEL)
2590                 return false;
2591
2592         iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
2593         iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2594         return true;
2595 }
2596
2597 static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2598                                u64 spte)
2599 {
2600         if (is_last_spte(spte, iterator->level)) {
2601                 iterator->level = 0;
2602                 return;
2603         }
2604
2605         iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
2606         --iterator->level;
2607 }
2608
2609 static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2610 {
2611         __shadow_walk_next(iterator, *iterator->sptep);
2612 }
2613
2614 static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
2615                              struct kvm_mmu_page *sp)
2616 {
2617         u64 spte;
2618
2619         BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2620
2621         spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
2622                shadow_user_mask | shadow_x_mask | shadow_me_mask;
2623
2624         if (sp_ad_disabled(sp))
2625                 spte |= shadow_acc_track_value;
2626         else
2627                 spte |= shadow_accessed_mask;
2628
2629         mmu_spte_set(sptep, spte);
2630
2631         mmu_page_add_parent_pte(vcpu, sp, sptep);
2632
2633         if (sp->unsync_children || sp->unsync)
2634                 mark_unsync(sptep);
2635 }
2636
2637 static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2638                                    unsigned direct_access)
2639 {
2640         if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
2641                 struct kvm_mmu_page *child;
2642
2643                 /*
2644                  * For the direct sp, if the guest pte's dirty bit
2645                  * changed form clean to dirty, it will corrupt the
2646                  * sp's access: allow writable in the read-only sp,
2647                  * so we should update the spte at this point to get
2648                  * a new sp with the correct access.
2649                  */
2650                 child = page_header(*sptep & PT64_BASE_ADDR_MASK);
2651                 if (child->role.access == direct_access)
2652                         return;
2653
2654                 drop_parent_pte(child, sptep);
2655                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, child->gfn, 1);
2656         }
2657 }
2658
2659 static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
2660                              u64 *spte)
2661 {
2662         u64 pte;
2663         struct kvm_mmu_page *child;
2664
2665         pte = *spte;
2666         if (is_shadow_present_pte(pte)) {
2667                 if (is_last_spte(pte, sp->role.level)) {
2668                         drop_spte(kvm, spte);
2669                         if (is_large_pte(pte))
2670                                 --kvm->stat.lpages;
2671                 } else {
2672                         child = page_header(pte & PT64_BASE_ADDR_MASK);
2673                         drop_parent_pte(child, spte);
2674                 }
2675                 return true;
2676         }
2677
2678         if (is_mmio_spte(pte))
2679                 mmu_spte_clear_no_track(spte);
2680
2681         return false;
2682 }
2683
2684 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
2685                                          struct kvm_mmu_page *sp)
2686 {
2687         unsigned i;
2688
2689         for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
2690                 mmu_page_zap_pte(kvm, sp, sp->spt + i);
2691 }
2692
2693 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
2694 {
2695         u64 *sptep;
2696         struct rmap_iterator iter;
2697
2698         while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
2699                 drop_parent_pte(sp, sptep);
2700 }
2701
2702 static int mmu_zap_unsync_children(struct kvm *kvm,
2703                                    struct kvm_mmu_page *parent,
2704                                    struct list_head *invalid_list)
2705 {
2706         int i, zapped = 0;
2707         struct mmu_page_path parents;
2708         struct kvm_mmu_pages pages;
2709
2710         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
2711                 return 0;
2712
2713         while (mmu_unsync_walk(parent, &pages)) {
2714                 struct kvm_mmu_page *sp;
2715
2716                 for_each_sp(pages, sp, parents, i) {
2717                         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2718                         mmu_pages_clear_parents(&parents);
2719                         zapped++;
2720                 }
2721         }
2722
2723         return zapped;
2724 }
2725
2726 static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2727                                        struct kvm_mmu_page *sp,
2728                                        struct list_head *invalid_list,
2729                                        int *nr_zapped)
2730 {
2731         bool list_unstable;
2732
2733         trace_kvm_mmu_prepare_zap_page(sp);
2734         ++kvm->stat.mmu_shadow_zapped;
2735         *nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
2736         kvm_mmu_page_unlink_children(kvm, sp);
2737         kvm_mmu_unlink_parents(kvm, sp);
2738
2739         /* Zapping children means active_mmu_pages has become unstable. */
2740         list_unstable = *nr_zapped;
2741
2742         if (!sp->role.invalid && !sp->role.direct)
2743                 unaccount_shadowed(kvm, sp);
2744
2745         if (sp->unsync)
2746                 kvm_unlink_unsync_page(kvm, sp);
2747         if (!sp->root_count) {
2748                 /* Count self */
2749                 (*nr_zapped)++;
2750                 list_move(&sp->link, invalid_list);
2751                 kvm_mod_used_mmu_pages(kvm, -1);
2752         } else {
2753                 list_move(&sp->link, &kvm->arch.active_mmu_pages);
2754
2755                 /*
2756                  * Obsolete pages cannot be used on any vCPUs, see the comment
2757                  * in kvm_mmu_zap_all_fast().  Note, is_obsolete_sp() also
2758                  * treats invalid shadow pages as being obsolete.
2759                  */
2760                 if (!is_obsolete_sp(kvm, sp))
2761                         kvm_reload_remote_mmus(kvm);
2762         }
2763
2764         sp->role.invalid = 1;
2765         return list_unstable;
2766 }
2767
2768 static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
2769                                      struct list_head *invalid_list)
2770 {
2771         int nr_zapped;
2772
2773         __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
2774         return nr_zapped;
2775 }
2776
2777 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2778                                     struct list_head *invalid_list)
2779 {
2780         struct kvm_mmu_page *sp, *nsp;
2781
2782         if (list_empty(invalid_list))
2783                 return;
2784
2785         /*
2786          * We need to make sure everyone sees our modifications to
2787          * the page tables and see changes to vcpu->mode here. The barrier
2788          * in the kvm_flush_remote_tlbs() achieves this. This pairs
2789          * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2790          *
2791          * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2792          * guest mode and/or lockless shadow page table walks.
2793          */
2794         kvm_flush_remote_tlbs(kvm);
2795
2796         list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2797                 WARN_ON(!sp->role.invalid || sp->root_count);
2798                 kvm_mmu_free_page(sp);
2799         }
2800 }
2801
2802 static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
2803                                         struct list_head *invalid_list)
2804 {
2805         struct kvm_mmu_page *sp;
2806
2807         if (list_empty(&kvm->arch.active_mmu_pages))
2808                 return false;
2809
2810         sp = list_last_entry(&kvm->arch.active_mmu_pages,
2811                              struct kvm_mmu_page, link);
2812         return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2813 }
2814
2815 /*
2816  * Changing the number of mmu pages allocated to the vm
2817  * Note: if goal_nr_mmu_pages is too small, you will get dead lock
2818  */
2819 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages)
2820 {
2821         LIST_HEAD(invalid_list);
2822
2823         spin_lock(&kvm->mmu_lock);
2824
2825         if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2826                 /* Need to free some mmu pages to achieve the goal. */
2827                 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
2828                         if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
2829                                 break;
2830
2831                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
2832                 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2833         }
2834
2835         kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2836
2837         spin_unlock(&kvm->mmu_lock);
2838 }
2839
2840 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2841 {
2842         struct kvm_mmu_page *sp;
2843         LIST_HEAD(invalid_list);
2844         int r;
2845
2846         pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
2847         r = 0;
2848         spin_lock(&kvm->mmu_lock);
2849         for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
2850                 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
2851                          sp->role.word);
2852                 r = 1;
2853                 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
2854         }
2855         kvm_mmu_commit_zap_page(kvm, &invalid_list);
2856         spin_unlock(&kvm->mmu_lock);
2857
2858         return r;
2859 }
2860 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
2861
2862 static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
2863 {
2864         trace_kvm_mmu_unsync_page(sp);
2865         ++vcpu->kvm->stat.mmu_unsync;
2866         sp->unsync = 1;
2867
2868         kvm_mmu_mark_parents_unsync(sp);
2869 }
2870
2871 static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
2872                                    bool can_unsync)
2873 {
2874         struct kvm_mmu_page *sp;
2875
2876         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
2877                 return true;
2878
2879         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
2880                 if (!can_unsync)
2881                         return true;
2882
2883                 if (sp->unsync)
2884                         continue;
2885
2886                 WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
2887                 kvm_unsync_page(vcpu, sp);
2888         }
2889
2890         /*
2891          * We need to ensure that the marking of unsync pages is visible
2892          * before the SPTE is updated to allow writes because
2893          * kvm_mmu_sync_roots() checks the unsync flags without holding
2894          * the MMU lock and so can race with this. If the SPTE was updated
2895          * before the page had been marked as unsync-ed, something like the
2896          * following could happen:
2897          *
2898          * CPU 1                    CPU 2
2899          * ---------------------------------------------------------------------
2900          * 1.2 Host updates SPTE
2901          *     to be writable
2902          *                      2.1 Guest writes a GPTE for GVA X.
2903          *                          (GPTE being in the guest page table shadowed
2904          *                           by the SP from CPU 1.)
2905          *                          This reads SPTE during the page table walk.
2906          *                          Since SPTE.W is read as 1, there is no
2907          *                          fault.
2908          *
2909          *                      2.2 Guest issues TLB flush.
2910          *                          That causes a VM Exit.
2911          *
2912          *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
2913          *                          Since it is false, so it just returns.
2914          *
2915          *                      2.4 Guest accesses GVA X.
2916          *                          Since the mapping in the SP was not updated,
2917          *                          so the old mapping for GVA X incorrectly
2918          *                          gets used.
2919          * 1.1 Host marks SP
2920          *     as unsync
2921          *     (sp->unsync = true)
2922          *
2923          * The write barrier below ensures that 1.1 happens before 1.2 and thus
2924          * the situation in 2.4 does not arise. The implicit barrier in 2.2
2925          * pairs with this write barrier.
2926          */
2927         smp_wmb();
2928
2929         return false;
2930 }
2931
2932 static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
2933 {
2934         if (pfn_valid(pfn))
2935                 return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
2936                         /*
2937                          * Some reserved pages, such as those from NVDIMM
2938                          * DAX devices, are not for MMIO, and can be mapped
2939                          * with cached memory type for better performance.
2940                          * However, the above check misconceives those pages
2941                          * as MMIO, and results in KVM mapping them with UC
2942                          * memory type, which would hurt the performance.
2943                          * Therefore, we check the host memory type in addition
2944                          * and only treat UC/UC-/WC pages as MMIO.
2945                          */
2946                         (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn));
2947
2948         return !e820__mapped_raw_any(pfn_to_hpa(pfn),
2949                                      pfn_to_hpa(pfn + 1) - 1,
2950                                      E820_TYPE_RAM);
2951 }
2952
2953 /* Bits which may be returned by set_spte() */
2954 #define SET_SPTE_WRITE_PROTECTED_PT     BIT(0)
2955 #define SET_SPTE_NEED_REMOTE_TLB_FLUSH  BIT(1)
2956
2957 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2958                     unsigned pte_access, int level,
2959                     gfn_t gfn, kvm_pfn_t pfn, bool speculative,
2960                     bool can_unsync, bool host_writable)
2961 {
2962         u64 spte = 0;
2963         int ret = 0;
2964         struct kvm_mmu_page *sp;
2965
2966         if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
2967                 return 0;
2968
2969         sp = page_header(__pa(sptep));
2970         if (sp_ad_disabled(sp))
2971                 spte |= shadow_acc_track_value;
2972
2973         /*
2974          * For the EPT case, shadow_present_mask is 0 if hardware
2975          * supports exec-only page table entries.  In that case,
2976          * ACC_USER_MASK and shadow_user_mask are used to represent
2977          * read access.  See FNAME(gpte_access) in paging_tmpl.h.
2978          */
2979         spte |= shadow_present_mask;
2980         if (!speculative)
2981                 spte |= spte_shadow_accessed_mask(spte);
2982
2983         if (pte_access & ACC_EXEC_MASK)
2984                 spte |= shadow_x_mask;
2985         else
2986                 spte |= shadow_nx_mask;
2987
2988         if (pte_access & ACC_USER_MASK)
2989                 spte |= shadow_user_mask;
2990
2991         if (level > PT_PAGE_TABLE_LEVEL)
2992                 spte |= PT_PAGE_SIZE_MASK;
2993         if (tdp_enabled)
2994                 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
2995                         kvm_is_mmio_pfn(pfn));
2996
2997         if (host_writable)
2998                 spte |= SPTE_HOST_WRITEABLE;
2999         else
3000                 pte_access &= ~ACC_WRITE_MASK;
3001
3002         if (!kvm_is_mmio_pfn(pfn))
3003                 spte |= shadow_me_mask;
3004
3005         spte |= (u64)pfn << PAGE_SHIFT;
3006
3007         if (pte_access & ACC_WRITE_MASK) {
3008
3009                 /*
3010                  * Other vcpu creates new sp in the window between
3011                  * mapping_level() and acquiring mmu-lock. We can
3012                  * allow guest to retry the access, the mapping can
3013                  * be fixed if guest refault.
3014                  */
3015                 if (level > PT_PAGE_TABLE_LEVEL &&
3016                     mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
3017                         goto done;
3018
3019                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
3020
3021                 /*
3022                  * Optimization: for pte sync, if spte was writable the hash
3023                  * lookup is unnecessary (and expensive). Write protection
3024                  * is responsibility of mmu_get_page / kvm_sync_page.
3025                  * Same reasoning can be applied to dirty page accounting.
3026                  */
3027                 if (!can_unsync && is_writable_pte(*sptep))
3028                         goto set_pte;
3029
3030                 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
3031                         pgprintk("%s: found shadow page for %llx, marking ro\n",
3032                                  __func__, gfn);
3033                         ret |= SET_SPTE_WRITE_PROTECTED_PT;
3034                         pte_access &= ~ACC_WRITE_MASK;
3035                         spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
3036                 }
3037         }
3038
3039         if (pte_access & ACC_WRITE_MASK) {
3040                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3041                 spte |= spte_shadow_dirty_mask(spte);
3042         }
3043
3044         if (speculative)
3045                 spte = mark_spte_for_access_track(spte);
3046
3047 set_pte:
3048         if (mmu_spte_update(sptep, spte))
3049                 ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
3050 done:
3051         return ret;
3052 }
3053
3054 static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
3055                         int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
3056                         bool speculative, bool host_writable)
3057 {
3058         int was_rmapped = 0;
3059         int rmap_count;
3060         int set_spte_ret;
3061         int ret = RET_PF_RETRY;
3062         bool flush = false;
3063
3064         pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
3065                  *sptep, write_fault, gfn);
3066
3067         if (is_shadow_present_pte(*sptep)) {
3068                 /*
3069                  * If we overwrite a PTE page pointer with a 2MB PMD, unlink
3070                  * the parent of the now unreachable PTE.
3071                  */
3072                 if (level > PT_PAGE_TABLE_LEVEL &&
3073                     !is_large_pte(*sptep)) {
3074                         struct kvm_mmu_page *child;
3075                         u64 pte = *sptep;
3076
3077                         child = page_header(pte & PT64_BASE_ADDR_MASK);
3078                         drop_parent_pte(child, sptep);
3079                         flush = true;
3080                 } else if (pfn != spte_to_pfn(*sptep)) {
3081                         pgprintk("hfn old %llx new %llx\n",
3082                                  spte_to_pfn(*sptep), pfn);
3083                         drop_spte(vcpu->kvm, sptep);
3084                         flush = true;
3085                 } else
3086                         was_rmapped = 1;
3087         }
3088
3089         set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
3090                                 speculative, true, host_writable);
3091         if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
3092                 if (write_fault)
3093                         ret = RET_PF_EMULATE;
3094                 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3095         }
3096
3097         if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
3098                 kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn,
3099                                 KVM_PAGES_PER_HPAGE(level));
3100
3101         if (unlikely(is_mmio_spte(*sptep)))
3102                 ret = RET_PF_EMULATE;
3103
3104         pgprintk("%s: setting spte %llx\n", __func__, *sptep);
3105         trace_kvm_mmu_set_spte(level, gfn, sptep);
3106         if (!was_rmapped && is_large_pte(*sptep))
3107                 ++vcpu->kvm->stat.lpages;
3108
3109         if (is_shadow_present_pte(*sptep)) {
3110                 if (!was_rmapped) {
3111                         rmap_count = rmap_add(vcpu, sptep, gfn);
3112                         if (rmap_count > RMAP_RECYCLE_THRESHOLD)
3113                                 rmap_recycle(vcpu, sptep, gfn);
3114                 }
3115         }
3116
3117         return ret;
3118 }
3119
3120 static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
3121                                      bool no_dirty_log)
3122 {
3123         struct kvm_memory_slot *slot;
3124
3125         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
3126         if (!slot)
3127                 return KVM_PFN_ERR_FAULT;
3128
3129         return gfn_to_pfn_memslot_atomic(slot, gfn);
3130 }
3131
3132 static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
3133                                     struct kvm_mmu_page *sp,
3134                                     u64 *start, u64 *end)
3135 {
3136         struct page *pages[PTE_PREFETCH_NUM];
3137         struct kvm_memory_slot *slot;
3138         unsigned access = sp->role.access;
3139         int i, ret;
3140         gfn_t gfn;
3141
3142         gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
3143         slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
3144         if (!slot)
3145                 return -1;
3146
3147         ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
3148         if (ret <= 0)
3149                 return -1;
3150
3151         for (i = 0; i < ret; i++, gfn++, start++) {
3152                 mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
3153                              page_to_pfn(pages[i]), true, true);
3154                 put_page(pages[i]);
3155         }
3156
3157         return 0;
3158 }
3159
3160 static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3161                                   struct kvm_mmu_page *sp, u64 *sptep)
3162 {
3163         u64 *spte, *start = NULL;
3164         int i;
3165
3166         WARN_ON(!sp->role.direct);
3167
3168         i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
3169         spte = sp->spt + i;
3170
3171         for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
3172                 if (is_shadow_present_pte(*spte) || spte == sptep) {
3173                         if (!start)
3174                                 continue;
3175                         if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
3176                                 break;
3177                         start = NULL;
3178                 } else if (!start)
3179                         start = spte;
3180         }
3181 }
3182
3183 static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
3184 {
3185         struct kvm_mmu_page *sp;
3186
3187         sp = page_header(__pa(sptep));
3188
3189         /*
3190          * Without accessed bits, there's no way to distinguish between
3191          * actually accessed translations and prefetched, so disable pte
3192          * prefetch if accessed bits aren't available.
3193          */
3194         if (sp_ad_disabled(sp))
3195                 return;
3196
3197         if (sp->role.level > PT_PAGE_TABLE_LEVEL)
3198                 return;
3199
3200         __direct_pte_prefetch(vcpu, sp, sptep);
3201 }
3202
3203 static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, int write,
3204                         int map_writable, int level, kvm_pfn_t pfn,
3205                         bool prefault)
3206 {
3207         struct kvm_shadow_walk_iterator it;
3208         struct kvm_mmu_page *sp;
3209         int ret;
3210         gfn_t gfn = gpa >> PAGE_SHIFT;
3211         gfn_t base_gfn = gfn;
3212
3213         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3214                 return RET_PF_RETRY;
3215
3216         trace_kvm_mmu_spte_requested(gpa, level, pfn);
3217         for_each_shadow_entry(vcpu, gpa, it) {
3218                 base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
3219                 if (it.level == level)
3220                         break;
3221
3222                 drop_large_spte(vcpu, it.sptep);
3223                 if (!is_shadow_present_pte(*it.sptep)) {
3224                         sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr,
3225                                               it.level - 1, true, ACC_ALL);
3226
3227                         link_shadow_page(vcpu, it.sptep, sp);
3228                 }
3229         }
3230
3231         ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL,
3232                            write, level, base_gfn, pfn, prefault,
3233                            map_writable);
3234         direct_pte_prefetch(vcpu, it.sptep);
3235         ++vcpu->stat.pf_fixed;
3236         return ret;
3237 }
3238
3239 static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
3240 {
3241         send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
3242 }
3243
3244 static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
3245 {
3246         /*
3247          * Do not cache the mmio info caused by writing the readonly gfn
3248          * into the spte otherwise read access on readonly gfn also can
3249          * caused mmio page fault and treat it as mmio access.
3250          */
3251         if (pfn == KVM_PFN_ERR_RO_FAULT)
3252                 return RET_PF_EMULATE;
3253
3254         if (pfn == KVM_PFN_ERR_HWPOISON) {
3255                 kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
3256                 return RET_PF_RETRY;
3257         }
3258
3259         return -EFAULT;
3260 }
3261
3262 static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
3263                                         gfn_t gfn, kvm_pfn_t *pfnp,
3264                                         int *levelp)
3265 {
3266         kvm_pfn_t pfn = *pfnp;
3267         int level = *levelp;
3268
3269         /*
3270          * Check if it's a transparent hugepage. If this would be an
3271          * hugetlbfs page, level wouldn't be set to
3272          * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
3273          * here.
3274          */
3275         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
3276             level == PT_PAGE_TABLE_LEVEL &&
3277             PageTransCompoundMap(pfn_to_page(pfn)) &&
3278             !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
3279                 unsigned long mask;
3280                 /*
3281                  * mmu_notifier_retry was successful and we hold the
3282                  * mmu_lock here, so the pmd can't become splitting
3283                  * from under us, and in turn
3284                  * __split_huge_page_refcount() can't run from under
3285                  * us and we can safely transfer the refcount from
3286                  * PG_tail to PG_head as we switch the pfn to tail to
3287                  * head.
3288                  */
3289                 *levelp = level = PT_DIRECTORY_LEVEL;
3290                 mask = KVM_PAGES_PER_HPAGE(level) - 1;
3291                 VM_BUG_ON((gfn & mask) != (pfn & mask));
3292                 if (pfn & mask) {
3293                         kvm_release_pfn_clean(pfn);
3294                         pfn &= ~mask;
3295                         kvm_get_pfn(pfn);
3296                         *pfnp = pfn;
3297                 }
3298         }
3299 }
3300
3301 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
3302                                 kvm_pfn_t pfn, unsigned access, int *ret_val)
3303 {
3304         /* The pfn is invalid, report the error! */
3305         if (unlikely(is_error_pfn(pfn))) {
3306                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
3307                 return true;
3308         }
3309
3310         if (unlikely(is_noslot_pfn(pfn)))
3311                 vcpu_cache_mmio_info(vcpu, gva, gfn,
3312                                      access & shadow_mmio_access_mask);
3313
3314         return false;
3315 }
3316
3317 static bool page_fault_can_be_fast(u32 error_code)
3318 {
3319         /*
3320          * Do not fix the mmio spte with invalid generation number which
3321          * need to be updated by slow page fault path.
3322          */
3323         if (unlikely(error_code & PFERR_RSVD_MASK))
3324                 return false;
3325
3326         /* See if the page fault is due to an NX violation */
3327         if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
3328                       == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
3329                 return false;
3330
3331         /*
3332          * #PF can be fast if:
3333          * 1. The shadow page table entry is not present, which could mean that
3334          *    the fault is potentially caused by access tracking (if enabled).
3335          * 2. The shadow page table entry is present and the fault
3336          *    is caused by write-protect, that means we just need change the W
3337          *    bit of the spte which can be done out of mmu-lock.
3338          *
3339          * However, if access tracking is disabled we know that a non-present
3340          * page must be a genuine page fault where we have to create a new SPTE.
3341          * So, if access tracking is disabled, we return true only for write
3342          * accesses to a present page.
3343          */
3344
3345         return shadow_acc_track_mask != 0 ||
3346                ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
3347                 == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
3348 }
3349
3350 /*
3351  * Returns true if the SPTE was fixed successfully. Otherwise,
3352  * someone else modified the SPTE from its original value.
3353  */
3354 static bool
3355 fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
3356                         u64 *sptep, u64 old_spte, u64 new_spte)
3357 {
3358         gfn_t gfn;
3359
3360         WARN_ON(!sp->role.direct);
3361
3362         /*
3363          * Theoretically we could also set dirty bit (and flush TLB) here in
3364          * order to eliminate unnecessary PML logging. See comments in
3365          * set_spte. But fast_page_fault is very unlikely to happen with PML
3366          * enabled, so we do not do this. This might result in the same GPA
3367          * to be logged in PML buffer again when the write really happens, and
3368          * eventually to be called by mark_page_dirty twice. But it's also no
3369          * harm. This also avoids the TLB flush needed after setting dirty bit
3370          * so non-PML cases won't be impacted.
3371          *
3372          * Compare with set_spte where instead shadow_dirty_mask is set.
3373          */
3374         if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
3375                 return false;
3376
3377         if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
3378                 /*
3379                  * The gfn of direct spte is stable since it is
3380                  * calculated by sp->gfn.
3381                  */
3382                 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
3383                 kvm_vcpu_mark_page_dirty(vcpu, gfn);
3384         }
3385
3386         return true;
3387 }
3388
3389 static bool is_access_allowed(u32 fault_err_code, u64 spte)
3390 {
3391         if (fault_err_code & PFERR_FETCH_MASK)
3392                 return is_executable_pte(spte);
3393
3394         if (fault_err_code & PFERR_WRITE_MASK)
3395                 return is_writable_pte(spte);
3396
3397         /* Fault was on Read access */
3398         return spte & PT_PRESENT_MASK;
3399 }
3400
/*
 * Try to resolve a page fault by walking the shadow page table locklessly
 * and, if possible, updating the faulting SPTE in place through
 * fast_pf_fix_direct_spte().
 *
 * @vcpu:       the faulting vCPU.
 * @gva:        the faulting address used for the shadow walk.
 * @level:      the walk stops once the iterator drops below this level.
 * @error_code: page fault error code (PFERR_* bits).
 *
 * Return value:
 * - true: let the vcpu access the same address again.
 * - false: let the real page fault path fix it.
 */
static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
                            u32 error_code)
{
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
        bool fault_handled = false;
        u64 spte = 0ull;
        uint retry_count = 0;

        /* Nothing to walk without a valid root. */
        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                return false;

        if (!page_fault_can_be_fast(error_code))
                return false;

        walk_shadow_page_lockless_begin(vcpu);

        do {
                u64 new_spte;

                /*
                 * Walk down until a non-present entry is read or the
                 * iterator goes below the requested level; "spte" and
                 * "iterator.sptep" then describe the last entry visited.
                 */
                for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
                        if (!is_shadow_present_pte(spte) ||
                            iterator.level < level)
                                break;

                sp = page_header(__pa(iterator.sptep));
                if (!is_last_spte(spte, sp->role.level))
                        break;

                /*
                 * Check whether the memory access that caused the fault would
                 * still cause it if it were to be performed right now. If not,
                 * then this is a spurious fault caused by TLB lazily flushed,
                 * or some other CPU has already fixed the PTE after the
                 * current CPU took the fault.
                 *
                 * Need not check the access of upper level table entries since
                 * they are always ACC_ALL.
                 */
                if (is_access_allowed(error_code, spte)) {
                        fault_handled = true;
                        break;
                }

                new_spte = spte;

                if (is_access_track_spte(spte))
                        new_spte = restore_acc_track_spte(new_spte);

                /*
                 * Currently, to simplify the code, write-protection can
                 * be removed in the fast path only if the SPTE was
                 * write-protected for dirty-logging or access tracking.
                 */
                if ((error_code & PFERR_WRITE_MASK) &&
                    spte_can_locklessly_be_made_writable(spte))
                {
                        new_spte |= PT_WRITABLE_MASK;

                        /*
                         * Do not fix write-permission on the large spte.  Since
                         * we only dirty the first page into the dirty-bitmap in
                         * fast_pf_fix_direct_spte(), other pages are missed
                         * if its slot has dirty logging enabled.
                         *
                         * Instead, we let the slow page fault path create a
                         * normal spte to fix the access.
                         *
                         * See the comments in kvm_arch_commit_memory_region().
                         */
                        if (sp->role.level > PT_PAGE_TABLE_LEVEL)
                                break;
                }

                /* Verify that the fault can be handled in the fast path */
                if (new_spte == spte ||
                    !is_access_allowed(error_code, new_spte))
                        break;

                /*
                 * Currently, fast page fault only works for direct mapping
                 * since the gfn is not stable for indirect shadow page. See
                 * Documentation/virt/kvm/locking.txt to get more detail.
                 */
                fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
                                                        iterator.sptep, spte,
                                                        new_spte);
                if (fault_handled)
                        break;

                /*
                 * The update did not take effect; redo the walk, but give up
                 * after a bounded number of attempts.
                 */
                if (++retry_count > 4) {
                        printk_once(KERN_WARNING
                                "kvm: Fast #PF retrying more than 4 times.\n");
                        break;
                }

        } while (true);

        trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
                              spte, fault_handled);
        walk_shadow_page_lockless_end(vcpu);

        return fault_handled;
}
3510
/*
 * Forward declarations for helpers used by the fault handlers below;
 * try_async_pf() is defined further down in this file.
 */
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
                         gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
3514
3515 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
3516                          gfn_t gfn, bool prefault)
3517 {
3518         int r;
3519         int level;
3520         bool force_pt_level = false;
3521         kvm_pfn_t pfn;
3522         unsigned long mmu_seq;
3523         bool map_writable, write = error_code & PFERR_WRITE_MASK;
3524
3525         level = mapping_level(vcpu, gfn, &force_pt_level);
3526         if (likely(!force_pt_level)) {
3527                 /*
3528                  * This path builds a PAE pagetable - so we can map
3529                  * 2mb pages at maximum. Therefore check if the level
3530                  * is larger than that.
3531                  */
3532                 if (level > PT_DIRECTORY_LEVEL)
3533                         level = PT_DIRECTORY_LEVEL;
3534
3535                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
3536         }
3537
3538         if (fast_page_fault(vcpu, v, level, error_code))
3539                 return RET_PF_RETRY;
3540
3541         mmu_seq = vcpu->kvm->mmu_notifier_seq;
3542         smp_rmb();
3543
3544         if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
3545                 return RET_PF_RETRY;
3546
3547         if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
3548                 return r;
3549
3550         r = RET_PF_RETRY;
3551         spin_lock(&vcpu->kvm->mmu_lock);
3552         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
3553                 goto out_unlock;
3554         if (make_mmu_pages_available(vcpu) < 0)
3555                 goto out_unlock;
3556         if (likely(!force_pt_level))
3557                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
3558         r = __direct_map(vcpu, v, write, map_writable, level, pfn, prefault);
3559 out_unlock:
3560         spin_unlock(&vcpu->kvm->mmu_lock);
3561         kvm_release_pfn_clean(pfn);
3562         return r;
3563 }
3564
3565 static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
3566                                struct list_head *invalid_list)
3567 {
3568         struct kvm_mmu_page *sp;
3569
3570         if (!VALID_PAGE(*root_hpa))
3571                 return;
3572
3573         sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
3574         --sp->root_count;
3575         if (!sp->root_count && sp->role.invalid)
3576                 kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3577
3578         *root_hpa = INVALID_PAGE;
3579 }
3580
3581 /* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
3582 void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
3583                         ulong roots_to_free)
3584 {
3585         int i;
3586         LIST_HEAD(invalid_list);
3587         bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
3588
3589         BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3590
3591         /* Before acquiring the MMU lock, see if we need to do any real work. */
3592         if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
3593                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3594                         if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3595                             VALID_PAGE(mmu->prev_roots[i].hpa))
3596                                 break;
3597
3598                 if (i == KVM_MMU_NUM_PREV_ROOTS)
3599                         return;
3600         }
3601
3602         spin_lock(&vcpu->kvm->mmu_lock);
3603
3604         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3605                 if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3606                         mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
3607                                            &invalid_list);
3608
3609         if (free_active_root) {
3610                 if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
3611                     (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
3612                         mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
3613                                            &invalid_list);
3614                 } else {
3615                         for (i = 0; i < 4; ++i)
3616                                 if (mmu->pae_root[i] != 0)
3617                                         mmu_free_root_page(vcpu->kvm,
3618                                                            &mmu->pae_root[i],
3619                                                            &invalid_list);
3620                         mmu->root_hpa = INVALID_PAGE;
3621                 }
3622                 mmu->root_cr3 = 0;
3623         }
3624
3625         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
3626         spin_unlock(&vcpu->kvm->mmu_lock);
3627 }
3628 EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3629
3630 static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
3631 {
3632         int ret = 0;
3633
3634         if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
3635                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3636                 ret = 1;
3637         }
3638
3639         return ret;
3640 }
3641
3642 static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3643 {
3644         struct kvm_mmu_page *sp;
3645         unsigned i;
3646
3647         if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
3648                 spin_lock(&vcpu->kvm->mmu_lock);
3649                 if(make_mmu_pages_available(vcpu) < 0) {
3650                         spin_unlock(&vcpu->kvm->mmu_lock);
3651                         return -ENOSPC;
3652                 }
3653                 sp = kvm_mmu_get_page(vcpu, 0, 0,
3654                                 vcpu->arch.mmu->shadow_root_level, 1, ACC_ALL);
3655                 ++sp->root_count;
3656                 spin_unlock(&vcpu->kvm->mmu_lock);
3657                 vcpu->arch.mmu->root_hpa = __pa(sp->spt);
3658         } else if (vcpu->arch.mmu->shadow_root_level == PT32E_ROOT_LEVEL) {
3659                 for (i = 0; i < 4; ++i) {
3660                         hpa_t root = vcpu->arch.mmu->pae_root[i];
3661
3662                         MMU_WARN_ON(VALID_PAGE(root));
3663                         spin_lock(&vcpu->kvm->mmu_lock);
3664                         if (make_mmu_pages_available(vcpu) < 0) {
3665                                 spin_unlock(&vcpu->kvm->mmu_lock);
3666                                 return -ENOSPC;
3667                         }
3668                         sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
3669                                         i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
3670                         root = __pa(sp->spt);
3671                         ++sp->root_count;
3672                         spin_unlock(&vcpu->kvm->mmu_lock);
3673                         vcpu->arch.mmu->pae_root[i] = root | PT_PRESENT_MASK;
3674                 }
3675                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3676         } else
3677                 BUG();
3678         vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3679
3680         return 0;
3681 }
3682
3683 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3684 {
3685         struct kvm_mmu_page *sp;
3686         u64 pdptr, pm_mask;
3687         gfn_t root_gfn, root_cr3;
3688         int i;
3689
3690         root_cr3 = vcpu->arch.mmu->get_cr3(vcpu);
3691         root_gfn = root_cr3 >> PAGE_SHIFT;
3692
3693         if (mmu_check_root(vcpu, root_gfn))
3694                 return 1;
3695
3696         /*
3697          * Do we shadow a long mode page table? If so we need to
3698          * write-protect the guests page table root.
3699          */
3700         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3701                 hpa_t root = vcpu->arch.mmu->root_hpa;
3702
3703                 MMU_WARN_ON(VALID_PAGE(root));
3704
3705                 spin_lock(&vcpu->kvm->mmu_lock);
3706                 if (make_mmu_pages_available(vcpu) < 0) {
3707                         spin_unlock(&vcpu->kvm->mmu_lock);
3708                         return -ENOSPC;
3709                 }
3710                 sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
3711                                 vcpu->arch.mmu->shadow_root_level, 0, ACC_ALL);
3712                 root = __pa(sp->spt);
3713                 ++sp->root_count;
3714                 spin_unlock(&vcpu->kvm->mmu_lock);
3715                 vcpu->arch.mmu->root_hpa = root;
3716                 goto set_root_cr3;
3717         }
3718
3719         /*
3720          * We shadow a 32 bit page table. This may be a legacy 2-level
3721          * or a PAE 3-level page table. In either case we need to be aware that
3722          * the shadow page table may be a PAE or a long mode page table.
3723          */
3724         pm_mask = PT_PRESENT_MASK;
3725         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL)
3726                 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
3727
3728         for (i = 0; i < 4; ++i) {
3729                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3730
3731                 MMU_WARN_ON(VALID_PAGE(root));
3732                 if (vcpu->arch.mmu->root_level == PT32E_ROOT_LEVEL) {
3733                         pdptr = vcpu->arch.mmu->get_pdptr(vcpu, i);
3734                         if (!(pdptr & PT_PRESENT_MASK)) {
3735                                 vcpu->arch.mmu->pae_root[i] = 0;
3736                                 continue;
3737                         }
3738                         root_gfn = pdptr >> PAGE_SHIFT;
3739                         if (mmu_check_root(vcpu, root_gfn))
3740                                 return 1;
3741                 }
3742                 spin_lock(&vcpu->kvm->mmu_lock);
3743                 if (make_mmu_pages_available(vcpu) < 0) {
3744                         spin_unlock(&vcpu->kvm->mmu_lock);
3745                         return -ENOSPC;
3746                 }
3747                 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
3748                                       0, ACC_ALL);
3749                 root = __pa(sp->spt);
3750                 ++sp->root_count;
3751                 spin_unlock(&vcpu->kvm->mmu_lock);
3752
3753                 vcpu->arch.mmu->pae_root[i] = root | pm_mask;
3754         }
3755         vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root);
3756
3757         /*
3758          * If we shadow a 32 bit page table with a long mode page
3759          * table we enter this path.
3760          */
3761         if (vcpu->arch.mmu->shadow_root_level == PT64_ROOT_4LEVEL) {
3762                 if (vcpu->arch.mmu->lm_root == NULL) {
3763                         /*
3764                          * The additional page necessary for this is only
3765                          * allocated on demand.
3766                          */
3767
3768                         u64 *lm_root;
3769
3770                         lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3771                         if (lm_root == NULL)
3772                                 return 1;
3773
3774                         lm_root[0] = __pa(vcpu->arch.mmu->pae_root) | pm_mask;
3775
3776                         vcpu->arch.mmu->lm_root = lm_root;
3777                 }
3778
3779                 vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root);
3780         }
3781
3782 set_root_cr3:
3783         vcpu->arch.mmu->root_cr3 = root_cr3;
3784
3785         return 0;
3786 }
3787
3788 static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
3789 {
3790         if (vcpu->arch.mmu->direct_map)
3791                 return mmu_alloc_direct_roots(vcpu);
3792         else
3793                 return mmu_alloc_shadow_roots(vcpu);
3794 }
3795
3796 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
3797 {
3798         int i;
3799         struct kvm_mmu_page *sp;
3800
3801         if (vcpu->arch.mmu->direct_map)
3802                 return;
3803
3804         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
3805                 return;
3806
3807         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
3808
3809         if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
3810                 hpa_t root = vcpu->arch.mmu->root_hpa;
3811                 sp = page_header(root);
3812
3813                 /*
3814                  * Even if another CPU was marking the SP as unsync-ed
3815                  * simultaneously, any guest page table changes are not
3816                  * guaranteed to be visible anyway until this VCPU issues a TLB
3817                  * flush strictly after those changes are made. We only need to
3818                  * ensure that the other CPU sets these flags before any actual
3819                  * changes to the page tables are made. The comments in
3820                  * mmu_need_write_protect() describe what could go wrong if this
3821                  * requirement isn't satisfied.
3822                  */
3823                 if (!smp_load_acquire(&sp->unsync) &&
3824                     !smp_load_acquire(&sp->unsync_children))
3825                         return;
3826
3827                 spin_lock(&vcpu->kvm->mmu_lock);
3828                 kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3829
3830                 mmu_sync_children(vcpu, sp);
3831
3832                 kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3833                 spin_unlock(&vcpu->kvm->mmu_lock);
3834                 return;
3835         }
3836
3837         spin_lock(&vcpu->kvm->mmu_lock);
3838         kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
3839
3840         for (i = 0; i < 4; ++i) {
3841                 hpa_t root = vcpu->arch.mmu->pae_root[i];
3842
3843                 if (root && VALID_PAGE(root)) {
3844                         root &= PT64_BASE_ADDR_MASK;
3845                         sp = page_header(root);
3846                         mmu_sync_children(vcpu, sp);
3847                 }
3848         }
3849
3850         kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
3851         spin_unlock(&vcpu->kvm->mmu_lock);
3852 }
3853 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
3854
3855 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
3856                                   u32 access, struct x86_exception *exception)
3857 {
3858         if (exception)
3859                 exception->error_code = 0;
3860         return vaddr;
3861 }
3862
3863 static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
3864                                          u32 access,
3865                                          struct x86_exception *exception)
3866 {
3867         if (exception)
3868                 exception->error_code = 0;
3869         return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
3870 }
3871
3872 static bool
3873 __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
3874 {
3875         int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
3876
3877         return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) |
3878                 ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
3879 }
3880
3881 static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
3882 {
3883         return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
3884 }
3885
3886 static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
3887 {
3888         return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
3889 }
3890
3891 static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3892 {
3893         /*
3894          * A nested guest cannot use the MMIO cache if it is using nested
3895          * page tables, because cr2 is a nGPA while the cache stores GPAs.
3896          */
3897         if (mmu_is_nested(vcpu))
3898                 return false;
3899
3900         if (direct)
3901                 return vcpu_match_mmio_gpa(vcpu, addr);
3902
3903         return vcpu_match_mmio_gva(vcpu, addr);
3904 }
3905
/*
 * Walk the shadow page table for @addr locklessly and hand back the last
 * SPTE that was read through @sptep (0 if the root is invalid or no entry
 * was visited).
 *
 * Returns true if reserved bits (per is_shadow_zero_bits_set()) were
 * detected on any present SPTE along the walk; in that case the visited
 * SPTEs are dumped with pr_err().
 */
static bool
walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
{
        struct kvm_shadow_walk_iterator iterator;
        u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
        int root, leaf;
        bool reserved = false;

        if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
                goto exit;

        walk_shadow_page_lockless_begin(vcpu);

        /*
         * "root" keeps the level the walk started at; "leaf" is decremented
         * for every entry visited, so sptes[leaf..root-1] hold the SPTEs
         * read so far.
         */
        for (shadow_walk_init(&iterator, vcpu, addr),
                 leaf = root = iterator.level;
             shadow_walk_okay(&iterator);
             __shadow_walk_next(&iterator, spte)) {
                spte = mmu_spte_get_lockless(iterator.sptep);

                sptes[leaf - 1] = spte;
                leaf--;

                /* A non-present entry ends the walk and is not checked. */
                if (!is_shadow_present_pte(spte))
                        break;

                reserved |= is_shadow_zero_bits_set(vcpu->arch.mmu, spte,
                                                    iterator.level);
        }

        walk_shadow_page_lockless_end(vcpu);

        if (reserved) {
                pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
                       __func__, addr);
                /* Print from the starting level down to the last one read. */
                while (root > leaf) {
                        pr_err("------ spte 0x%llx level %d.\n",
                               sptes[root - 1], root);
                        root--;
                }
        }
exit:
        *sptep = spte;
        return reserved;
}
3951
3952 static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
3953 {
3954         u64 spte;
3955         bool reserved;
3956
3957         if (mmio_info_in_cache(vcpu, addr, direct))
3958                 return RET_PF_EMULATE;
3959
3960         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
3961         if (WARN_ON(reserved))
3962                 return -EINVAL;
3963
3964         if (is_mmio_spte(spte)) {
3965                 gfn_t gfn = get_mmio_spte_gfn(spte);
3966                 unsigned access = get_mmio_spte_access(spte);
3967
3968                 if (!check_mmio_spte(vcpu, spte))
3969                         return RET_PF_INVALID;
3970
3971                 if (direct)
3972                         addr = 0;
3973
3974                 trace_handle_mmio_page_fault(addr, gfn, access);
3975                 vcpu_cache_mmio_info(vcpu, addr, gfn, access);
3976                 return RET_PF_EMULATE;
3977         }
3978
3979         /*
3980          * If the page table is zapped by other cpus, let CPU fault again on
3981          * the address.
3982          */
3983         return RET_PF_RETRY;
3984 }
3985
3986 static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
3987                                          u32 error_code, gfn_t gfn)
3988 {
3989         if (unlikely(error_code & PFERR_RSVD_MASK))
3990                 return false;
3991
3992         if (!(error_code & PFERR_PRESENT_MASK) ||
3993               !(error_code & PFERR_WRITE_MASK))
3994                 return false;
3995
3996         /*
3997          * guest is writing the page which is write tracked which can
3998          * not be fixed by page fault handler.
3999          */
4000         if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
4001                 return true;
4002
4003         return false;
4004 }
4005
4006 static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4007 {
4008         struct kvm_shadow_walk_iterator iterator;
4009         u64 spte;
4010
4011         if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
4012                 return;
4013
4014         walk_shadow_page_lockless_begin(vcpu);
4015         for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
4016                 clear_sp_write_flooding_count(iterator.sptep);
4017                 if (!is_shadow_present_pte(spte))
4018                         break;
4019         }
4020         walk_shadow_page_lockless_end(vcpu);
4021 }
4022
4023 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
4024                                 u32 error_code, bool prefault)
4025 {
4026         gfn_t gfn = gva >> PAGE_SHIFT;
4027         int r;
4028
4029         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
4030
4031         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4032                 return RET_PF_EMULATE;
4033
4034         r = mmu_topup_memory_caches(vcpu);
4035         if (r)
4036                 return r;
4037
4038         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4039
4040
4041         return nonpaging_map(vcpu, gva & PAGE_MASK,
4042                              error_code, gfn, prefault);
4043 }
4044
4045 static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
4046 {
4047         struct kvm_arch_async_pf arch;
4048
4049         arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
4050         arch.gfn = gfn;
4051         arch.direct_map = vcpu->arch.mmu->direct_map;
4052         arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
4053
4054         return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
4055 }
4056
4057 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
4058                          gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
4059 {
4060         struct kvm_memory_slot *slot;
4061         bool async;
4062
4063         /*
4064          * Don't expose private memslots to L2.
4065          */
4066         if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
4067                 *pfn = KVM_PFN_NOSLOT;
4068                 return false;
4069         }
4070
4071         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
4072         async = false;
4073         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
4074         if (!async)
4075                 return false; /* *pfn has correct page already */
4076
4077         if (!prefault && kvm_can_do_async_pf(vcpu)) {
4078                 trace_kvm_try_async_get_page(gva, gfn);
4079                 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
4080                         trace_kvm_async_pf_doublefault(gva, gfn);
4081                         kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4082                         return true;
4083                 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
4084                         return true;
4085         }
4086
4087         *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
4088         return false;
4089 }
4090
4091 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4092                                 u64 fault_address, char *insn, int insn_len)
4093 {
4094         int r = 1;
4095
4096         vcpu->arch.l1tf_flush_l1d = true;
4097         switch (vcpu->arch.apf.host_apf_reason) {
4098         default:
4099                 trace_kvm_page_fault(fault_address, error_code);
4100
4101                 if (kvm_event_needs_reinjection(vcpu))
4102                         kvm_mmu_unprotect_page_virt(vcpu, fault_address);
4103                 r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
4104                                 insn_len);
4105                 break;
4106         case KVM_PV_REASON_PAGE_NOT_PRESENT:
4107                 vcpu->arch.apf.host_apf_reason = 0;
4108                 local_irq_disable();
4109                 kvm_async_pf_task_wait(fault_address, 0);
4110                 local_irq_enable();
4111                 break;
4112         case KVM_PV_REASON_PAGE_READY:
4113                 vcpu->arch.apf.host_apf_reason = 0;
4114                 local_irq_disable();
4115                 kvm_async_pf_task_wake(fault_address);
4116                 local_irq_enable();
4117                 break;
4118         }
4119         return r;
4120 }
4121 EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4122
4123 static bool
4124 check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
4125 {
4126         int page_num = KVM_PAGES_PER_HPAGE(level);
4127
4128         gfn &= ~(page_num - 1);
4129
4130         return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
4131 }
4132
4133 static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
4134                           bool prefault)
4135 {
4136         kvm_pfn_t pfn;
4137         int r;
4138         int level;
4139         bool force_pt_level;
4140         gfn_t gfn = gpa >> PAGE_SHIFT;
4141         unsigned long mmu_seq;
4142         int write = error_code & PFERR_WRITE_MASK;
4143         bool map_writable;
4144
4145         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
4146
4147         if (page_fault_handle_page_track(vcpu, error_code, gfn))
4148                 return RET_PF_EMULATE;
4149
4150         r = mmu_topup_memory_caches(vcpu);
4151         if (r)
4152                 return r;
4153
4154         force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
4155                                                            PT_DIRECTORY_LEVEL);
4156         level = mapping_level(vcpu, gfn, &force_pt_level);
4157         if (likely(!force_pt_level)) {
4158                 if (level > PT_DIRECTORY_LEVEL &&
4159                     !check_hugepage_cache_consistency(vcpu, gfn, level))
4160                         level = PT_DIRECTORY_LEVEL;
4161                 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
4162         }
4163
4164         if (fast_page_fault(vcpu, gpa, level, error_code))
4165                 return RET_PF_RETRY;
4166
4167         mmu_seq = vcpu->kvm->mmu_notifier_seq;
4168         smp_rmb();
4169
4170         if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
4171                 return RET_PF_RETRY;
4172
4173         if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
4174                 return r;
4175
4176         r = RET_PF_RETRY;
4177         spin_lock(&vcpu->kvm->mmu_lock);
4178         if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
4179                 goto out_unlock;
4180         if (make_mmu_pages_available(vcpu) < 0)
4181                 goto out_unlock;
4182         if (likely(!force_pt_level))
4183                 transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
4184         r = __direct_map(vcpu, gpa, write, map_writable, level, pfn, prefault);
4185 out_unlock:
4186         spin_unlock(&vcpu->kvm->mmu_lock);
4187         kvm_release_pfn_clean(pfn);
4188         return r;
4189 }
4190
4191 static void nonpaging_init_context(struct kvm_vcpu *vcpu,
4192                                    struct kvm_mmu *context)
4193 {
4194         context->page_fault = nonpaging_page_fault;
4195         context->gva_to_gpa = nonpaging_gva_to_gpa;
4196         context->sync_page = nonpaging_sync_page;
4197         context->invlpg = nonpaging_invlpg;
4198         context->update_pte = nonpaging_update_pte;
4199         context->root_level = 0;
4200         context->shadow_root_level = PT32E_ROOT_LEVEL;
4201         context->direct_map = true;
4202         context->nx = false;
4203 }
4204
4205 /*
4206  * Find out if a previously cached root matching the new CR3/role is available.
4207  * The current root is also inserted into the cache.
4208  * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
4209  * returned.
4210  * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
4211  * false is returned. This root should now be freed by the caller.
4212  */
4213 static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4214                                   union kvm_mmu_page_role new_role)
4215 {
4216         uint i;
4217         struct kvm_mmu_root_info root;
4218         struct kvm_mmu *mmu = vcpu->arch.mmu;
4219
4220         root.cr3 = mmu->root_cr3;
4221         root.hpa = mmu->root_hpa;
4222
4223         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4224                 swap(root, mmu->prev_roots[i]);
4225
4226                 if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
4227                     page_header(root.hpa) != NULL &&
4228                     new_role.word == page_header(root.hpa)->role.word)
4229                         break;
4230         }
4231
4232         mmu->root_hpa = root.hpa;
4233         mmu->root_cr3 = root.cr3;
4234
4235         return i < KVM_MMU_NUM_PREV_ROOTS;
4236 }
4237
4238 static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4239                             union kvm_mmu_page_role new_role,
4240                             bool skip_tlb_flush)
4241 {
4242         struct kvm_mmu *mmu = vcpu->arch.mmu;
4243
4244         /*
4245          * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
4246          * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
4247          * later if necessary.
4248          */
4249         if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
4250             mmu->root_level >= PT64_ROOT_4LEVEL) {
4251                 if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
4252                         return false;
4253
4254                 if (cached_root_available(vcpu, new_cr3, new_role)) {
4255                         /*
4256                          * It is possible that the cached previous root page is
4257                          * obsolete because of a change in the MMU generation
4258                          * number. However, changing the generation number is
4259                          * accompanied by KVM_REQ_MMU_RELOAD, which will free
4260                          * the root set here and allocate a new one.
4261                          */
4262                         kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
4263                         if (!skip_tlb_flush) {
4264                                 kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4265                                 kvm_x86_ops->tlb_flush(vcpu, true);
4266                         }
4267
4268                         /*
4269                          * The last MMIO access's GVA and GPA are cached in the
4270                          * VCPU. When switching to a new CR3, that GVA->GPA
4271                          * mapping may no longer be valid. So clear any cached
4272                          * MMIO info even when we don't need to sync the shadow
4273                          * page tables.
4274                          */
4275                         vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4276
4277                         __clear_sp_write_flooding_count(
4278                                 page_header(mmu->root_hpa));
4279
4280                         return true;
4281                 }
4282         }
4283
4284         return false;
4285 }
4286
4287 static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
4288                               union kvm_mmu_page_role new_role,
4289                               bool skip_tlb_flush)
4290 {
4291         if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
4292                 kvm_mmu_free_roots(vcpu, vcpu->arch.mmu,
4293                                    KVM_MMU_ROOT_CURRENT);
4294 }
4295
4296 void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
4297 {
4298         __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
4299                           skip_tlb_flush);
4300 }
4301 EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
4302
/* kvm_mmu->get_cr3 callback: returns the result of kvm_read_cr3(). */
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}
4307
4308 static void inject_page_fault(struct kvm_vcpu *vcpu,
4309                               struct x86_exception *fault)
4310 {
4311         vcpu->arch.mmu->inject_page_fault(vcpu, fault);
4312 }
4313
4314 static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
4315                            unsigned access, int *nr_present)
4316 {
4317         if (unlikely(is_mmio_spte(*sptep))) {
4318                 if (gfn != get_mmio_spte_gfn(*sptep)) {
4319                         mmu_spte_clear_no_track(sptep);
4320                         return true;
4321                 }
4322
4323                 (*nr_present)++;
4324                 mark_mmio_spte(vcpu, sptep, gfn, access);
4325                 return true;
4326         }
4327
4328         return false;
4329 }
4330
4331 static inline bool is_last_gpte(struct kvm_mmu *mmu,
4332                                 unsigned level, unsigned gpte)
4333 {
4334         /*
4335          * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
4336          * If it is clear, there are no large pages at this level, so clear
4337          * PT_PAGE_SIZE_MASK in gpte if that is the case.
4338          */
4339         gpte &= level - mmu->last_nonleaf_level;
4340
4341         /*
4342          * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
4343          * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
4344          * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
4345          */
4346         gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
4347
4348         return gpte & PT_PAGE_SIZE_MASK;
4349 }
4350
4351 #define PTTYPE_EPT 18 /* arbitrary */
4352 #define PTTYPE PTTYPE_EPT
4353 #include "paging_tmpl.h"
4354 #undef PTTYPE
4355
4356 #define PTTYPE 64
4357 #include "paging_tmpl.h"
4358 #undef PTTYPE
4359
4360 #define PTTYPE 32
4361 #include "paging_tmpl.h"
4362 #undef PTTYPE
4363
4364 static void
4365 __reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4366                         struct rsvd_bits_validate *rsvd_check,
4367                         int maxphyaddr, int level, bool nx, bool gbpages,
4368                         bool pse, bool amd)
4369 {
4370         u64 exb_bit_rsvd = 0;
4371         u64 gbpages_bit_rsvd = 0;
4372         u64 nonleaf_bit8_rsvd = 0;
4373
4374         rsvd_check->bad_mt_xwr = 0;
4375
4376         if (!nx)
4377                 exb_bit_rsvd = rsvd_bits(63, 63);
4378         if (!gbpages)
4379                 gbpages_bit_rsvd = rsvd_bits(7, 7);
4380
4381         /*
4382          * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4383          * leaf entries) on AMD CPUs only.
4384          */
4385         if (amd)
4386                 nonleaf_bit8_rsvd = rsvd_bits(8, 8);
4387
4388         switch (level) {
4389         case PT32_ROOT_LEVEL:
4390                 /* no rsvd bits for 2 level 4K page table entries */
4391                 rsvd_check->rsvd_bits_mask[0][1] = 0;
4392                 rsvd_check->rsvd_bits_mask[0][0] = 0;
4393                 rsvd_check->rsvd_bits_mask[1][0] =
4394                         rsvd_check->rsvd_bits_mask[0][0];
4395
4396                 if (!pse) {
4397                         rsvd_check->rsvd_bits_mask[1][1] = 0;
4398                         break;
4399                 }
4400
4401                 if (is_cpuid_PSE36())
4402                         /* 36bits PSE 4MB page */
4403                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
4404                 else
4405                         /* 32 bits PSE 4MB page */
4406                         rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
4407                 break;
4408         case PT32E_ROOT_LEVEL:
4409                 rsvd_check->rsvd_bits_mask[0][2] =
4410                         rsvd_bits(maxphyaddr, 63) |
4411                         rsvd_bits(5, 8) | rsvd_bits(1, 2);      /* PDPTE */
4412                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4413                         rsvd_bits(maxphyaddr, 62);      /* PDE */
4414                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4415                         rsvd_bits(maxphyaddr, 62);      /* PTE */
4416                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4417                         rsvd_bits(maxphyaddr, 62) |
4418                         rsvd_bits(13, 20);              /* large page */
4419                 rsvd_check->rsvd_bits_mask[1][0] =
4420                         rsvd_check->rsvd_bits_mask[0][0];
4421                 break;
4422         case PT64_ROOT_5LEVEL:
4423                 rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd |
4424                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4425                         rsvd_bits(maxphyaddr, 51);
4426                 rsvd_check->rsvd_bits_mask[1][4] =
4427                         rsvd_check->rsvd_bits_mask[0][4];
4428                 /* fall through */
4429         case PT64_ROOT_4LEVEL:
4430                 rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd |
4431                         nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
4432                         rsvd_bits(maxphyaddr, 51);
4433                 rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
4434                         nonleaf_bit8_rsvd | gbpages_bit_rsvd |
4435                         rsvd_bits(maxphyaddr, 51);
4436                 rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
4437                         rsvd_bits(maxphyaddr, 51);
4438                 rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd |
4439                         rsvd_bits(maxphyaddr, 51);
4440                 rsvd_check->rsvd_bits_mask[1][3] =
4441                         rsvd_check->rsvd_bits_mask[0][3];
4442                 rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd |
4443                         gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) |
4444                         rsvd_bits(13, 29);
4445                 rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd |
4446                         rsvd_bits(maxphyaddr, 51) |
4447                         rsvd_bits(13, 20);              /* large page */
4448                 rsvd_check->rsvd_bits_mask[1][0] =
4449                         rsvd_check->rsvd_bits_mask[0][0];
4450                 break;
4451         }
4452 }
4453
4454 static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4455                                   struct kvm_mmu *context)
4456 {
4457         __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
4458                                 cpuid_maxphyaddr(vcpu), context->root_level,
4459                                 context->nx,
4460                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4461                                 is_pse(vcpu), guest_cpuid_is_amd(vcpu));
4462 }
4463
4464 static void
4465 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4466                             int maxphyaddr, bool execonly)
4467 {
4468         u64 bad_mt_xwr;
4469
4470         rsvd_check->rsvd_bits_mask[0][4] =
4471                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4472         rsvd_check->rsvd_bits_mask[0][3] =
4473                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
4474         rsvd_check->rsvd_bits_mask[0][2] =
4475                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4476         rsvd_check->rsvd_bits_mask[0][1] =
4477                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6);
4478         rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
4479
4480         /* large page */
4481         rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
4482         rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
4483         rsvd_check->rsvd_bits_mask[1][2] =
4484                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29);
4485         rsvd_check->rsvd_bits_mask[1][1] =
4486                 rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
4487         rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
4488
4489         bad_mt_xwr = 0xFFull << (2 * 8);        /* bits 3..5 must not be 2 */
4490         bad_mt_xwr |= 0xFFull << (3 * 8);       /* bits 3..5 must not be 3 */
4491         bad_mt_xwr |= 0xFFull << (7 * 8);       /* bits 3..5 must not be 7 */
4492         bad_mt_xwr |= REPEAT_BYTE(1ull << 2);   /* bits 0..2 must not be 010 */
4493         bad_mt_xwr |= REPEAT_BYTE(1ull << 6);   /* bits 0..2 must not be 110 */
4494         if (!execonly) {
4495                 /* bits 0..2 must not be 100 unless VMX capabilities allow it */
4496                 bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
4497         }
4498         rsvd_check->bad_mt_xwr = bad_mt_xwr;
4499 }
4500
4501 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4502                 struct kvm_mmu *context, bool execonly)
4503 {
4504         __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
4505                                     cpuid_maxphyaddr(vcpu), execonly);
4506 }
4507
4508 /*
4509  * the page table on host is the shadow page table for the page
4510  * table in guest or amd nested guest, its mmu features completely
4511  * follow the features in guest.
4512  */
4513 void
4514 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
4515 {
4516         bool uses_nx = context->nx ||
4517                 context->mmu_role.base.smep_andnot_wp;
4518         struct rsvd_bits_validate *shadow_zero_check;
4519         int i;
4520
4521         /*
4522          * Passing "true" to the last argument is okay; it adds a check
4523          * on bit 8 of the SPTEs which KVM doesn't use anyway.
4524          */
4525         shadow_zero_check = &context->shadow_zero_check;
4526         __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4527                                 shadow_phys_bits,
4528                                 context->shadow_root_level, uses_nx,
4529                                 guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
4530                                 is_pse(vcpu), true);
4531
4532         if (!shadow_me_mask)
4533                 return;
4534
4535         for (i = context->shadow_root_level; --i >= 0;) {
4536                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4537                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4538         }
4539
4540 }
4541 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
4542
4543 static inline bool boot_cpu_is_amd(void)
4544 {
4545         WARN_ON_ONCE(!tdp_enabled);
4546         return shadow_x_mask == 0;
4547 }
4548
4549 /*
4550  * the direct page table on host, use as much mmu features as
4551  * possible, however, kvm currently does not do execution-protection.
4552  */
4553 static void
4554 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4555                                 struct kvm_mmu *context)
4556 {
4557         struct rsvd_bits_validate *shadow_zero_check;
4558         int i;
4559
4560         shadow_zero_check = &context->shadow_zero_check;
4561
4562         if (boot_cpu_is_amd())
4563                 __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
4564                                         shadow_phys_bits,
4565                                         context->shadow_root_level, false,
4566                                         boot_cpu_has(X86_FEATURE_GBPAGES),
4567                                         true, true);
4568         else
4569                 __reset_rsvds_bits_mask_ept(shadow_zero_check,
4570                                             shadow_phys_bits,
4571                                             false);
4572
4573         if (!shadow_me_mask)
4574                 return;
4575
4576         for (i = context->shadow_root_level; --i >= 0;) {
4577                 shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
4578                 shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
4579         }
4580 }
4581
4582 /*
4583  * as the comments in reset_shadow_zero_bits_mask() except it
4584  * is the shadow page table for intel nested guest.
4585  */
4586 static void
4587 reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4588                                 struct kvm_mmu *context, bool execonly)
4589 {
4590         __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
4591                                     shadow_phys_bits, execonly);
4592 }
4593
/*
 * Build an 8-bit mask in which bit i (for i = 1..7) is set iff
 * (i & access) is non-zero.  Bit 0 is never set.
 */
#define BYTE_MASK(access) \
	((!!((access) & 1) << 1) | \
	 (!!((access) & 2) << 2) | \
	 (!!((access) & 3) << 3) | \
	 (!!((access) & 4) << 4) | \
	 (!!((access) & 5) << 5) | \
	 (!!((access) & 6) << 6) | \
	 (!!((access) & 7) << 7))
4602
4603
4604 static void update_permission_bitmask(struct kvm_vcpu *vcpu,
4605                                       struct kvm_mmu *mmu, bool ept)
4606 {
4607         unsigned byte;
4608
4609         const u8 x = BYTE_MASK(ACC_EXEC_MASK);
4610         const u8 w = BYTE_MASK(ACC_WRITE_MASK);
4611         const u8 u = BYTE_MASK(ACC_USER_MASK);
4612
4613         bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
4614         bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
4615         bool cr0_wp = is_write_protection(vcpu);
4616
4617         for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
4618                 unsigned pfec = byte << 1;
4619
4620                 /*
4621                  * Each "*f" variable has a 1 bit for each UWX value
4622                  * that causes a fault with the given PFEC.
4623                  */
4624
4625                 /* Faults from writes to non-writable pages */
4626                 u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0;
4627                 /* Faults from user mode accesses to supervisor pages */
4628                 u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0;
4629                 /* Faults from fetches of non-executable pages*/
4630                 u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0;
4631                 /* Faults from kernel mode fetches of user pages */
4632                 u8 smepf = 0;
4633                 /* Faults from kernel mode accesses of user pages */
4634                 u8 smapf = 0;
4635
4636                 if (!ept) {
4637                         /* Faults from kernel mode accesses to user pages */
4638                         u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
4639
4640                         /* Not really needed: !nx will cause pte.nx to fault */
4641                         if (!mmu->nx)
4642                                 ff = 0;
4643
4644                         /* Allow supervisor writes if !cr0.wp */
4645                         if (!cr0_wp)
4646                                 wf = (pfec & PFERR_USER_MASK) ? wf : 0;
4647
4648                         /* Disallow supervisor fetches of user code if cr4.smep */
4649                         if (cr4_smep)
4650                                 smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
4651
4652                         /*
4653                          * SMAP:kernel-mode data accesses from user-mode
4654                          * mappings should fault. A fault is considered
4655                          * as a SMAP violation if all of the following
4656                          * conditions are true:
4657                          *   - X86_CR4_SMAP is set in CR4
4658                          *   - A user page is accessed
4659                          *   - The access is not a fetch
4660                          *   - Page fault in kernel mode
4661                          *   - if CPL = 3 or X86_EFLAGS_AC is clear
4662                          *
4663                          * Here, we cover the first three conditions.
4664                          * The fourth is computed dynamically in permission_fault();
4665                          * PFERR_RSVD_MASK bit will be set in PFEC if the access is
4666                          * *not* subject to SMAP restrictions.
4667                          */
4668                         if (cr4_smap)
4669                                 smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
4670                 }
4671
4672                 mmu->permissions[byte] = ff | uf | wf | smepf | smapf;
4673         }
4674 }
4675
4676 /*
4677 * PKU is an additional mechanism by which the paging controls access to
4678 * user-mode addresses based on the value in the PKRU register.  Protection
4679 * key violations are reported through a bit in the page fault error code.
4680 * Unlike other bits of the error code, the PK bit is not known at the
4681 * call site of e.g. gva_to_gpa; it must be computed directly in
4682 * permission_fault based on two bits of PKRU, on some machine state (CR4,
4683 * CR0, EFER, CPL), and on other bits of the error code and the page tables.
4684 *
4685 * In particular the following conditions come from the error code, the
4686 * page tables and the machine state:
4687 * - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
4688 * - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
4689 * - PK is always zero if U=0 in the page tables
4690 * - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
4691 *
4692 * The PKRU bitmask caches the result of these four conditions.  The error
4693 * code (minus the P bit) and the page table's U bit form an index into the
4694 * PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
4695 * with the two bits of the PKRU register corresponding to the protection key.
4696 * For the first three conditions above the bits will be 00, thus masking
4697 * away both AD and WD.  For all reads or if the last condition holds, WD
4698 * only will be masked away.
4699 */
4700 static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
4701                                 bool ept)
4702 {
4703         unsigned bit;
4704         bool wp;
4705
4706         if (ept) {
4707                 mmu->pkru_mask = 0;
4708                 return;
4709         }
4710
4711         /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
4712         if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
4713                 mmu->pkru_mask = 0;
4714                 return;
4715         }
4716
4717         wp = is_write_protection(vcpu);
4718
4719         for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
4720                 unsigned pfec, pkey_bits;
4721                 bool check_pkey, check_write, ff, uf, wf, pte_user;
4722
4723                 pfec = bit << 1;
4724                 ff = pfec & PFERR_FETCH_MASK;
4725                 uf = pfec & PFERR_USER_MASK;
4726                 wf = pfec & PFERR_WRITE_MASK;
4727
4728                 /* PFEC.RSVD is replaced by ACC_USER_MASK. */
4729                 pte_user = pfec & PFERR_RSVD_MASK;
4730
4731                 /*
4732                  * Only need to check the access which is not an
4733                  * instruction fetch and is to a user page.
4734                  */
4735                 check_pkey = (!ff && pte_user);
4736                 /*
4737                  * write access is controlled by PKRU if it is a
4738                  * user access or CR0.WP = 1.
4739                  */
4740                 check_write = check_pkey && wf && (uf || wp);
4741
4742                 /* PKRU.AD stops both read and write access. */
4743                 pkey_bits = !!check_pkey;
4744                 /* PKRU.WD stops write access. */
4745                 pkey_bits |= (!!check_write) << 1;
4746
4747                 mmu->pkru_mask |= (pkey_bits & 3) << pfec;
4748         }
4749 }
4750
4751 static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
4752 {
4753         unsigned root_level = mmu->root_level;
4754
4755         mmu->last_nonleaf_level = root_level;
4756         if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
4757                 mmu->last_nonleaf_level++;
4758 }
4759
4760 static void paging64_init_context_common(struct kvm_vcpu *vcpu,
4761                                          struct kvm_mmu *context,
4762                                          int level)
4763 {
4764         context->nx = is_nx(vcpu);
4765         context->root_level = level;
4766
4767         reset_rsvds_bits_mask(vcpu, context);
4768         update_permission_bitmask(vcpu, context, false);
4769         update_pkru_bitmask(vcpu, context, false);
4770         update_last_nonleaf_level(vcpu, context);
4771
4772         MMU_WARN_ON(!is_pae(vcpu));
4773         context->page_fault = paging64_page_fault;
4774         context->gva_to_gpa = paging64_gva_to_gpa;
4775         context->sync_page = paging64_sync_page;
4776         context->invlpg = paging64_invlpg;
4777         context->update_pte = paging64_update_pte;
4778         context->shadow_root_level = level;
4779         context->direct_map = false;
4780 }
4781
4782 static void paging64_init_context(struct kvm_vcpu *vcpu,
4783                                   struct kvm_mmu *context)
4784 {
4785         int root_level = is_la57_mode(vcpu) ?
4786                          PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4787
4788         paging64_init_context_common(vcpu, context, root_level);
4789 }
4790
4791 static void paging32_init_context(struct kvm_vcpu *vcpu,
4792                                   struct kvm_mmu *context)
4793 {
4794         context->nx = false;
4795         context->root_level = PT32_ROOT_LEVEL;
4796
4797         reset_rsvds_bits_mask(vcpu, context);
4798         update_permission_bitmask(vcpu, context, false);
4799         update_pkru_bitmask(vcpu, context, false);
4800         update_last_nonleaf_level(vcpu, context);
4801
4802         context->page_fault = paging32_page_fault;
4803         context->gva_to_gpa = paging32_gva_to_gpa;
4804         context->sync_page = paging32_sync_page;
4805         context->invlpg = paging32_invlpg;
4806         context->update_pte = paging32_update_pte;
4807         context->shadow_root_level = PT32E_ROOT_LEVEL;
4808         context->direct_map = false;
4809 }
4810
4811 static void paging32E_init_context(struct kvm_vcpu *vcpu,
4812                                    struct kvm_mmu *context)
4813 {
4814         paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
4815 }
4816
4817 static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu)
4818 {
4819         union kvm_mmu_extended_role ext = {0};
4820
4821         ext.cr0_pg = !!is_paging(vcpu);
4822         ext.cr4_pae = !!is_pae(vcpu);
4823         ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
4824         ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
4825         ext.cr4_pse = !!is_pse(vcpu);
4826         ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE);
4827         ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57);
4828         ext.maxphyaddr = cpuid_maxphyaddr(vcpu);
4829
4830         ext.valid = 1;
4831
4832         return ext;
4833 }
4834
4835 static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
4836                                                    bool base_only)
4837 {
4838         union kvm_mmu_role role = {0};
4839
4840         role.base.access = ACC_ALL;
4841         role.base.nxe = !!is_nx(vcpu);
4842         role.base.cr0_wp = is_write_protection(vcpu);
4843         role.base.smm = is_smm(vcpu);
4844         role.base.guest_mode = is_guest_mode(vcpu);
4845
4846         if (base_only)
4847                 return role;
4848
4849         role.ext = kvm_calc_mmu_role_ext(vcpu);
4850
4851         return role;
4852 }
4853
4854 static union kvm_mmu_role
4855 kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4856 {
4857         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4858
4859         role.base.ad_disabled = (shadow_accessed_mask == 0);
4860         role.base.level = kvm_x86_ops->get_tdp_level(vcpu);
4861         role.base.direct = true;
4862         role.base.gpte_is_8_bytes = true;
4863
4864         return role;
4865 }
4866
4867 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
4868 {
4869         struct kvm_mmu *context = vcpu->arch.mmu;
4870         union kvm_mmu_role new_role =
4871                 kvm_calc_tdp_mmu_root_page_role(vcpu, false);
4872
4873         new_role.base.word &= mmu_base_role_mask.word;
4874         if (new_role.as_u64 == context->mmu_role.as_u64)
4875                 return;
4876
4877         context->mmu_role.as_u64 = new_role.as_u64;
4878         context->page_fault = tdp_page_fault;
4879         context->sync_page = nonpaging_sync_page;
4880         context->invlpg = nonpaging_invlpg;
4881         context->update_pte = nonpaging_update_pte;
4882         context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
4883         context->direct_map = true;
4884         context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
4885         context->get_cr3 = get_cr3;
4886         context->get_pdptr = kvm_pdptr_read;
4887         context->inject_page_fault = kvm_inject_page_fault;
4888
4889         if (!is_paging(vcpu)) {
4890                 context->nx = false;
4891                 context->gva_to_gpa = nonpaging_gva_to_gpa;
4892                 context->root_level = 0;
4893         } else if (is_long_mode(vcpu)) {
4894                 context->nx = is_nx(vcpu);
4895                 context->root_level = is_la57_mode(vcpu) ?
4896                                 PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
4897                 reset_rsvds_bits_mask(vcpu, context);
4898                 context->gva_to_gpa = paging64_gva_to_gpa;
4899         } else if (is_pae(vcpu)) {
4900                 context->nx = is_nx(vcpu);
4901                 context->root_level = PT32E_ROOT_LEVEL;
4902                 reset_rsvds_bits_mask(vcpu, context);
4903                 context->gva_to_gpa = paging64_gva_to_gpa;
4904         } else {
4905                 context->nx = false;
4906                 context->root_level = PT32_ROOT_LEVEL;
4907                 reset_rsvds_bits_mask(vcpu, context);
4908                 context->gva_to_gpa = paging32_gva_to_gpa;
4909         }
4910
4911         update_permission_bitmask(vcpu, context, false);
4912         update_pkru_bitmask(vcpu, context, false);
4913         update_last_nonleaf_level(vcpu, context);
4914         reset_tdp_shadow_zero_bits_mask(vcpu, context);
4915 }
4916
4917 static union kvm_mmu_role
4918 kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only)
4919 {
4920         union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only);
4921
4922         role.base.smep_andnot_wp = role.ext.cr4_smep &&
4923                 !is_write_protection(vcpu);
4924         role.base.smap_andnot_wp = role.ext.cr4_smap &&
4925                 !is_write_protection(vcpu);
4926         role.base.direct = !is_paging(vcpu);
4927         role.base.gpte_is_8_bytes = !!is_pae(vcpu);
4928
4929         if (!is_long_mode(vcpu))
4930                 role.base.level = PT32E_ROOT_LEVEL;
4931         else if (is_la57_mode(vcpu))
4932                 role.base.level = PT64_ROOT_5LEVEL;
4933         else
4934                 role.base.level = PT64_ROOT_4LEVEL;
4935
4936         return role;
4937 }
4938
/*
 * (Re)initialize vcpu->arch.mmu for shadow paging.
 *
 * Computes the new role, masks its base word with mmu_base_role_mask and
 * returns early if the result equals the role already stored in the context.
 * Otherwise the context is initialized by the helper matching the guest's
 * paging mode (non-paging, 64-bit, PAE or 32-bit), the new role is recorded
 * and the shadow zero-bits mask is reset.
 */
4939 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
4940 {
4941         struct kvm_mmu *context = vcpu->arch.mmu;
4942         union kvm_mmu_role new_role =
4943                 kvm_calc_shadow_mmu_root_page_role(vcpu, false);
4944
             /* Nothing to do if the masked role did not change. */
4945         new_role.base.word &= mmu_base_role_mask.word;
4946         if (new_role.as_u64 == context->mmu_role.as_u64)
4947                 return;
4948
4949         if (!is_paging(vcpu))
4950                 nonpaging_init_context(vcpu, context);
4951         else if (is_long_mode(vcpu))
4952                 paging64_init_context(vcpu, context);
4953         else if (is_pae(vcpu))
4954                 paging32E_init_context(vcpu, context);
4955         else
4956                 paging32_init_context(vcpu, context);
4957
4958         context->mmu_role.as_u64 = new_role.as_u64;
4959         reset_shadow_zero_bits_mask(vcpu, context);
4960 }
4961 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
4962
/*
 * Compute the MMU role used when shadowing an EPT hierarchy.
 *
 * @accessed_dirty: stored inverted in role.base.ad_disabled.
 * @execonly:       recorded in role.ext.execonly.
 *
 * The base role is built from fixed values (4-level, 8-byte gptes, not
 * direct, guest_mode, ACC_ALL) plus the SMM flag copied from root_mmu; the
 * extended role comes from kvm_calc_mmu_role_ext() with execonly overridden.
 */
4963 static union kvm_mmu_role
4964 kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
4965                                    bool execonly)
4966 {
4967         union kvm_mmu_role role = {0};
4968
4969         /* SMM flag is inherited from root_mmu */
4970         role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
4971
4972         role.base.level = PT64_ROOT_4LEVEL;
4973         role.base.gpte_is_8_bytes = true;
4974         role.base.direct = false;
4975         role.base.ad_disabled = !accessed_dirty;
4976         role.base.guest_mode = true;
4977         role.base.access = ACC_ALL;
4978
4979         /*
4980          * WP=1 and NOT_WP=1 is an impossible combination, use WP and the
4981          * SMAP variation to denote shadow EPT entries.
4982          */
4983         role.base.cr0_wp = true;
4984         role.base.smap_andnot_wp = true;
4985
4986         role.ext = kvm_calc_mmu_role_ext(vcpu);
4987         role.ext.execonly = execonly;
4988
4989         return role;
4990 }
4991
/*
 * Set up vcpu->arch.mmu as a shadow EPT MMU.
 *
 * @execonly:       passed to the role calculation and to the EPT
 *                  reserved-bits / shadow zero-bits mask setup.
 * @accessed_dirty: passed to the role calculation and stored in
 *                  context->ept_ad.
 * @new_eptp:       handed to __kvm_mmu_new_cr3() together with the new
 *                  base role.
 *
 * If the masked role equals the one already stored in the context, only the
 * __kvm_mmu_new_cr3() call is performed and the context is left untouched.
 */
4992 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
4993                              bool accessed_dirty, gpa_t new_eptp)
4994 {
4995         struct kvm_mmu *context = vcpu->arch.mmu;
4996         union kvm_mmu_role new_role =
4997                 kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
4998                                                    execonly);
4999
             /*
              * Called before the early return below, and with the base role
              * as computed, i.e. before it is masked with mmu_base_role_mask.
              */
5000         __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false);
5001
5002         new_role.base.word &= mmu_base_role_mask.word;
5003         if (new_role.as_u64 == context->mmu_role.as_u64)
5004                 return;
5005
5006         context->shadow_root_level = PT64_ROOT_4LEVEL;
5007
5008         context->nx = true;
5009         context->ept_ad = accessed_dirty;
5010         context->page_fault = ept_page_fault;
5011         context->gva_to_gpa = ept_gva_to_gpa;
5012         context->sync_page = ept_sync_page;
5013         context->invlpg = ept_invlpg;
5014         context->update_pte = ept_update_pte;
5015         context->root_level = PT64_ROOT_4LEVEL;
5016         context->direct_map = false;
5017         context->mmu_role.as_u64 = new_role.as_u64;
5018
5019         update_permission_bitmask(vcpu, context, true);
5020         update_pkru_bitmask(vcpu, context, true);
5021         update_last_nonleaf_level(vcpu, context);
5022         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
5023         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
5024 }
5025 EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5026
/*
 * Initialize vcpu->arch.mmu for software (shadow) paging: run
 * kvm_init_shadow_mmu() and then install the cr3, pdptr and page-fault
 * injection callbacks. The callbacks are assigned on every call, whether or
 * not kvm_init_shadow_mmu() found the role unchanged.
 */
5027 static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
5028 {
5029         struct kvm_mmu *context = vcpu->arch.mmu;
5030
5031         kvm_init_shadow_mmu(vcpu);
5032         context->set_cr3           = kvm_x86_ops->set_cr3;
5033         context->get_cr3           = get_cr3;
5034         context->get_pdptr         = kvm_pdptr_read;
5035         context->inject_page_fault = kvm_inject_page_fault;
5036 }
5037
/*
 * Initialize vcpu->arch.nested_mmu.
 *
 * Returns early if the masked common role equals the one already stored in
 * nested_mmu. Otherwise records the role, installs the get_cr3, get_pdptr
 * and inject_page_fault callbacks, and selects nx, root_level and the
 * *_nested gva_to_gpa handler according to the vCPU's paging mode.
 *
 * No page_fault, sync_page, invlpg or update_pte callbacks are assigned
 * here.
 */
5038 static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
5039 {
5040         union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false);
5041         struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5042
5043         new_role.base.word &= mmu_base_role_mask.word;
5044         if (new_role.as_u64 == g_context->mmu_role.as_u64)
5045                 return;
5046
5047         g_context->mmu_role.as_u64 = new_role.as_u64;
5048         g_context->get_cr3           = get_cr3;
5049         g_context->get_pdptr         = kvm_pdptr_read;
5050         g_context->inject_page_fault = kvm_inject_page_fault;
5051
5052         /*
5053          * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5054          * L1's nested page tables (e.g. EPT12). The nested translation
5055          * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5056          * L2's page tables as the first level of translation and L1's
5057          * nested page tables as the second level of translation. Basically
5058          * the gva_to_gpa functions between mmu and nested_mmu are swapped.
5059          */
             /* The non-paging case is the only one that skips reset_rsvds_bits_mask(). */
5060         if (!is_paging(vcpu)) {
5061                 g_context->nx = false;
5062                 g_context->root_level = 0;
5063                 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
5064         } else if (is_long_mode(vcpu)) {
5065                 g_context->nx = is_nx(vcpu);
5066                 g_context->root_level = is_la57_mode(vcpu) ?
5067                                         PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
5068                 reset_rsvds_bits_mask(vcpu, g_context);
5069                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5070         } else if (is_pae(vcpu)) {
5071                 g_context->nx = is_nx(vcpu);
5072                 g_context->root_level = PT32E_ROOT_LEVEL;
5073                 reset_rsvds_bits_mask(vcpu, g_context);
5074                 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
5075         } else {
5076                 g_context->nx = false;
5077                 g_context->root_level = PT32_ROOT_LEVEL;
5078                 reset_rsvds_bits_mask(vcpu, g_context);
5079                 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
5080         }
5081
5082         update_permission_bitmask(vcpu, g_context, false);
5083         update_pkru_bitmask(vcpu, g_context, false);
5084         update_last_nonleaf_level(vcpu, g_context);
5085 }
5086
/*
 * Initialize the vCPU's MMU context.
 *
 * @reset_roots: when true, mark root_hpa and every prev_roots[] entry of
 *               vcpu->arch.mmu invalid before initializing.
 *
 * Dispatches to the nested, TDP or soft-MMU initializer, in that order of
 * precedence, based on mmu_is_nested() and tdp_enabled.
 */
5087 void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
5088 {
5089         if (reset_roots) {
5090                 uint i;
5091
5092                 vcpu->arch.mmu->root_hpa = INVALID_PAGE;
5093
5094                 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5095                         vcpu->arch.mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5096         }
5097
5098         if (mmu_is_nested(vcpu))
5099                 init_kvm_nested_mmu(vcpu);
5100         else if (tdp_enabled)
5101                 init_kvm_tdp_mmu(vcpu);
5102         else
5103                 init_kvm_softmmu(vcpu);
5104 }
5105 EXPORT_SYMBOL_GPL(kvm_init_mmu);
5106
/*
 * Return the base page role for a root page. The role is computed with
 * base_only == true by the TDP or the shadow role calculation, depending on
 * tdp_enabled; only the .base part is returned.
 */
5107 static union kvm_mmu_page_role
5108 kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
5109 {
5110         union kvm_mmu_role role;
5111
5112         if (tdp_enabled)
5113                 role = kvm_calc_tdp_mmu_root_page_role(vcpu, true);
5114         else
5115                 role = kvm_calc_shadow_mmu_root_page_role(vcpu, true);
5116
5117         return role.base;
5118 }
5119
/*
 * Reset the vCPU's MMU: unload its roots via kvm_mmu_unload(), then
 * reinitialize the MMU context with reset_roots == true.
 */
5120 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
5121 {
5122         kvm_mmu_unload(vcpu);
5123         kvm_init_mmu(vcpu, true);
5124 }
5125 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5126
/*
 * Load the vCPU's MMU: top up the memory caches, allocate roots, sync them,
 * load cr3 and flush the TLB.
 *
 * Returns 0 on success, otherwise the non-zero result of
 * mmu_topup_memory_caches() or mmu_alloc_roots().
 */
5127 int kvm_mmu_load(struct kvm_vcpu *vcpu)
5128 {
5129         int r;
5130
5131         r = mmu_topup_memory_caches(vcpu);
5132         if (r)
5133                 goto out;
5134         r = mmu_alloc_roots(vcpu);
             /*
              * NOTE(review): the sync runs before the result of
              * mmu_alloc_roots() is checked, i.e. also when allocation
              * failed. Presumably harmless -- confirm.
              */
5135         kvm_mmu_sync_roots(vcpu);
5136         if (r)
5137                 goto out;
5138         kvm_mmu_load_cr3(vcpu);
5139         kvm_x86_ops->tlb_flush(vcpu, true);
5140 out:
5141         return r;
5142 }
5143 EXPORT_SYMBOL_GPL(kvm_mmu_load);
5144
/*
 * Free all roots (KVM_MMU_ROOTS_ALL) of both root_mmu and guest_mmu, and
 * warn if either MMU still has a valid root_hpa afterwards.
 */
5145 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5146 {
5147         kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5148         WARN_ON(VALID_PAGE(vcpu->arch.root_mmu.root_hpa));
5149         kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5150         WARN_ON(VALID_PAGE(vcpu->arch.guest_mmu.root_hpa));
5151 }
5152 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
5153
/*
 * Propagate a guest pte write into shadow pte @spte of shadow page @sp.
 *
 * @new: pointer to the new guest pte value, forwarded to update_pte().
 *
 * Only shadow pages at PT_PAGE_TABLE_LEVEL are updated, through the MMU's
 * update_pte() callback. For any other level the function just bumps the
 * mmu_pde_zapped statistic and returns.
 */
5154 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
5155                                   struct kvm_mmu_page *sp, u64 *spte,
5156                                   const void *new)
5157 {
5158         if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
5159                 ++vcpu->kvm->stat.mmu_pde_zapped;
5160                 return;
5161         }
5162
5163         ++vcpu->kvm->stat.mmu_pte_updated;
5164         vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
5165 }
5166
/*
 * Decide whether replacing shadow pte @old with @new needs a remote TLB
 * flush.
 *
 * Returns false if @old was not present. Returns true if @new is not
 * present, if the base-address bits differ, or if some PT64_PERM_MASK bit is
 * set in @old but clear in @new once both have been XORed with
 * shadow_nx_mask.
 */
5167 static bool need_remote_flush(u64 old, u64 new)
5168 {
5169         if (!is_shadow_present_pte(old))
5170                 return false;
5171         if (!is_shadow_present_pte(new))
5172                 return true;
5173         if ((old ^ new) & PT64_BASE_ADDR_MASK)
5174                 return true;
             /*
              * Presumably the XOR inverts the NX bit so that it can be compared
              * like the other permission bits, where "set" grants access --
              * confirm against the definition of shadow_nx_mask.
              */
5175         old ^= shadow_nx_mask;
5176         new ^= shadow_nx_mask;
5177         return (old & ~new & PT64_PERM_MASK) != 0;
5178 }
5179
/*
 * Read back from guest memory the guest pte covering a write.
 *
 * @gpa, @bytes: in/out. For a PAE vCPU performing a 4-byte write they are
 *               widened to the enclosing 8-byte-aligned, 8-byte entry.
 *
 * Returns the value read, or 0 when the (possibly adjusted) size is neither
 * 4 nor 8 bytes or when kvm_vcpu_read_guest_atomic() fails.
 */
5180 static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
5181                                     int *bytes)
5182 {
5183         u64 gentry = 0;
5184         int r;
5185
5186         /*
5187          * Assume that the pte write on a page table of the same type
5188          * as the current vcpu paging mode since we update the sptes only
5189          * when they have the same mode.
5190          */
5191         if (is_pae(vcpu) && *bytes == 4) {
5192                 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
5193                 *gpa &= ~(gpa_t)7;
5194                 *bytes = 8;
5195         }
5196
5197         if (*bytes == 4 || *bytes == 8) {
5198                 r = kvm_vcpu_read_guest_atomic(vcpu, *gpa, &gentry, *bytes);
5199                 if (r)
5200                         gentry = 0;
5201         }
5202
5203         return gentry;
5204 }
5205
5206 /*
5207  * If we're seeing too many writes to a page, it may no longer be a page table,
5208  * or we may be forking, in which case it is better to unmap the page.
5209  */
5210 static bool detect_write_flooding(struct kvm_mmu_page *sp)
5211 {
5212         /*
5213          * Skip write-flooding detection for the sp whose level is 1, because
5214          * it can become unsync, then the guest page is not write-protected.
5215          */
5216         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
5217                 return false;
5218
             /*
              * Each call counts as one write; report flooding once the
              * page's counter has reached 3.
              */
5219         atomic_inc(&sp->write_flooding_count);
5220         return atomic_read(&sp->write_flooding_count) >= 3;
5221 }
5222
5223 /*
5224  * Misaligned accesses are too much trouble to fix up; also, they usually
5225  * indicate a page is not used as a page table.
      *
      * Returns true if the write of @bytes bytes at @gpa spans more than one
      * guest pte of @sp's pte size (8 or 4 bytes) or is shorter than 4 bytes.
      * A single-byte write at a pte-aligned offset is exempt.
5226  */
5227 static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5228                                     int bytes)
5229 {
5230         unsigned offset, pte_size, misaligned;
5231
5232         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
5233                  gpa, bytes, sp->role.word);
5234
5235         offset = offset_in_page(gpa);
5236         pte_size = sp->role.gpte_is_8_bytes ? 8 : 4;
5237
5238         /*
5239          * Sometimes, the OS only writes a single byte to update status
5240          * bits, for example, in linux, andb instruction is used in clear_bit().
5241          */
5242         if (!(offset & (pte_size - 1)) && bytes == 1)
5243                 return false;
5244
             /*
              * Non-zero when the first and last byte written differ in any
              * bit above the pte-size bits, i.e. they lie in different ptes.
              */
5245         misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
5246         misaligned |= bytes < 4;
5247
5248         return misaligned;
5249 }
5250
/*
 * Locate the shadow pte(s) in @sp that correspond to a guest write at @gpa.
 *
 * @nspte: out; number of consecutive sptes affected. Set to 1, or to 2 for
 *         a page of 4-byte gptes at PT32_ROOT_LEVEL.
 *
 * Returns a pointer to the first affected spte, or NULL when, for 4-byte
 * gptes, the scaled offset falls in a quadrant other than
 * sp->role.quadrant.
 */
5251 static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte)
5252 {
5253         unsigned page_offset, quadrant;
5254         u64 *spte;
5255         int level;
5256
5257         page_offset = offset_in_page(gpa);
5258         level = sp->role.level;
5259         *nspte = 1;
5260         if (!sp->role.gpte_is_8_bytes) {
5261                 page_offset <<= 1;      /* 32->64 */
5262                 /*
5263                  * A 32-bit pde maps 4MB while the shadow pdes map
5264                  * only 2MB.  So we need to double the offset again
5265                  * and zap two pdes instead of one.
5266                  */
5267                 if (level == PT32_ROOT_LEVEL) {
5268                         page_offset &= ~7; /* kill rounding error */
5269                         page_offset <<= 1;
5270                         *nspte = 2;
5271                 }
                     /*
                      * The scaled offset may exceed one page: the bits above
                      * PAGE_SHIFT select the quadrant, the rest is the offset
                      * within this shadow page.
                      */
5272                 quadrant = page_offset >> PAGE_SHIFT;
5273                 page_offset &= ~PAGE_MASK;
5274                 if (quadrant != sp->role.quadrant)
5275                         return NULL;
5276         }
5277
5278         spte = &sp->spt[page_offset / sizeof(*spte)];
5279         return spte;
5280 }
5281
/*
 * Handle a guest write of @bytes bytes at @gpa; this function is installed
 * as the track_write callback of the MMU's page-track notifier.
 *
 * @new and @node are not used by the body: the written guest pte is re-read
 * from guest memory via mmu_pte_write_fetch_gpte() instead.
 *
 * Under mmu_lock, every valid indirect shadow page for the written gfn is
 * either zapped (misaligned write or write flooding) or has the affected
 * spte(s) zapped and, when possible, re-created from the new guest pte.
 * TLB flushing and freeing of zapped pages is done by
 * kvm_mmu_flush_or_zap() before the lock is released.
 */
5282 static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
5283                               const u8 *new, int bytes,
5284                               struct kvm_page_track_notifier_node *node)
5285 {
5286         gfn_t gfn = gpa >> PAGE_SHIFT;
5287         struct kvm_mmu_page *sp;
5288         LIST_HEAD(invalid_list);
5289         u64 entry, gentry, *spte;
5290         int npte;
5291         bool remote_flush, local_flush;
5292
5293         /*
5294          * If we don't have indirect shadow pages, it means no page is
5295          * write-protected, so we can exit simply.
5296          */
5297         if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5298                 return;
5299
5300         remote_flush = local_flush = false;
5301
5302         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
5303
5304         /*
5305          * No need to care whether allocation memory is successful
5306          * or not since pte prefetch is skipped if it does not have
5307          * enough objects in the cache.
5308          */
5309         mmu_topup_memory_caches(vcpu);
5310
5311         spin_lock(&vcpu->kvm->mmu_lock);
5312
             /* May widen gpa/bytes to a full 8-byte gpte for PAE guests. */
5313         gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
5314
5315         ++vcpu->kvm->stat.mmu_pte_write;
5316         kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
5317
5318         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
5319                 if (detect_write_misaligned(sp, gpa, bytes) ||
5320                       detect_write_flooding(sp)) {
5321                         kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
5322                         ++vcpu->kvm->stat.mmu_flooded;
5323                         continue;
5324                 }
5325
5326                 spte = get_written_sptes(sp, gpa, &npte);
5327                 if (!spte)
5328                         continue;
5329
5330                 local_flush = true;
5331                 while (npte--) {
5332                         u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
5333
5334                         entry = *spte;
5335                         mmu_page_zap_pte(vcpu->kvm, sp, spte);
                             /*
                              * Re-create the spte only if a guest pte was read,
                              * sp's role matches the current MMU base role
                              * (under mmu_base_role_mask) and rmap_can_add()
                              * allows it.
                              */
5336                         if (gentry &&
5337                               !((sp->role.word ^ base_role)
5338                               & mmu_base_role_mask.word) && rmap_can_add(vcpu))
5339                                 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
5340                         if (need_remote_flush(entry, *spte))
5341                                 remote_flush = true;
5342                         ++spte;
5343                 }
5344         }
5345         kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
5346         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
5347         spin_unlock(&vcpu->kvm->mmu_lock);
5348 }
5349
/*
 * Translate @gva for a read access and unprotect the guest page it maps to.
 *
 * Returns 0 without doing anything when the MMU is direct-mapped, otherwise
 * the return value of kvm_mmu_unprotect_page().
 */
5350 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
5351 {
5352         gpa_t gpa;
5353         int r;
5354
5355         if (vcpu->arch.mmu->direct_map)
5356                 return 0;
5357
             /*
              * NOTE(review): the translation result is used without checking
              * whether the translation failed -- confirm this is intended.
              */
5358         gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
5359
5360         r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
5361
5362         return r;
5363 }
5364 EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
5365
/*
 * Make sure shadow pages are available for allocation.
 *
 * If at least KVM_MIN_FREE_MMU_PAGES are available, nothing is done.
 * Otherwise the oldest shadow pages are zapped until KVM_REFILL_PAGES are
 * available or nothing more can be zapped, and the zap is committed.
 *
 * Returns 0 on success, -ENOSPC if no page at all is available afterwards.
 */
5366 static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
5367 {
5368         LIST_HEAD(invalid_list);
5369
5370         if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
5371                 return 0;
5372
5373         while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
5374                 if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
5375                         break;
5376
5377                 ++vcpu->kvm->stat.mmu_recycled;
5378         }
5379         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
5380
5381         if (!kvm_mmu_available_pages(vcpu->kvm))
5382                 return -ENOSPC;
5383         return 0;
5384 }
5385
/*
 * Top-level handler for a guest page fault.
 *
 * @cr2:        faulting address; with a direct-mapped MMU it is also cached
 *              in vcpu->arch.gpa_val.
 * @error_code: page-fault error code. Only its low 32 bits are passed to
 *              the MMU's page_fault() callback; the full value is used for
 *              the PFERR_RSVD_MASK and PFERR_NESTED_GUEST_PAGE checks.
 * @insn, @insn_len: forwarded to x86_emulate_instruction().
 *
 * Returns 1 when the fault was handled and the guest can be resumed, a
 * negative value when the fault handler failed, or otherwise the result of
 * x86_emulate_instruction().
 */
5386 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
5387                        void *insn, int insn_len)
5388 {
5389         int r, emulation_type = 0;
5390         bool direct = vcpu->arch.mmu->direct_map;
5391
5392         /* With shadow page tables, fault_address contains a GVA or nGPA.  */
5393         if (vcpu->arch.mmu->direct_map) {
5394                 vcpu->arch.gpa_available = true;
5395                 vcpu->arch.gpa_val = cr2;
5396         }
5397
             /*
              * Reserved-bit faults are first offered to the MMIO handler; if it
              * leaves r at RET_PF_INVALID the regular page_fault() path runs.
              */
5398         r = RET_PF_INVALID;
5399         if (unlikely(error_code & PFERR_RSVD_MASK)) {
5400                 r = handle_mmio_page_fault(vcpu, cr2, direct);
5401                 if (r == RET_PF_EMULATE)
5402                         goto emulate;
5403         }
5404
5405         if (r == RET_PF_INVALID) {
5406                 r = vcpu->arch.mmu->page_fault(vcpu, cr2,
5407                                                lower_32_bits(error_code),
5408                                                false);
5409                 WARN_ON(r == RET_PF_INVALID);
5410         }
5411
5412         if (r == RET_PF_RETRY)
5413                 return 1;
5414         if (r < 0)
5415                 return r;
5416
5417         /*
5418          * Before emulating the instruction, check if the error code
5419          * was due to a RO violation while translating the guest page.
5420          * This can occur when using nested virtualization with nested
5421          * paging in both guests. If true, we simply unprotect the page
5422          * and resume the guest.
5423          */
5424         if (vcpu->arch.mmu->direct_map &&
5425             (error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5426                 kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
5427                 return 1;
5428         }
5429
5430         /*
5431          * vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5432          * optimistically try to just unprotect the page and let the processor
5433          * re-execute the instruction that caused the page fault.  Do not allow
5434          * retrying MMIO emulation, as it's not only pointless but could also
5435          * cause us to enter an infinite loop because the processor will keep
5436          * faulting on the non-existent MMIO address.  Retrying an instruction
5437          * from a nested guest is also pointless and dangerous as we are only
5438          * explicitly shadowing L1's page tables, i.e. unprotecting something
5439          * for L1 isn't going to magically fix whatever issue cause L2 to fail.
5440          */
5441         if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
5442                 emulation_type = EMULTYPE_ALLOW_RETRY;
5443 emulate:
5444         /*
5445          * On AMD platforms, under certain conditions insn_len may be zero on #NPF.
5446          * This can happen if a guest gets a page-fault on data access but the HW
5447          * table walker is not able to read the instruction page (e.g instruction
5448          * page is not present in memory). In those cases we simply restart the
5449          * guest, with the exception of AMD Erratum 1096 which is unrecoverable.
5450          */
5451         if (unlikely(insn && !insn_len)) {
5452                 if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
5453                         return 1;
5454         }
5455
5456         return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
5457                                        insn_len);
5458 }
5459 EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5460
/*
 * Invalidate the mapping for @gva: call the MMU's invlpg() callback on the
 * current root and on every valid previous root, flush @gva from the TLB
 * and bump the invlpg statistic. Does nothing for a non-canonical address.
 */
5461 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5462 {
5463         struct kvm_mmu *mmu = vcpu->arch.mmu;
5464         int i;
5465
5466         /* INVLPG on a non-canonical address is a NOP according to the SDM.  */
5467         if (is_noncanonical_address(gva, vcpu))
5468                 return;
5469
5470         mmu->invlpg(vcpu, gva, mmu->root_hpa);
5471
5472         /*
5473          * INVLPG is required to invalidate any global mappings for the VA,
5474          * irrespective of PCID. Since it would take us roughly similar amount
5475          * of work to determine whether any of the prev_root mappings of the VA
5476          * is marked global, or to just sync it blindly, so we might as well
5477          * just always sync it.
5478          *
5479          * Mappings not reachable via the current cr3 or the prev_roots will be
5480          * synced when switching to that cr3, so nothing needs to be done here
5481          * for them.
5482          */
5483         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5484                 if (VALID_PAGE(mmu->prev_roots[i].hpa))
5485                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5486
5487         kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5488         ++vcpu->stat.invlpg;
5489 }
5490 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5491
/*
 * Invalidate @gva in every root associated with @pcid: the current root if
 * @pcid is the active PCID, and each valid previous root whose cr3 carries
 * @pcid.
 *
 * @gva is flushed from the TLB only if at least one root matched; the invlpg
 * statistic is bumped in either case.
 */
5492 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
5493 {
5494         struct kvm_mmu *mmu = vcpu->arch.mmu;
5495         bool tlb_flush = false;
5496         uint i;
5497
5498         if (pcid == kvm_get_active_pcid(vcpu)) {
5499                 mmu->invlpg(vcpu, gva, mmu->root_hpa);
5500                 tlb_flush = true;
5501         }
5502
5503         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5504                 if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
5505                     pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
5506                         mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
5507                         tlb_flush = true;
5508                 }
5509         }
5510
5511         if (tlb_flush)
5512                 kvm_x86_ops->tlb_flush_gva(vcpu, gva);
5513
5514         ++vcpu->stat.invlpg;
5515
5516         /*
5517          * Mappings not reachable via the current cr3 or the prev_roots will be
5518          * synced when switching to that cr3, so nothing needs to be done here
5519          * for them.
5520          */
5521 }
5522 EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
5523
/* Set the global tdp_enabled flag, selecting two-dimensional paging. */
5524 void kvm_enable_tdp(void)
5525 {
5526         tdp_enabled = true;
5527 }
5528 EXPORT_SYMBOL_GPL(kvm_enable_tdp);
5529
/* Clear the global tdp_enabled flag. */
5530 void kvm_disable_tdp(void)
5531 {
5532         tdp_enabled = false;
5533 }
5534 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
5535
5536
/*
 * Callback type applied to each rmap head visited by
 * slot_handle_level_range().
 */
5537 /* The return value indicates if tlb flush on all vcpus is needed. */
5538 typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
5539
5540 /* The caller should hold mmu-lock before calling this function. */
/*
 * Apply @fn to every rmap head of @memslot for levels @start_level through
 * @end_level and gfns @start_gfn through @end_gfn.
 *
 * @lock_flush_tlb: when true, needed remote TLB flushes are issued here
 *                  (before yielding the lock and at the end) and false is
 *                  returned; when false, nothing is flushed here.
 *
 * Returns true if @fn requested a flush that this function has not issued.
 */
5541 static __always_inline bool
5542 slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
5543                         slot_level_handler fn, int start_level, int end_level,
5544                         gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
5545 {
5546         struct slot_rmap_walk_iterator iterator;
5547         bool flush = false;
5548
5549         for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
5550                         end_gfn, &iterator) {
5551                 if (iterator.rmap)
5552                         flush |= fn(kvm, iterator.rmap);
5553
5554                 if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
                             /*
                              * Flush the range walked so far before calling
                              * cond_resched_lock() on mmu_lock.
                              */
5555                         if (flush && lock_flush_tlb) {
5556                                 kvm_flush_remote_tlbs_with_address(kvm,
5557                                                 start_gfn,
5558                                                 iterator.gfn - start_gfn + 1);
5559                                 flush = false;
5560                         }
5561                         cond_resched_lock(&kvm->mmu_lock);
5562                 }
5563         }
5564
5565         if (flush && lock_flush_tlb) {
5566                 kvm_flush_remote_tlbs_with_address(kvm, start_gfn,
5567                                                    end_gfn - start_gfn + 1);
5568                 flush = false;
5569         }
5570
5571         return flush;
5572 }
5573
/*
 * Run slot_handle_level_range() over the whole of @memslot, i.e. from
 * base_gfn to base_gfn + npages - 1, for levels @start_level..@end_level.
 */
5574 static __always_inline bool
5575 slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5576                   slot_level_handler fn, int start_level, int end_level,
5577                   bool lock_flush_tlb)
5578 {
5579         return slot_handle_level_range(kvm, memslot, fn, start_level,
5580                         end_level, memslot->base_gfn,
5581                         memslot->base_gfn + memslot->npages - 1,
5582                         lock_flush_tlb);
5583 }
5584
/*
 * Apply @fn to the whole of @memslot at every level from
 * PT_PAGE_TABLE_LEVEL up to PT_MAX_HUGEPAGE_LEVEL.
 */
5585 static __always_inline bool
5586 slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5587                       slot_level_handler fn, bool lock_flush_tlb)
5588 {
5589         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5590                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5591 }
5592
/*
 * Apply @fn to the whole of @memslot at the large-page levels only, i.e.
 * from PT_PAGE_TABLE_LEVEL + 1 up to PT_MAX_HUGEPAGE_LEVEL.
 */
5593 static __always_inline bool
5594 slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
5595                         slot_level_handler fn, bool lock_flush_tlb)
5596 {
5597         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
5598                                  PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
5599 }
5600
/*
 * Apply @fn to the whole of @memslot at PT_PAGE_TABLE_LEVEL only.
 */
5601 static __always_inline bool
5602 slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
5603                  slot_level_handler fn, bool lock_flush_tlb)
5604 {
5605         return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
5606                                  PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
5607 }
5608
/* Free the pae_root and lm_root pages of @mmu. */
5609 static void free_mmu_pages(struct kvm_mmu *mmu)
5610 {
5611         free_page((unsigned long)mmu->pae_root);
5612         free_page((unsigned long)mmu->lm_root);
5613 }
5614
/*
 * Allocate the PAE root table (pae_root) for @mmu and set its four entries
 * to INVALID_PAGE. Allocation is skipped when TDP is enabled and the TDP
 * level is above PT32E_ROOT_LEVEL. lm_root is not allocated here.
 *
 * Returns 0 on success (including the skipped case), -ENOMEM if the page
 * allocation fails.
 */
5615 static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
5616 {
5617         struct page *page;
5618         int i;
5619
5620         /*
5621          * When using PAE paging, the four PDPTEs are treated as 'root' pages,
5622          * while the PDP table is a per-vCPU construct that's allocated at MMU
5623          * creation.  When emulating 32-bit mode, cr3 is only 32 bits even on
5624          * x86_64.  Therefore we need to allocate the PDP table in the first
5625          * 4GB of memory, which happens to fit the DMA32 zone.  Except for
5626          * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can
5627          * skip allocating the PDP table.
5628          */
5629         if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
5630                 return 0;
5631
5632         page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
5633         if (!page)
5634                 return -ENOMEM;
5635
5636         mmu->pae_root = page_address(page);
5637         for (i = 0; i < 4; ++i)
5638                 mmu->pae_root[i] = INVALID_PAGE;
5639
5640         return 0;
5641 }
5642
/*
 * Set up the MMU state of a vCPU: point arch.mmu and arch.walk_mmu at
 * root_mmu, mark the roots of root_mmu and guest_mmu invalid, install the
 * translate_gpa callbacks and allocate the pages of guest_mmu and root_mmu.
 *
 * Returns 0 on success or the error from alloc_mmu_pages(). If the
 * allocation for root_mmu fails, the pages of guest_mmu are freed again.
 */
5643 int kvm_mmu_create(struct kvm_vcpu *vcpu)
5644 {
5645         uint i;
5646         int ret;
5647
5648         vcpu->arch.mmu = &vcpu->arch.root_mmu;
5649         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
5650
5651         vcpu->arch.root_mmu.root_hpa = INVALID_PAGE;
5652         vcpu->arch.root_mmu.root_cr3 = 0;
5653         vcpu->arch.root_mmu.translate_gpa = translate_gpa;
5654         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5655                 vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5656
5657         vcpu->arch.guest_mmu.root_hpa = INVALID_PAGE;
5658         vcpu->arch.guest_mmu.root_cr3 = 0;
5659         vcpu->arch.guest_mmu.translate_gpa = translate_gpa;
5660         for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
5661                 vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
5662
             /* nested_mmu only gets a translate_gpa callback here; no roots. */
5663         vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5664
5665         ret = alloc_mmu_pages(vcpu, &vcpu->arch.guest_mmu);
5666         if (ret)
5667                 return ret;
5668
5669         ret = alloc_mmu_pages(vcpu, &vcpu->arch.root_mmu);
5670         if (ret)
5671                 goto fail_allocate_root;
5672
5673         return ret;
5674  fail_allocate_root:
5675         free_mmu_pages(&vcpu->arch.guest_mmu);
5676         return ret;
5677 }
5678
/*
 * Minimum number of pages zapped by kvm_zap_obsolete_pages() before it
 * considers yielding mmu_lock.
 */
5679 #define BATCH_ZAP_PAGES 10
/*
 * Zap every obsolete shadow page (per is_obsolete_sp()) on
 * kvm->arch.active_mmu_pages, walking the list from its tail, then commit
 * the pages collected on kvm->arch.zapped_obsolete_pages.
 *
 * The walk restarts from the tail whenever cond_resched_lock() returns
 * non-zero or __kvm_mmu_prepare_zap_page() returns true.
 */
5680 static void kvm_zap_obsolete_pages(struct kvm *kvm)
5681 {
5682         struct kvm_mmu_page *sp, *node;
5683         int nr_zapped, batch = 0;
5684
5685 restart:
5686         list_for_each_entry_safe_reverse(sp, node,
5687               &kvm->arch.active_mmu_pages, link) {
5688                 /*
5689                  * No obsolete valid page exists before a newly created page
5690                  * since active_mmu_pages is a FIFO list.
5691                  */
5692                 if (!is_obsolete_sp(kvm, sp))
5693                         break;
5694
5695                 /*
5696                  * Since we are reversely walking the list and the invalid
5697                  * list will be moved to the head, skipping the invalid page
5698                  * helps us avoid walking the list infinitely.
5699                  */
5700                 if (sp->role.invalid)
5701                         continue;
5702
5703                 /*
5704                  * No need to flush the TLB since we're only zapping shadow
5705                  * pages with an obsolete generation number and all vCPUS have
5706                  * loaded a new root, i.e. the shadow pages being zapped cannot
5707                  * be in active use by the guest.
5708                  */
5709                 if (batch >= BATCH_ZAP_PAGES &&
5710                     cond_resched_lock(&kvm->mmu_lock)) {
5711                         batch = 0;
5712                         goto restart;
5713                 }
5714
5715                 if (__kvm_mmu_prepare_zap_page(kvm, sp,
5716                                 &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
5717                         batch += nr_zapped;
5718                         goto restart;
5719                 }
5720         }
5721
5722         /*
5723          * Trigger a remote TLB flush before freeing the page tables to ensure
5724          * KVM is not in the middle of a lockless shadow page table walk, which
5725          * may reference the pages.
5726          */
5727         kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
5728 }
5729
5730 /*
5731  * Fast invalidate all shadow pages and use lock-break technique
5732  * to zap obsolete pages.
5733  *
5734  * It's required when memslot is being deleted or VM is being
5735  * destroyed, in these cases, we should ensure that KVM MMU does
5736  * not use any resource of the being-deleted slot or all slots
5737  * after calling the function.
      *
      * Must be called with kvm->slots_lock held (asserted via lockdep);
      * kvm->mmu_lock is taken and released inside the function.
5738  */
5739 static void kvm_mmu_zap_all_fast(struct kvm *kvm)
5740 {
5741         lockdep_assert_held(&kvm->slots_lock);
5742
5743         spin_lock(&kvm->mmu_lock);
5744         trace_kvm_mmu_zap_all_fast(kvm);
5745
5746         /*
5747          * Toggle mmu_valid_gen between '0' and '1'.  Because slots_lock is
5748          * held for the entire duration of zapping obsolete pages, it's
5749          * impossible for there to be multiple invalid generations associated
5750          * with *valid* shadow pages at any given time, i.e. there is exactly
5751          * one valid generation and (at most) one invalid generation.
5752          */
5753         kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? 0 : 1;
5754
5755         /*
5756          * Notify all vcpus to reload its shadow page table and flush TLB.
5757          * Then all vcpus will switch to new shadow page table with the new
5758          * mmu_valid_gen.
5759          *
5760          * Note: we need to do this under the protection of mmu_lock,
5761          * otherwise, vcpu would purge shadow page but miss tlb flush.
5762          */
5763         kvm_reload_remote_mmus(kvm);
5764
5765         kvm_zap_obsolete_pages(kvm);
5766         spin_unlock(&kvm->mmu_lock);
5767 }
5768
5769 static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
5770 {
5771         return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
5772 }
5773
/*
 * Page-track "flush slot" callback (installed by kvm_mmu_init_vm()).  The
 * @slot and @node arguments are not used: the whole MMU is invalidated via
 * the fast zap path regardless of which slot triggered the callback.
 */
static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
			struct kvm_memory_slot *slot,
			struct kvm_page_track_notifier_node *node)
{
	kvm_mmu_zap_all_fast(kvm);
}
5780
5781 void kvm_mmu_init_vm(struct kvm *kvm)
5782 {
5783         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5784
5785         node->track_write = kvm_mmu_pte_write;
5786         node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
5787         kvm_page_track_register_notifier(kvm, node);
5788 }
5789
5790 void kvm_mmu_uninit_vm(struct kvm *kvm)
5791 {
5792         struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
5793
5794         kvm_page_track_unregister_notifier(kvm, node);
5795 }
5796
5797 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
5798 {
5799         struct kvm_memslots *slots;
5800         struct kvm_memory_slot *memslot;
5801         int i;
5802
5803         spin_lock(&kvm->mmu_lock);
5804         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
5805                 slots = __kvm_memslots(kvm, i);
5806                 kvm_for_each_memslot(memslot, slots) {
5807                         gfn_t start, end;
5808
5809                         start = max(gfn_start, memslot->base_gfn);
5810                         end = min(gfn_end, memslot->base_gfn + memslot->npages);
5811                         if (start >= end)
5812                                 continue;
5813
5814                         slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
5815                                                 PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
5816                                                 start, end - 1, true);
5817                 }
5818         }
5819
5820         spin_unlock(&kvm->mmu_lock);
5821 }
5822
5823 static bool slot_rmap_write_protect(struct kvm *kvm,
5824                                     struct kvm_rmap_head *rmap_head)
5825 {
5826         return __rmap_write_protect(kvm, rmap_head, false);
5827 }
5828
5829 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
5830                                       struct kvm_memory_slot *memslot)
5831 {
5832         bool flush;
5833
5834         spin_lock(&kvm->mmu_lock);
5835         flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
5836                                       false);
5837         spin_unlock(&kvm->mmu_lock);
5838
5839         /*
5840          * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
5841          * which do tlb flush out of mmu-lock should be serialized by
5842          * kvm->slots_lock otherwise tlb flush would be missed.
5843          */
5844         lockdep_assert_held(&kvm->slots_lock);
5845
5846         /*
5847          * We can flush all the TLBs out of the mmu lock without TLB
5848          * corruption since we just change the spte from writable to
5849          * readonly so that we only need to care the case of changing
5850          * spte from present to present (changing the spte from present
5851          * to nonpresent will flush all the TLBs immediately), in other
5852          * words, the only case we care is mmu_spte_update() where we
5853          * have checked SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE
5854          * instead of PT_WRITABLE_MASK, that means it does not depend
5855          * on PT_WRITABLE_MASK anymore.
5856          */
5857         if (flush)
5858                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5859                         memslot->npages);
5860 }
5861
5862 static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
5863                                          struct kvm_rmap_head *rmap_head)
5864 {
5865         u64 *sptep;
5866         struct rmap_iterator iter;
5867         int need_tlb_flush = 0;
5868         kvm_pfn_t pfn;
5869         struct kvm_mmu_page *sp;
5870
5871 restart:
5872         for_each_rmap_spte(rmap_head, &iter, sptep) {
5873                 sp = page_header(__pa(sptep));
5874                 pfn = spte_to_pfn(*sptep);
5875
5876                 /*
5877                  * We cannot do huge page mapping for indirect shadow pages,
5878                  * which are found on the last rmap (level = 1) when not using
5879                  * tdp; such shadow pages are synced with the page table in
5880                  * the guest, and the guest page table is using 4K page size
5881                  * mapping if the indirect sp has level = 1.
5882                  */
5883                 if (sp->role.direct &&
5884                         !kvm_is_reserved_pfn(pfn) &&
5885                         PageTransCompoundMap(pfn_to_page(pfn))) {
5886                         pte_list_remove(rmap_head, sptep);
5887
5888                         if (kvm_available_flush_tlb_with_range())
5889                                 kvm_flush_remote_tlbs_with_address(kvm, sp->gfn,
5890                                         KVM_PAGES_PER_HPAGE(sp->role.level));
5891                         else
5892                                 need_tlb_flush = 1;
5893
5894                         goto restart;
5895                 }
5896         }
5897
5898         return need_tlb_flush;
5899 }
5900
5901 void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
5902                                    const struct kvm_memory_slot *memslot)
5903 {
5904         /* FIXME: const-ify all uses of struct kvm_memory_slot.  */
5905         spin_lock(&kvm->mmu_lock);
5906         slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
5907                          kvm_mmu_zap_collapsible_spte, true);
5908         spin_unlock(&kvm->mmu_lock);
5909 }
5910
5911 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
5912                                    struct kvm_memory_slot *memslot)
5913 {
5914         bool flush;
5915
5916         spin_lock(&kvm->mmu_lock);
5917         flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
5918         spin_unlock(&kvm->mmu_lock);
5919
5920         lockdep_assert_held(&kvm->slots_lock);
5921
5922         /*
5923          * It's also safe to flush TLBs out of mmu lock here as currently this
5924          * function is only used for dirty logging, in which case flushing TLB
5925          * out of mmu lock also guarantees no dirty pages will be lost in
5926          * dirty_bitmap.
5927          */
5928         if (flush)
5929                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5930                                 memslot->npages);
5931 }
5932 EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
5933
5934 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
5935                                         struct kvm_memory_slot *memslot)
5936 {
5937         bool flush;
5938
5939         spin_lock(&kvm->mmu_lock);
5940         flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
5941                                         false);
5942         spin_unlock(&kvm->mmu_lock);
5943
5944         /* see kvm_mmu_slot_remove_write_access */
5945         lockdep_assert_held(&kvm->slots_lock);
5946
5947         if (flush)
5948                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5949                                 memslot->npages);
5950 }
5951 EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
5952
5953 void kvm_mmu_slot_set_dirty(struct kvm *kvm,
5954                             struct kvm_memory_slot *memslot)
5955 {
5956         bool flush;
5957
5958         spin_lock(&kvm->mmu_lock);
5959         flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
5960         spin_unlock(&kvm->mmu_lock);
5961
5962         lockdep_assert_held(&kvm->slots_lock);
5963
5964         /* see kvm_mmu_slot_leaf_clear_dirty */
5965         if (flush)
5966                 kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn,
5967                                 memslot->npages);
5968 }
5969 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
5970
5971 void kvm_mmu_zap_all(struct kvm *kvm)
5972 {
5973         struct kvm_mmu_page *sp, *node;
5974         LIST_HEAD(invalid_list);
5975         int ign;
5976
5977         spin_lock(&kvm->mmu_lock);
5978 restart:
5979         list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
5980                 if (sp->role.invalid && sp->root_count)
5981                         continue;
5982                 if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign))
5983                         goto restart;
5984                 if (cond_resched_lock(&kvm->mmu_lock))
5985                         goto restart;
5986         }
5987
5988         kvm_mmu_commit_zap_page(kvm, &invalid_list);
5989         spin_unlock(&kvm->mmu_lock);
5990 }
5991
5992 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
5993 {
5994         WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
5995
5996         gen &= MMIO_SPTE_GEN_MASK;
5997
5998         /*
5999          * Generation numbers are incremented in multiples of the number of
6000          * address spaces in order to provide unique generations across all
6001          * address spaces.  Strip what is effectively the address space
6002          * modifier prior to checking for a wrap of the MMIO generation so
6003          * that a wrap in any address space is detected.
6004          */
6005         gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
6006
6007         /*
6008          * The very rare case: if the MMIO generation number has wrapped,
6009          * zap all shadow pages.
6010          */
6011         if (unlikely(gen == 0)) {
6012                 kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
6013                 kvm_mmu_zap_all_fast(kvm);
6014         }
6015 }
6016
6017 static unsigned long
6018 mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
6019 {
6020         struct kvm *kvm;
6021         int nr_to_scan = sc->nr_to_scan;
6022         unsigned long freed = 0;
6023
6024         mutex_lock(&kvm_lock);
6025
6026         list_for_each_entry(kvm, &vm_list, vm_list) {
6027                 int idx;
6028                 LIST_HEAD(invalid_list);
6029
6030                 /*
6031                  * Never scan more than sc->nr_to_scan VM instances.
6032                  * Will not hit this condition practically since we do not try
6033                  * to shrink more than one VM and it is very unlikely to see
6034                  * !n_used_mmu_pages so many times.
6035                  */
6036                 if (!nr_to_scan--)
6037                         break;
6038                 /*
6039                  * n_used_mmu_pages is accessed without holding kvm->mmu_lock
6040                  * here. We may skip a VM instance errorneosly, but we do not
6041                  * want to shrink a VM that only started to populate its MMU
6042                  * anyway.
6043                  */
6044                 if (!kvm->arch.n_used_mmu_pages &&
6045                     !kvm_has_zapped_obsolete_pages(kvm))
6046                         continue;
6047
6048                 idx = srcu_read_lock(&kvm->srcu);
6049                 spin_lock(&kvm->mmu_lock);
6050
6051                 if (kvm_has_zapped_obsolete_pages(kvm)) {
6052                         kvm_mmu_commit_zap_page(kvm,
6053                               &kvm->arch.zapped_obsolete_pages);
6054                         goto unlock;
6055                 }
6056
6057                 if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
6058                         freed++;
6059                 kvm_mmu_commit_zap_page(kvm, &invalid_list);
6060
6061 unlock:
6062                 spin_unlock(&kvm->mmu_lock);
6063                 srcu_read_unlock(&kvm->srcu, idx);
6064
6065                 /*
6066                  * unfair on small ones
6067                  * per-vm shrinkers cry out
6068                  * sadness comes quickly
6069                  */
6070                 list_move_tail(&kvm->vm_list, &vm_list);
6071                 break;
6072         }
6073
6074         mutex_unlock(&kvm_lock);
6075         return freed;
6076 }
6077
6078 static unsigned long
6079 mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
6080 {
6081         return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
6082 }
6083
6084 static struct shrinker mmu_shrinker = {
6085         .count_objects = mmu_shrink_count,
6086         .scan_objects = mmu_shrink_scan,
6087         .seeks = DEFAULT_SEEKS * 10,
6088 };
6089
6090 static void mmu_destroy_caches(void)
6091 {
6092         kmem_cache_destroy(pte_list_desc_cache);
6093         kmem_cache_destroy(mmu_page_header_cache);
6094 }
6095
6096 static void kvm_set_mmio_spte_mask(void)
6097 {
6098         u64 mask;
6099
6100         /*
6101          * Set the reserved bits and the present bit of an paging-structure
6102          * entry to generate page fault with PFER.RSV = 1.
6103          */
6104
6105         /*
6106          * Mask the uppermost physical address bit, which would be reserved as
6107          * long as the supported physical address width is less than 52.
6108          */
6109         mask = 1ull << 51;
6110
6111         /* Set the present bit. */
6112         mask |= 1ull;
6113
6114         /*
6115          * If reserved bit is not supported, clear the present bit to disable
6116          * mmio page fault.
6117          */
6118         if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
6119                 mask &= ~1ull;
6120
6121         kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
6122 }
6123
6124 int kvm_mmu_module_init(void)
6125 {
6126         int ret = -ENOMEM;
6127
6128         /*
6129          * MMU roles use union aliasing which is, generally speaking, an
6130          * undefined behavior. However, we supposedly know how compilers behave
6131          * and the current status quo is unlikely to change. Guardians below are
6132          * supposed to let us know if the assumption becomes false.
6133          */
6134         BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
6135         BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
6136         BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
6137
6138         kvm_mmu_reset_all_pte_masks();
6139
6140         kvm_set_mmio_spte_mask();
6141
6142         pte_list_desc_cache = kmem_cache_create("pte_list_desc",
6143                                             sizeof(struct pte_list_desc),
6144                                             0, SLAB_ACCOUNT, NULL);
6145         if (!pte_list_desc_cache)
6146                 goto out;
6147
6148         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
6149                                                   sizeof(struct kvm_mmu_page),
6150                                                   0, SLAB_ACCOUNT, NULL);
6151         if (!mmu_page_header_cache)
6152                 goto out;
6153
6154         if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
6155                 goto out;
6156
6157         ret = register_shrinker(&mmu_shrinker);
6158         if (ret)
6159                 goto out;
6160
6161         return 0;
6162
6163 out:
6164         mmu_destroy_caches();
6165         return ret;
6166 }
6167
6168 /*
6169  * Calculate mmu pages needed for kvm.
6170  */
6171 unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm)
6172 {
6173         unsigned long nr_mmu_pages;
6174         unsigned long nr_pages = 0;
6175         struct kvm_memslots *slots;
6176         struct kvm_memory_slot *memslot;
6177         int i;
6178
6179         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
6180                 slots = __kvm_memslots(kvm, i);
6181
6182                 kvm_for_each_memslot(memslot, slots)
6183                         nr_pages += memslot->npages;
6184         }
6185
6186         nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
6187         nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
6188
6189         return nr_mmu_pages;
6190 }
6191
6192 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
6193 {
6194         kvm_mmu_unload(vcpu);
6195         free_mmu_pages(&vcpu->arch.root_mmu);
6196         free_mmu_pages(&vcpu->arch.guest_mmu);
6197         mmu_free_memory_caches(vcpu);
6198 }
6199
6200 void kvm_mmu_module_exit(void)
6201 {
6202         mmu_destroy_caches();
6203         percpu_counter_destroy(&kvm_total_used_mmu_pages);
6204         unregister_shrinker(&mmu_shrinker);
6205         mmu_audit_disable();
6206 }