Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)
diff --combined arch/x86/include/asm/ftrace.h

index 21b66dbf3601aebdd7c9b00927ff184a8ea03159,abbad505dd245a349db3a8996b976146002d186a..a4820d4df617daede213516eea42becd5b874640
--- 1/arch/x86/include/asm/ftrace.h
--- 2/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@@ -52,13 -52,13 +52,13 @@@ int ftrace_int3_handler(struct pt_regs 
    * this screws up the trace output when tracing a ia32 task.
    * Instead of reporting bogus syscalls, just do not trace them.
    *
-  * If the user realy wants these, then they should use the
+  * If the user really wants these, then they should use the
    * raw syscall tracepoints with filtering.
    */
   #define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1
   static inline bool arch_trace_is_compat_syscall(struct pt_regs *regs)
   {
- -      if (is_compat_task())
+ +      if (in_compat_syscall())
                 return true;
         return false;
   }
diff --combined arch/x86/kernel/cpu/amd.c

index 5026a13356c468052e3bd113af2059ef3da5a9ee,b39338c4b2604c0fff4339379c5c5ae0ed7f4036..e51021c9207a8279a88df9a4a2e68444ef1da2cb
--- 1/arch/x86/kernel/cpu/amd.c
--- 2/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@@ -75,17 -75,14 +75,17 @@@ static inline int wrmsrl_amd_safe(unsig
    */
   
   extern __visible void vide(void);
- -__asm__(".globl vide\n\t.align 4\nvide: ret");
+ +__asm__(".globl vide\n"
+ +      ".type vide, @function\n"
+ +      ".align 4\n"
+ +      "vide: ret\n");
   
   static void init_amd_k5(struct cpuinfo_x86 *c)
   {
   #ifdef CONFIG_X86_32
   /*
    * General Systems BIOSen alias the cpu frequency registers
-  * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+  * of the Elan at 0x000df000. Unfortunately, one of the Linux
    * drivers subsequently pokes it, and changes the CPU speed.
    * Workaround : Remove the unneeded alias.
    */
diff --combined arch/x86/kernel/cpu/common.c

index 06ad72383b4eb496373a79af8aad508807d7a02a,62590aa064c83dd3282d7084c0124de681aeee7b..9988caf4216118800307cb813a6d61540f41e127
--- 1/arch/x86/kernel/cpu/common.c
--- 2/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@@ -303,48 -303,6 +303,48 @@@ static __always_inline void setup_smap(
         }
   }
   
+ +/*
+ + * Protection Keys are not available in 32-bit mode.
+ + */
+ +static bool pku_disabled;
+ +
+ +static __always_inline void setup_pku(struct cpuinfo_x86 *c)
+ +{
+ +      if (!cpu_has(c, X86_FEATURE_PKU))
+ +              return;
+ +      if (pku_disabled)
+ +              return;
+ +
+ +      cr4_set_bits(X86_CR4_PKE);
+ +      /*
+ +       * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE
+ +       * cpuid bit to be set.  We need to ensure that we
+ +       * update that bit in this CPU's "cpu_info".
+ +       */
+ +      get_cpu_cap(c);
+ +}
+ +
+ +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+ +static __init int setup_disable_pku(char *arg)
+ +{
+ +      /*
+ +       * Do not clear the X86_FEATURE_PKU bit.  All of the
+ +       * runtime checks are against OSPKE so clearing the
+ +       * bit does nothing.
+ +       *
+ +       * This way, we will see "pku" in cpuinfo, but not
+ +       * "ospke", which is exactly what we want.  It shows
+ +       * that the CPU has PKU, but the OS has not enabled it.
+ +       * This happens to be exactly how a system would look
+ +       * if we disabled the config option.
+ +       */
+ +      pr_info("x86: 'nopku' specified, disabling Memory Protection Keys\n");
+ +      pku_disabled = true;
+ +      return 1;
+ +}
+ +__setup("nopku", setup_disable_pku);
+ +#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
+ +
   /*
    * Some CPU features depend on higher CPUID levels, which may not always
    * be available due to CPUID level capping or broken virtualization
@@@ -667,7 -625,6 +667,7 @@@ void get_cpu_cap(struct cpuinfo_x86 *c
                 c->x86_capability[CPUID_7_0_EBX] = ebx;
   
                 c->x86_capability[CPUID_6_EAX] = cpuid_eax(0x00000006);
+ +              c->x86_capability[CPUID_7_ECX] = ecx;
         }
   
         /* Extended state features: level 0x0000000d */
@@@ -968,7 -925,7 +968,7 @@@ static void identify_cpu(struct cpuinfo
         if (this_cpu->c_identify)
                 this_cpu->c_identify(c);
   
-       /* Clear/Set all flags overriden by options, after probe */
+       /* Clear/Set all flags overridden by options, after probe */
         for (i = 0; i < NCAPINTS; i++) {
                 c->x86_capability[i] &= ~cpu_caps_cleared[i];
                 c->x86_capability[i] |= cpu_caps_set[i];
@@@ -1025,10 -982,9 +1025,10 @@@
         init_hypervisor(c);
         x86_init_rdrand(c);
         x86_init_cache_qos(c);
+ +      setup_pku(c);
   
         /*
-        * Clear/Set all flags overriden by options, need do it
+        * Clear/Set all flags overridden by options, need do it
          * before following smp all cpus cap AND.
          */
         for (i = 0; i < NCAPINTS; i++) {
diff --combined arch/x86/kernel/process_64.c

index dfa2781610e8e435953bd09f5b1e51b8ee5e103a,9f751876066f5c92e99adf7609a542cf9cc3550e..6cbab31ac23a20fb3980f06f88becddff135411b
--- 1/arch/x86/kernel/process_64.c
--- 2/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@@ -48,6 -48,7 +48,7 @@@
   #include <asm/syscalls.h>
   #include <asm/debugreg.h>
   #include <asm/switch_to.h>
+ #include <asm/xen/hypervisor.h>
   
   asmlinkage extern void ret_from_fork(void);
   
@@@ -116,8 -117,6 +117,8 @@@ void __show_regs(struct pt_regs *regs, 
         printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
         printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
   
+ +      if (boot_cpu_has(X86_FEATURE_OSPKE))
+ +              printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
   }
   
   void release_thread(struct task_struct *dead_task)
@@@ -413,6 -412,17 +414,17 @@@ __switch_to(struct task_struct *prev_p
                      task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
                 __switch_to_xtra(prev_p, next_p, tss);
   
+ #ifdef CONFIG_XEN
+       /*
+        * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
+        * current_pt_regs()->flags may not match the current task's
+        * intended IOPL.  We need to switch it manually.
+        */
+       if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
+                    prev->iopl != next->iopl))
+               xen_set_iopl_mask(next->iopl);
+ #endif
+ 
         if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
                 /*
                  * AMD CPUs have a misfeature: SYSRET sets the SS selector but
@@@ -478,7 -488,7 +490,7 @@@ void set_personality_ia32(bool x32
                 if (current->mm)
                         current->mm->context.ia32_compat = TIF_X32;
                 current->personality &= ~READ_IMPLIES_EXEC;
- -              /* is_compat_task() uses the presence of the x32
+ +              /* in_compat_syscall() uses the presence of the x32
                    syscall bit flag to determine compat status */
                 current_thread_info()->status &= ~TS_COMPAT;
         } else {
diff --combined arch/x86/kvm/mmu.c

index 6bdfbc23ecaa8fc779085076bc08f340fc704513,ddb3291d49c90b955c03aa7c09b3c47319a4c110..70e95d097ef104ac489a41a17dce2296dad5123c
--- 1/arch/x86/kvm/mmu.c
--- 2/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@@ -41,7 -41,6 +41,7 @@@
   #include <asm/cmpxchg.h>
   #include <asm/io.h>
   #include <asm/vmx.h>
+ +#include <asm/kvm_page_track.h>
   
   /*
    * When setting this variable to true it enables Two-Dimensional-Paging
@@@ -479,7 -478,7 +479,7 @@@ static bool spte_is_locklessly_modifiab
   static bool spte_has_volatile_bits(u64 spte)
   {
         /*
-        * Always atomicly update spte if it can be updated
+        * Always atomically update spte if it can be updated
          * out of mmu-lock, it can ensure dirty bit is not lost,
          * also, it can help us to get a stable is_writable_pte()
          * to ensure tlb flush is not missed.
@@@ -550,7 -549,7 +550,7 @@@ static bool mmu_spte_update(u64 *sptep
   
         /*
          * For the spte updated out of mmu-lock is safe, since
-        * we always atomicly update it, see the comments in
+        * we always atomically update it, see the comments in
          * spte_has_volatile_bits().
          */
         if (spte_is_locklessly_modifiable(old_spte) &&
@@@ -632,12 -631,12 +632,12 @@@ static void walk_shadow_page_lockless_b
          * kvm_flush_remote_tlbs() IPI to all active vcpus.
          */
         local_irq_disable();
- -      vcpu->mode = READING_SHADOW_PAGE_TABLES;
+ +
         /*
          * Make sure a following spte read is not reordered ahead of the write
          * to vcpu->mode.
          */
- -      smp_mb();
+ +      smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
   }
   
   static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
@@@ -647,7 -646,8 +647,7 @@@
          * reads to sptes.  If it does, kvm_commit_zap_page() can see us
          * OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
          */
- -      smp_mb();
- -      vcpu->mode = OUTSIDE_GUEST_MODE;
+ +      smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
         local_irq_enable();
   }
   
@@@ -776,85 -776,62 +776,85 @@@ static struct kvm_lpage_info *lpage_inf
         return &slot->arch.lpage_info[level - 2][idx];
   }
   
+ +static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
+ +                                          gfn_t gfn, int count)
+ +{
+ +      struct kvm_lpage_info *linfo;
+ +      int i;
+ +
+ +      for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
+ +              linfo = lpage_info_slot(gfn, slot, i);
+ +              linfo->disallow_lpage += count;
+ +              WARN_ON(linfo->disallow_lpage < 0);
+ +      }
+ +}
+ +
+ +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ +{
+ +      update_gfn_disallow_lpage_count(slot, gfn, 1);
+ +}
+ +
+ +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
+ +{
+ +      update_gfn_disallow_lpage_count(slot, gfn, -1);
+ +}
+ +
   static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
   {
         struct kvm_memslots *slots;
         struct kvm_memory_slot *slot;
- -      struct kvm_lpage_info *linfo;
         gfn_t gfn;
- -      int i;
   
+ +      kvm->arch.indirect_shadow_pages++;
         gfn = sp->gfn;
         slots = kvm_memslots_for_spte_role(kvm, sp->role);
         slot = __gfn_to_memslot(slots, gfn);
- -      for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- -              linfo = lpage_info_slot(gfn, slot, i);
- -              linfo->write_count += 1;
- -      }
- -      kvm->arch.indirect_shadow_pages++;
+ +
+ +      /* Non-leaf shadow pages are kept read-only. */
+ +      if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ +              return kvm_slot_page_track_add_page(kvm, slot, gfn,
+ +                                                  KVM_PAGE_TRACK_WRITE);
+ +
+ +      kvm_mmu_gfn_disallow_lpage(slot, gfn);
   }
   
   static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
   {
         struct kvm_memslots *slots;
         struct kvm_memory_slot *slot;
- -      struct kvm_lpage_info *linfo;
         gfn_t gfn;
- -      int i;
   
+ +      kvm->arch.indirect_shadow_pages--;
         gfn = sp->gfn;
         slots = kvm_memslots_for_spte_role(kvm, sp->role);
         slot = __gfn_to_memslot(slots, gfn);
- -      for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
- -              linfo = lpage_info_slot(gfn, slot, i);
- -              linfo->write_count -= 1;
- -              WARN_ON(linfo->write_count < 0);
- -      }
- -      kvm->arch.indirect_shadow_pages--;
+ +      if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ +              return kvm_slot_page_track_remove_page(kvm, slot, gfn,
+ +                                                     KVM_PAGE_TRACK_WRITE);
+ +
+ +      kvm_mmu_gfn_allow_lpage(slot, gfn);
   }
   
- -static int __has_wrprotected_page(gfn_t gfn, int level,
- -                                struct kvm_memory_slot *slot)
+ +static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
+ +                                        struct kvm_memory_slot *slot)
   {
         struct kvm_lpage_info *linfo;
   
         if (slot) {
                 linfo = lpage_info_slot(gfn, slot, level);
- -              return linfo->write_count;
+ +              return !!linfo->disallow_lpage;
         }
   
- -      return 1;
+ +      return true;
   }
   
- -static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+ +static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
+ +                                      int level)
   {
         struct kvm_memory_slot *slot;
   
         slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- -      return __has_wrprotected_page(gfn, level, slot);
+ +      return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
   }
   
   static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
@@@ -920,7 -897,7 +920,7 @@@ static int mapping_level(struct kvm_vcp
         max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
   
         for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
- -              if (__has_wrprotected_page(large_gfn, level, slot))
+ +              if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
                         break;
   
         return level - 1;
@@@ -1346,29 -1323,23 +1346,29 @@@ void kvm_arch_mmu_enable_log_dirty_pt_m
                 kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
   }
   
- -static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
+ +                                  struct kvm_memory_slot *slot, u64 gfn)
   {
- -      struct kvm_memory_slot *slot;
         struct kvm_rmap_head *rmap_head;
         int i;
         bool write_protected = false;
   
- -      slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
- -
         for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
                 rmap_head = __gfn_to_rmap(gfn, i, slot);
- -              write_protected |= __rmap_write_protect(vcpu->kvm, rmap_head, true);
+ +              write_protected |= __rmap_write_protect(kvm, rmap_head, true);
         }
   
         return write_protected;
   }
   
+ +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
+ +{
+ +      struct kvm_memory_slot *slot;
+ +
+ +      slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+ +      return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
+ +}
+ +
   static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
   {
         u64 *sptep;
@@@ -1783,7 -1754,7 +1783,7 @@@ static void mark_unsync(u64 *spte
   static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
                                struct kvm_mmu_page *sp)
   {
- -      return 1;
+ +      return 0;
   }
   
   static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
@@@ -1869,16 -1840,13 +1869,16 @@@ static int __mmu_unsync_walk(struct kvm
         return nr_unsync_leaf;
   }
   
+ +#define INVALID_INDEX (-1)
+ +
   static int mmu_unsync_walk(struct kvm_mmu_page *sp,
                            struct kvm_mmu_pages *pvec)
   {
+ +      pvec->nr = 0;
         if (!sp->unsync_children)
                 return 0;
   
- -      mmu_pages_add(pvec, sp, 0);
+ +      mmu_pages_add(pvec, sp, INVALID_INDEX);
         return __mmu_unsync_walk(sp, pvec);
   }
   
@@@ -1915,35 -1883,37 +1915,35 @@@ static void kvm_mmu_commit_zap_page(str
                 if ((_sp)->role.direct || (_sp)->role.invalid) {} else
   
   /* @sp->gfn should be write-protected at the call site */
- -static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- -                         struct list_head *invalid_list, bool clear_unsync)
+ +static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ +                          struct list_head *invalid_list)
   {
         if (sp->role.cr4_pae != !!is_pae(vcpu)) {
                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- -              return 1;
+ +              return false;
         }
   
- -      if (clear_unsync)
- -              kvm_unlink_unsync_page(vcpu->kvm, sp);
- -
- -      if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
+ +      if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
                 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- -              return 1;
+ +              return false;
         }
   
- -      kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- -      return 0;
+ +      return true;
   }
   
- -static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
- -                                 struct kvm_mmu_page *sp)
+ +static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
+ +                               struct list_head *invalid_list,
+ +                               bool remote_flush, bool local_flush)
   {
- -      LIST_HEAD(invalid_list);
- -      int ret;
- -
- -      ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
- -      if (ret)
- -              kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ +      if (!list_empty(invalid_list)) {
+ +              kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+ +              return;
+ +      }
   
- -      return ret;
+ +      if (remote_flush)
+ +              kvm_flush_remote_tlbs(vcpu->kvm);
+ +      else if (local_flush)
+ +              kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
   }
   
   #ifdef CONFIG_KVM_MMU_AUDIT
@@@ -1953,38 -1923,46 +1953,38 @@@ static void kvm_mmu_audit(struct kvm_vc
   static void mmu_audit_disable(void) { }
   #endif
   
- -static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ +static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                          struct list_head *invalid_list)
   {
- -      return __kvm_sync_page(vcpu, sp, invalid_list, true);
+ +      kvm_unlink_unsync_page(vcpu->kvm, sp);
+ +      return __kvm_sync_page(vcpu, sp, invalid_list);
   }
   
   /* @gfn should be write-protected at the call site */
- -static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ +static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
+ +                         struct list_head *invalid_list)
   {
         struct kvm_mmu_page *s;
- -      LIST_HEAD(invalid_list);
- -      bool flush = false;
+ +      bool ret = false;
   
         for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
                 if (!s->unsync)
                         continue;
   
                 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- -              kvm_unlink_unsync_page(vcpu->kvm, s);
- -              if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- -                      (vcpu->arch.mmu.sync_page(vcpu, s))) {
- -                      kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
- -                      continue;
- -              }
- -              flush = true;
+ +              ret |= kvm_sync_page(vcpu, s, invalid_list);
         }
   
- -      kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- -      if (flush)
- -              kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ +      return ret;
   }
   
   struct mmu_page_path {
- -      struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
- -      unsigned int idx[PT64_ROOT_LEVEL-1];
+ +      struct kvm_mmu_page *parent[PT64_ROOT_LEVEL];
+ +      unsigned int idx[PT64_ROOT_LEVEL];
   };
   
   #define for_each_sp(pvec, sp, parents, i)                     \
- -              for (i = mmu_pages_next(&pvec, &parents, -1),   \
- -                      sp = pvec.page[i].sp;                   \
+ +              for (i = mmu_pages_first(&pvec, &parents);      \
                         i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});   \
                         i = mmu_pages_next(&pvec, &parents, i))
   
@@@ -1996,43 -1974,19 +1996,43 @@@ static int mmu_pages_next(struct kvm_mm
   
         for (n = i+1; n < pvec->nr; n++) {
                 struct kvm_mmu_page *sp = pvec->page[n].sp;
+ +              unsigned idx = pvec->page[n].idx;
+ +              int level = sp->role.level;
   
- -              if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
- -                      parents->idx[0] = pvec->page[n].idx;
- -                      return n;
- -              }
+ +              parents->idx[level-1] = idx;
+ +              if (level == PT_PAGE_TABLE_LEVEL)
+ +                      break;
   
- -              parents->parent[sp->role.level-2] = sp;
- -              parents->idx[sp->role.level-1] = pvec->page[n].idx;
+ +              parents->parent[level-2] = sp;
         }
   
         return n;
   }
   
+ +static int mmu_pages_first(struct kvm_mmu_pages *pvec,
+ +                         struct mmu_page_path *parents)
+ +{
+ +      struct kvm_mmu_page *sp;
+ +      int level;
+ +
+ +      if (pvec->nr == 0)
+ +              return 0;
+ +
+ +      WARN_ON(pvec->page[0].idx != INVALID_INDEX);
+ +
+ +      sp = pvec->page[0].sp;
+ +      level = sp->role.level;
+ +      WARN_ON(level == PT_PAGE_TABLE_LEVEL);
+ +
+ +      parents->parent[level-2] = sp;
+ +
+ +      /* Also set up a sentinel.  Further entries in pvec are all
+ +       * children of sp, so this element is never overwritten.
+ +       */
+ +      parents->parent[level-1] = NULL;
+ +      return mmu_pages_next(pvec, parents, 0);
+ +}
+ +
   static void mmu_pages_clear_parents(struct mmu_page_path *parents)
   {
         struct kvm_mmu_page *sp;
@@@ -2040,14 -1994,22 +2040,14 @@@
   
         do {
                 unsigned int idx = parents->idx[level];
- -
                 sp = parents->parent[level];
                 if (!sp)
                         return;
   
+ +              WARN_ON(idx == INVALID_INDEX);
                 clear_unsync_child_bit(sp, idx);
                 level++;
- -      } while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
- -}
- -
- -static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
- -                             struct mmu_page_path *parents,
- -                             struct kvm_mmu_pages *pvec)
- -{
- -      parents->parent[parent->role.level-1] = NULL;
- -      pvec->nr = 0;
+ +      } while (!sp->unsync_children);
   }
   
   static void mmu_sync_children(struct kvm_vcpu *vcpu,
@@@ -2058,36 -2020,30 +2058,36 @@@
         struct mmu_page_path parents;
         struct kvm_mmu_pages pages;
         LIST_HEAD(invalid_list);
+ +      bool flush = false;
   
- -      kvm_mmu_pages_init(parent, &parents, &pages);
         while (mmu_unsync_walk(parent, &pages)) {
                 bool protected = false;
   
                 for_each_sp(pages, sp, parents, i)
                         protected |= rmap_write_protect(vcpu, sp->gfn);
   
- -              if (protected)
+ +              if (protected) {
                         kvm_flush_remote_tlbs(vcpu->kvm);
+ +                      flush = false;
+ +              }
   
                 for_each_sp(pages, sp, parents, i) {
- -                      kvm_sync_page(vcpu, sp, &invalid_list);
+ +                      flush |= kvm_sync_page(vcpu, sp, &invalid_list);
                         mmu_pages_clear_parents(&parents);
                 }
- -              kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- -              cond_resched_lock(&vcpu->kvm->mmu_lock);
- -              kvm_mmu_pages_init(parent, &parents, &pages);
+ +              if (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)) {
+ +                      kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+ +                      cond_resched_lock(&vcpu->kvm->mmu_lock);
+ +                      flush = false;
+ +              }
         }
+ +
+ +      kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
   }
   
   static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
   {
- -      sp->write_flooding_count = 0;
+ +      atomic_set(&sp->write_flooding_count,  0);
   }
   
   static void clear_sp_write_flooding_count(u64 *spte)
@@@ -2113,8 -2069,6 +2113,8 @@@ static struct kvm_mmu_page *kvm_mmu_get
         unsigned quadrant;
         struct kvm_mmu_page *sp;
         bool need_sync = false;
+ +      bool flush = false;
+ +      LIST_HEAD(invalid_list);
   
         role = vcpu->arch.mmu.base_role;
         role.level = level;
@@@ -2138,16 -2092,8 +2138,16 @@@
                 if (sp->role.word != role.word)
                         continue;
   
- -              if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
- -                      break;
+ +              if (sp->unsync) {
+ +                      /* The page is good, but __kvm_sync_page might still end
+ +                       * up zapping it.  If so, break in order to rebuild it.
+ +                       */
+ +                      if (!__kvm_sync_page(vcpu, sp, &invalid_list))
+ +                              break;
+ +
+ +                      WARN_ON(!list_empty(&invalid_list));
+ +                      kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ +              }
   
                 if (sp->unsync_children)
                         kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@@ -2166,24 -2112,16 +2166,24 @@@
         hlist_add_head(&sp->hash_link,
                 &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
         if (!direct) {
- -              if (rmap_write_protect(vcpu, gfn))
+ +              /*
+ +               * We should do write protection before syncing pages,
+ +               * otherwise the content of the synced shadow page may
+ +               * be inconsistent with the guest page table.
+ +               */
+ +              account_shadowed(vcpu->kvm, sp);
+ +              if (level == PT_PAGE_TABLE_LEVEL &&
+ +                    rmap_write_protect(vcpu, gfn))
                         kvm_flush_remote_tlbs(vcpu->kvm);
- -              if (level > PT_PAGE_TABLE_LEVEL && need_sync)
- -                      kvm_sync_pages(vcpu, gfn);
   
- -              account_shadowed(vcpu->kvm, sp);
+ +              if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+ +                      flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
         }
         sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
         clear_page(sp->spt);
         trace_kvm_mmu_get_page(sp, true);
+ +
+ +      kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
         return sp;
   }
   
@@@ -2331,6 -2269,7 +2331,6 @@@ static int mmu_zap_unsync_children(stru
         if (parent->role.level == PT_PAGE_TABLE_LEVEL)
                 return 0;
   
- -      kvm_mmu_pages_init(parent, &parents, &pages);
         while (mmu_unsync_walk(parent, &pages)) {
                 struct kvm_mmu_page *sp;
   
@@@ -2339,6 -2278,7 +2339,6 @@@
                         mmu_pages_clear_parents(&parents);
                         zapped++;
                 }
- -              kvm_mmu_pages_init(parent, &parents, &pages);
         }
   
         return zapped;
@@@ -2389,13 -2329,14 +2389,13 @@@ static void kvm_mmu_commit_zap_page(str
                 return;
   
         /*
- -       * wmb: make sure everyone sees our modifications to the page tables
- -       * rmb: make sure we see changes to vcpu->mode
- -       */
- -      smp_mb();
- -
- -      /*
- -       * Wait for all vcpus to exit guest mode and/or lockless shadow
- -       * page table walks.
+ +       * We need to make sure everyone sees our modifications to
+ +       * the page tables and that we see changes to vcpu->mode here. The
+ +       * barrier in kvm_flush_remote_tlbs() achieves this. This pairs
+ +       * with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
+ +       *
+ +       * In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
+ +       * guest mode and/or lockless shadow page table walks.
          */
         kvm_flush_remote_tlbs(kvm);
   
@@@ -2413,8 -2354,8 +2413,8 @@@ static bool prepare_zap_oldest_mmu_page
         if (list_empty(&kvm->arch.active_mmu_pages))
                 return false;
   
- -      sp = list_entry(kvm->arch.active_mmu_pages.prev,
- -                      struct kvm_mmu_page, link);
+ +      sp = list_last_entry(&kvm->arch.active_mmu_pages,
+ +                           struct kvm_mmu_page, link);
         kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
   
         return true;
@@@ -2467,7 -2408,7 +2467,7 @@@ int kvm_mmu_unprotect_page(struct kvm *
   }
   EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
   
- -static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+ +static void kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
   {
         trace_kvm_mmu_unsync_page(sp);
         ++vcpu->kvm->stat.mmu_unsync;
@@@ -2476,26 -2417,37 +2476,26 @@@
         kvm_mmu_mark_parents_unsync(sp);
   }
   
- -static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+ +static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
+ +                                 bool can_unsync)
   {
- -      struct kvm_mmu_page *s;
- -
- -      for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
- -              if (s->unsync)
- -                      continue;
- -              WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
- -              __kvm_unsync_page(vcpu, s);
- -      }
- -}
+ +      struct kvm_mmu_page *sp;
   
- -static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
- -                                bool can_unsync)
- -{
- -      struct kvm_mmu_page *s;
- -      bool need_unsync = false;
+ +      if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ +              return true;
   
- -      for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
+ +      for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                 if (!can_unsync)
- -                      return 1;
+ +                      return true;
   
- -              if (s->role.level != PT_PAGE_TABLE_LEVEL)
- -                      return 1;
+ +              if (sp->unsync)
+ +                      continue;
   
- -              if (!s->unsync)
- -                      need_unsync = true;
+ +              WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
+ +              kvm_unsync_page(vcpu, sp);
         }
- -      if (need_unsync)
- -              kvm_unsync_pages(vcpu, gfn);
- -      return 0;
+ +
+ +      return false;
   }
   
   static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
@@@ -2551,7 -2503,7 +2551,7 @@@ static int set_spte(struct kvm_vcpu *vc
                  * be fixed if guest refault.
                  */
                 if (level > PT_PAGE_TABLE_LEVEL &&
- -                  has_wrprotected_page(vcpu, gfn, level))
+ +                  mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
                         goto done;
   
                 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
@@@ -2816,7 -2768,7 +2816,7 @@@ static void transparent_hugepage_adjust
         if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
             level == PT_PAGE_TABLE_LEVEL &&
             PageTransCompound(pfn_to_page(pfn)) &&
- -          !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
+ +          !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
                 unsigned long mask;
                 /*
                  * mmu_notifier_retry was successful and we hold the
@@@ -2844,16 -2796,20 +2844,16 @@@
   static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
                                 kvm_pfn_t pfn, unsigned access, int *ret_val)
   {
- -      bool ret = true;
- -
         /* The pfn is invalid, report the error! */
         if (unlikely(is_error_pfn(pfn))) {
                 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
- -              goto exit;
+ +              return true;
         }
   
         if (unlikely(is_noslot_pfn(pfn)))
                 vcpu_cache_mmio_info(vcpu, gva, gfn, access);
   
- -      ret = false;
- -exit:
- -      return ret;
+ +      return false;
   }
   
   static bool page_fault_can_be_fast(u32 error_code)
@@@ -3317,7 -3273,7 +3317,7 @@@ static bool is_shadow_zero_bits_set(str
         return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
   }
   
- -static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+ +static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
   {
         if (direct)
                 return vcpu_match_mmio_gpa(vcpu, addr);
@@@ -3376,7 -3332,7 +3376,7 @@@ int handle_mmio_page_fault(struct kvm_v
         u64 spte;
         bool reserved;
   
- -      if (quickly_check_mmio_pf(vcpu, addr, direct))
+ +      if (mmio_info_in_cache(vcpu, addr, direct))
                 return RET_MMIO_PF_EMULATE;
   
         reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
@@@ -3406,53 -3362,20 +3406,53 @@@
   }
   EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
   
+ +static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
+ +                                       u32 error_code, gfn_t gfn)
+ +{
+ +      if (unlikely(error_code & PFERR_RSVD_MASK))
+ +              return false;
+ +
+ +      if (!(error_code & PFERR_PRESENT_MASK) ||
+ +            !(error_code & PFERR_WRITE_MASK))
+ +              return false;
+ +
+ +      /*
+ +       * guest is writing the page which is write tracked which can
+ +       * not be fixed by page fault handler.
+ +       */
+ +      if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
+ +              return true;
+ +
+ +      return false;
+ +}
+ +
+ +static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
+ +{
+ +      struct kvm_shadow_walk_iterator iterator;
+ +      u64 spte;
+ +
+ +      if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ +              return;
+ +
+ +      walk_shadow_page_lockless_begin(vcpu);
+ +      for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ +              clear_sp_write_flooding_count(iterator.sptep);
+ +              if (!is_shadow_present_pte(spte))
+ +                      break;
+ +      }
+ +      walk_shadow_page_lockless_end(vcpu);
+ +}
+ +
   static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                                 u32 error_code, bool prefault)
   {
- -      gfn_t gfn;
+ +      gfn_t gfn = gva >> PAGE_SHIFT;
         int r;
   
         pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
   
- -      if (unlikely(error_code & PFERR_RSVD_MASK)) {
- -              r = handle_mmio_page_fault(vcpu, gva, true);
- -
- -              if (likely(r != RET_MMIO_PF_INVALID))
- -                      return r;
- -      }
+ +      if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ +              return 1;
   
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@@ -3460,6 -3383,7 +3460,6 @@@
   
         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
   
- -      gfn = gva >> PAGE_SHIFT;
   
         return nonpaging_map(vcpu, gva & PAGE_MASK,
                              error_code, gfn, prefault);
@@@ -3536,8 -3460,12 +3536,8 @@@ static int tdp_page_fault(struct kvm_vc
   
         MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
   
- -      if (unlikely(error_code & PFERR_RSVD_MASK)) {
- -              r = handle_mmio_page_fault(vcpu, gpa, true);
- -
- -              if (likely(r != RET_MMIO_PF_INVALID))
- -                      return r;
- -      }
+ +      if (page_fault_handle_page_track(vcpu, error_code, gfn))
+ +              return 1;
   
         r = mmu_topup_memory_caches(vcpu);
         if (r)
@@@ -3630,24 -3558,13 +3630,24 @@@ static bool sync_mmio_spte(struct kvm_v
         return false;
   }
   
- -static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
+ +static inline bool is_last_gpte(struct kvm_mmu *mmu,
+ +                              unsigned level, unsigned gpte)
   {
- -      unsigned index;
+ +      /*
+ +       * PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+ +       * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+ +       * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+ +       */
+ +      gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+ +
+ +      /*
+ +       * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
+ +       * If it is clear, there are no large pages at this level, so clear
+ +       * PT_PAGE_SIZE_MASK in gpte if that is the case.
+ +       */
+ +      gpte &= level - mmu->last_nonleaf_level;
   
- -      index = level - 1;
- -      index |= (gpte & PT_PAGE_SIZE_MASK) >> (PT_PAGE_SIZE_SHIFT - 2);
- -      return mmu->last_pte_bitmap & (1 << index);
+ +      return gpte & PT_PAGE_SIZE_MASK;
   }
   
   #define PTTYPE_EPT 18 /* arbitrary */
@@@ -3921,88 -3838,22 +3921,88 @@@ static void update_permission_bitmask(s
         }
   }
   
- -static void update_last_pte_bitmap(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
- -{
- -      u8 map;
- -      unsigned level, root_level = mmu->root_level;
- -      const unsigned ps_set_index = 1 << 2;  /* bit 2 of index: ps */
- -
- -      if (root_level == PT32E_ROOT_LEVEL)
- -              --root_level;
- -      /* PT_PAGE_TABLE_LEVEL always terminates */
- -      map = 1 | (1 << ps_set_index);
- -      for (level = PT_DIRECTORY_LEVEL; level <= root_level; ++level) {
- -              if (level <= PT_PDPE_LEVEL
- -                  && (mmu->root_level >= PT32E_ROOT_LEVEL || is_pse(vcpu)))
- -                      map |= 1 << (ps_set_index | (level - 1));
+ +/*
+ +* PKU is an additional mechanism by which the paging controls access to
+ +* user-mode addresses based on the value in the PKRU register.  Protection
+ +* key violations are reported through a bit in the page fault error code.
+ +* Unlike other bits of the error code, the PK bit is not known at the
+ +* call site of e.g. gva_to_gpa; it must be computed directly in
+ +* permission_fault based on two bits of PKRU, on some machine state (CR4,
+ +* CR0, EFER, CPL), and on other bits of the error code and the page tables.
+ +*
+ +* In particular the following conditions come from the error code, the
+ +* page tables and the machine state:
+ +* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
+ +* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
+ +* - PK is always zero if U=0 in the page tables
+ +* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
+ +*
+ +* The PKRU bitmask caches the result of these four conditions.  The error
+ +* code (minus the P bit) and the page table's U bit form an index into the
+ +* PKRU bitmask.  Two bits of the PKRU bitmask are then extracted and ANDed
+ +* with the two bits of the PKRU register corresponding to the protection key.
+ +* For the first three conditions above the bits will be 00, thus masking
+ +* away both AD and WD.  For all reads or if the last condition holds, WD
+ +* only will be masked away.
+ +*/
+ +static void update_pkru_bitmask(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+ +                              bool ept)
+ +{
+ +      unsigned bit;
+ +      bool wp;
+ +
+ +      if (ept) {
+ +              mmu->pkru_mask = 0;
+ +              return;
+ +      }
+ +
+ +      /* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
+ +      if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || !is_long_mode(vcpu)) {
+ +              mmu->pkru_mask = 0;
+ +              return;
         }
- -      mmu->last_pte_bitmap = map;
+ +
+ +      wp = is_write_protection(vcpu);
+ +
+ +      for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
+ +              unsigned pfec, pkey_bits;
+ +              bool check_pkey, check_write, ff, uf, wf, pte_user;
+ +
+ +              pfec = bit << 1;
+ +              ff = pfec & PFERR_FETCH_MASK;
+ +              uf = pfec & PFERR_USER_MASK;
+ +              wf = pfec & PFERR_WRITE_MASK;
+ +
+ +              /* PFEC.RSVD is replaced by ACC_USER_MASK. */
+ +              pte_user = pfec & PFERR_RSVD_MASK;
+ +
+ +              /*
+ +               * Only need to check the access which is not an
+ +               * instruction fetch and is to a user page.
+ +               */
+ +              check_pkey = (!ff && pte_user);
+ +              /*
+ +               * write access is controlled by PKRU if it is a
+ +               * user access or CR0.WP = 1.
+ +               */
+ +              check_write = check_pkey && wf && (uf || wp);
+ +
+ +              /* PKRU.AD stops both read and write access. */
+ +              pkey_bits = !!check_pkey;
+ +              /* PKRU.WD stops write access. */
+ +              pkey_bits |= (!!check_write) << 1;
+ +
+ +              mmu->pkru_mask |= (pkey_bits & 3) << pfec;
+ +      }
+ +}
+ +
+ +static void update_last_nonleaf_level(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+ +{
+ +      unsigned root_level = mmu->root_level;
+ +
+ +      mmu->last_nonleaf_level = root_level;
+ +      if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
+ +              mmu->last_nonleaf_level++;
   }
   
   static void paging64_init_context_common(struct kvm_vcpu *vcpu,
@@@ -4014,8 -3865,7 +4014,8 @@@
   
         reset_rsvds_bits_mask(vcpu, context);
         update_permission_bitmask(vcpu, context, false);
- -      update_last_pte_bitmap(vcpu, context);
+ +      update_pkru_bitmask(vcpu, context, false);
+ +      update_last_nonleaf_level(vcpu, context);
   
         MMU_WARN_ON(!is_pae(vcpu));
         context->page_fault = paging64_page_fault;
@@@ -4042,8 -3892,7 +4042,8 @@@ static void paging32_init_context(struc
   
         reset_rsvds_bits_mask(vcpu, context);
         update_permission_bitmask(vcpu, context, false);
- -      update_last_pte_bitmap(vcpu, context);
+ +      update_pkru_bitmask(vcpu, context, false);
+ +      update_last_nonleaf_level(vcpu, context);
   
         context->page_fault = paging32_page_fault;
         context->gva_to_gpa = paging32_gva_to_gpa;
@@@ -4101,8 -3950,7 +4101,8 @@@ static void init_kvm_tdp_mmu(struct kvm
         }
   
         update_permission_bitmask(vcpu, context, false);
- -      update_last_pte_bitmap(vcpu, context);
+ +      update_pkru_bitmask(vcpu, context, false);
+ +      update_last_nonleaf_level(vcpu, context);
         reset_tdp_shadow_zero_bits_mask(vcpu, context);
   }
   
@@@ -4154,7 -4002,6 +4154,7 @@@ void kvm_init_shadow_ept_mmu(struct kvm
         context->direct_map = false;
   
         update_permission_bitmask(vcpu, context, true);
+ +      update_pkru_bitmask(vcpu, context, true);
         reset_rsvds_bits_mask_ept(vcpu, context, execonly);
         reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
   }
@@@ -4209,8 -4056,7 +4209,8 @@@ static void init_kvm_nested_mmu(struct 
         }
   
         update_permission_bitmask(vcpu, g_context, false);
- -      update_last_pte_bitmap(vcpu, g_context);
+ +      update_pkru_bitmask(vcpu, g_context, false);
+ +      update_last_nonleaf_level(vcpu, g_context);
   }
   
   static void init_kvm_mmu(struct kvm_vcpu *vcpu)
@@@ -4281,6 -4127,18 +4281,6 @@@ static bool need_remote_flush(u64 old, 
         return (old & ~new & PT64_PERM_MASK) != 0;
   }
   
- -static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
- -                                  bool remote_flush, bool local_flush)
- -{
- -      if (zap_page)
- -              return;
- -
- -      if (remote_flush)
- -              kvm_flush_remote_tlbs(vcpu->kvm);
- -      else if (local_flush)
- -              kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- -}
- -
   static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa,
                                     const u8 *new, int *bytes)
   {
@@@ -4330,8 -4188,7 +4330,8 @@@ static bool detect_write_flooding(struc
         if (sp->role.level == PT_PAGE_TABLE_LEVEL)
                 return false;
   
- -      return ++sp->write_flooding_count >= 3;
+ +      atomic_inc(&sp->write_flooding_count);
+ +      return atomic_read(&sp->write_flooding_count) >= 3;
   }
   
   /*
@@@ -4393,15 -4250,15 +4393,15 @@@ static u64 *get_written_sptes(struct kv
         return spte;
   }
   
- -void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
- -                     const u8 *new, int bytes)
+ +static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
+ +                            const u8 *new, int bytes)
   {
         gfn_t gfn = gpa >> PAGE_SHIFT;
         struct kvm_mmu_page *sp;
         LIST_HEAD(invalid_list);
         u64 entry, gentry, *spte;
         int npte;
- -      bool remote_flush, local_flush, zap_page;
+ +      bool remote_flush, local_flush;
         union kvm_mmu_page_role mask = { };
   
         mask.cr0_wp = 1;
@@@ -4418,7 -4275,7 +4418,7 @@@
         if (!ACCESS_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
                 return;
   
- -      zap_page = remote_flush = local_flush = false;
+ +      remote_flush = local_flush = false;
   
         pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
   
@@@ -4438,7 -4295,8 +4438,7 @@@
         for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
                 if (detect_write_misaligned(sp, gpa, bytes) ||
                       detect_write_flooding(sp)) {
- -                      zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
- -                                                   &invalid_list);
+ +                      kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
                         ++vcpu->kvm->stat.mmu_flooded;
                         continue;
                 }
@@@ -4460,7 -4318,8 +4460,7 @@@
                         ++spte;
                 }
         }
- -      mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
- -      kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+ +      kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
         kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
         spin_unlock(&vcpu->kvm->mmu_lock);
   }
@@@ -4497,34 -4356,32 +4497,34 @@@ static void make_mmu_pages_available(st
         kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
   }
   
- -static bool is_mmio_page_fault(struct kvm_vcpu *vcpu, gva_t addr)
- -{
- -      if (vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu))
- -              return vcpu_match_mmio_gpa(vcpu, addr);
- -
- -      return vcpu_match_mmio_gva(vcpu, addr);
- -}
- -
   int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
                        void *insn, int insn_len)
   {
         int r, emulation_type = EMULTYPE_RETRY;
         enum emulation_result er;
+ +      bool direct = vcpu->arch.mmu.direct_map || mmu_is_nested(vcpu);
+ +
+ +      if (unlikely(error_code & PFERR_RSVD_MASK)) {
+ +              r = handle_mmio_page_fault(vcpu, cr2, direct);
+ +              if (r == RET_MMIO_PF_EMULATE) {
+ +                      emulation_type = 0;
+ +                      goto emulate;
+ +              }
+ +              if (r == RET_MMIO_PF_RETRY)
+ +                      return 1;
+ +              if (r < 0)
+ +                      return r;
+ +      }
   
         r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
         if (r < 0)
- -              goto out;
- -
- -      if (!r) {
- -              r = 1;
- -              goto out;
- -      }
+ +              return r;
+ +      if (!r)
+ +              return 1;
   
- -      if (is_mmio_page_fault(vcpu, cr2))
+ +      if (mmio_info_in_cache(vcpu, cr2, direct))
                 emulation_type = 0;
- -
+ +emulate:
         er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
   
         switch (er) {
@@@ -4538,6 -4395,8 +4538,6 @@@
         default:
                 BUG();
         }
- -out:
- -      return r;
   }
   EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
   
@@@ -4606,21 -4465,6 +4606,21 @@@ void kvm_mmu_setup(struct kvm_vcpu *vcp
         init_kvm_mmu(vcpu);
   }
   
+ +void kvm_mmu_init_vm(struct kvm *kvm)
+ +{
+ +      struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ +
+ +      node->track_write = kvm_mmu_pte_write;
+ +      kvm_page_track_register_notifier(kvm, node);
+ +}
+ +
+ +void kvm_mmu_uninit_vm(struct kvm *kvm)
+ +{
+ +      struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
+ +
+ +      kvm_page_track_unregister_notifier(kvm, node);
+ +}
+ +
   /* The return value indicates if tlb flush on all vcpus is needed. */
   typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
   
diff --combined arch/x86/kvm/vmx.c

index efc243e4dabfc75d44240fab88052598c0d3da26,1735ae9d684a2a7f029b88dd04415d62c18d9adf..ee1c8a93871c551f9cddc93de56197806d0c0cfd
--- 1/arch/x86/kvm/vmx.c
--- 2/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@@ -598,10 -598,6 +598,10 @@@ struct vcpu_vmx 
         struct page *pml_pg;
   
         u64 current_tsc_ratio;
+ +
+ +      bool guest_pkru_valid;
+ +      u32 guest_pkru;
+ +      u32 host_pkru;
   };
   
   enum segment_cache_field {
@@@ -867,6 -863,7 +867,6 @@@ static unsigned long nested_ept_get_cr3
   static u64 construct_eptp(unsigned long root_hpa);
   static void kvm_cpu_vmxon(u64 addr);
   static void kvm_cpu_vmxoff(void);
- -static bool vmx_mpx_supported(void);
   static bool vmx_xsaves_supported(void);
   static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
   static void vmx_set_segment(struct kvm_vcpu *vcpu,
@@@ -966,36 -963,25 +966,36 @@@ static const u32 vmx_msr_index[] = 
         MSR_EFER, MSR_TSC_AUX, MSR_STAR,
   };
   
- -static inline bool is_page_fault(u32 intr_info)
+ +static inline bool is_exception_n(u32 intr_info, u8 vector)
   {
         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
                              INTR_INFO_VALID_MASK)) ==
- -              (INTR_TYPE_HARD_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
+ +              (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ +}
+ +
+ +static inline bool is_debug(u32 intr_info)
+ +{
+ +      return is_exception_n(intr_info, DB_VECTOR);
+ +}
+ +
+ +static inline bool is_breakpoint(u32 intr_info)
+ +{
+ +      return is_exception_n(intr_info, BP_VECTOR);
+ +}
+ +
+ +static inline bool is_page_fault(u32 intr_info)
+ +{
+ +      return is_exception_n(intr_info, PF_VECTOR);
   }
   
   static inline bool is_no_device(u32 intr_info)
   {
- -      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- -                           INTR_INFO_VALID_MASK)) ==
- -              (INTR_TYPE_HARD_EXCEPTION | NM_VECTOR | INTR_INFO_VALID_MASK);
+ +      return is_exception_n(intr_info, NM_VECTOR);
   }
   
   static inline bool is_invalid_opcode(u32 intr_info)
   {
- -      return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- -                           INTR_INFO_VALID_MASK)) ==
- -              (INTR_TYPE_HARD_EXCEPTION | UD_VECTOR | INTR_INFO_VALID_MASK);
+ +      return is_exception_n(intr_info, UD_VECTOR);
   }
   
   static inline bool is_external_interrupt(u32 intr_info)
@@@ -2111,7 -2097,6 +2111,7 @@@ static void vmx_vcpu_pi_load(struct kvm
         } while (cmpxchg(&pi_desc->control, old.control,
                         new.control) != old.control);
   }
+ +
   /*
    * Switches to specified vcpu, until a matching vcpu_put(), but assumes
    * vcpu mutex is already taken.
@@@ -2172,7 -2157,6 +2172,7 @@@ static void vmx_vcpu_load(struct kvm_vc
         }
   
         vmx_vcpu_pi_load(vcpu, cpu);
+ +      vmx->host_pkru = read_pkru();
   }
   
   static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
@@@ -2292,11 -2276,6 +2292,11 @@@ static void vmx_set_rflags(struct kvm_v
         vmcs_writel(GUEST_RFLAGS, rflags);
   }
   
+ +static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
+ +{
+ +      return to_vmx(vcpu)->guest_pkru;
+ +}
+ +
   static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
   {
         u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@@ -2626,7 -2605,7 +2626,7 @@@ static void nested_vmx_setup_ctls_msrs(
                 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
                 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
   
- -      if (vmx_mpx_supported())
+ +      if (kvm_mpx_supported())
                 vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS;
   
         /* We support free control of debug control saving. */
@@@ -2647,7 -2626,7 +2647,7 @@@
                 VM_ENTRY_LOAD_IA32_PAT;
         vmx->nested.nested_vmx_entry_ctls_high |=
                 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
- -      if (vmx_mpx_supported())
+ +      if (kvm_mpx_supported())
                 vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS;
   
         /* We support free control of debug control loading. */
@@@ -2723,15 -2702,8 +2723,15 @@@
         } else
                 vmx->nested.nested_vmx_ept_caps = 0;
   
+ +      /*
+ +       * Old versions of KVM use the single-context version without
+ +       * checking for support, so declare that it is supported even
+ +       * though it is treated as global context.  The alternative is
+ +       * not failing the single-context invvpid, and it is worse.
+ +       */
         if (enable_vpid)
                 vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+ +                              VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |
                                 VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
         else
                 vmx->nested.nested_vmx_vpid_caps = 0;
@@@ -2898,7 -2870,7 +2898,7 @@@ static int vmx_get_msr(struct kvm_vcpu 
                 msr_info->data = vmcs_readl(GUEST_SYSENTER_ESP);
                 break;
         case MSR_IA32_BNDCFGS:
- -              if (!vmx_mpx_supported())
+ +              if (!kvm_mpx_supported())
                         return 1;
                 msr_info->data = vmcs_read64(GUEST_BNDCFGS);
                 break;
@@@ -2975,7 -2947,7 +2975,7 @@@ static int vmx_set_msr(struct kvm_vcpu 
                 vmcs_writel(GUEST_SYSENTER_ESP, data);
                 break;
         case MSR_IA32_BNDCFGS:
- -              if (!vmx_mpx_supported())
+ +              if (!kvm_mpx_supported())
                         return 1;
                 vmcs_write64(GUEST_BNDCFGS, data);
                 break;
@@@ -3448,7 -3420,7 +3448,7 @@@ static void init_vmcs_shadow_fields(voi
         for (i = j = 0; i < max_shadow_read_write_fields; i++) {
                 switch (shadow_read_write_fields[i]) {
                 case GUEST_BNDCFGS:
- -                      if (!vmx_mpx_supported())
+ +                      if (!kvm_mpx_supported())
                                 continue;
                         break;
                 default:
@@@ -3904,17 -3876,13 +3904,17 @@@ static int vmx_set_cr4(struct kvm_vcpu 
   
         if (!enable_unrestricted_guest && !is_paging(vcpu))
                 /*
- -               * SMEP/SMAP is disabled if CPU is in non-paging mode in
- -               * hardware.  However KVM always uses paging mode without
- -               * unrestricted guest.
- -               * To emulate this behavior, SMEP/SMAP needs to be manually
- -               * disabled when guest switches to non-paging mode.
+ +               * SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
+ +               * hardware.  To emulate this behavior, SMEP/SMAP/PKU needs
+ +               * to be manually disabled when guest switches to non-paging
+ +               * mode.
+ +               *
+ +               * If !enable_unrestricted_guest, the CPU is always running
+ +               * with CR0.PG=1 and CR4 needs to be modified.
+ +               * If enable_unrestricted_guest, the CPU automatically
+ +               * disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
                  */
- -              hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+ +              hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE);
   
         vmcs_writel(CR4_READ_SHADOW, cr4);
         vmcs_writel(GUEST_CR4, hw_cr4);
@@@ -5528,7 -5496,7 +5528,7 @@@ static int handle_set_cr4(struct kvm_vc
                 return kvm_set_cr4(vcpu, val);
   }
   
- /* called to set cr0 as approriate for clts instruction exit. */
+ /* called to set cr0 as appropriate for clts instruction exit. */
   static void handle_clts(struct kvm_vcpu *vcpu)
   {
         if (is_guest_mode(vcpu)) {
@@@ -5661,8 -5629,11 +5661,8 @@@ static int handle_dr(struct kvm_vcpu *v
         }
   
         if (vcpu->guest_debug == 0) {
- -              u32 cpu_based_vm_exec_control;
- -
- -              cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- -              cpu_based_vm_exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- -              vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ +              vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ +                              CPU_BASED_MOV_DR_EXITING);
   
                 /*
                  * No more DR vmexits; force a reload of the debug registers
@@@ -5699,6 -5670,8 +5699,6 @@@ static void vmx_set_dr6(struct kvm_vcp
   
   static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
   {
- -      u32 cpu_based_vm_exec_control;
- -
         get_debugreg(vcpu->arch.db[0], 0);
         get_debugreg(vcpu->arch.db[1], 1);
         get_debugreg(vcpu->arch.db[2], 2);
@@@ -5707,7 -5680,10 +5707,7 @@@
         vcpu->arch.dr7 = vmcs_readl(GUEST_DR7);
   
         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
- -
- -      cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- -      cpu_based_vm_exec_control |= CPU_BASED_MOV_DR_EXITING;
- -      vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ +      vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_MOV_DR_EXITING);
   }
   
   static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val)
@@@ -5792,7 -5768,8 +5792,7 @@@ static int handle_halt(struct kvm_vcpu 
   
   static int handle_vmcall(struct kvm_vcpu *vcpu)
   {
- -      kvm_emulate_hypercall(vcpu);
- -      return 1;
+ +      return kvm_emulate_hypercall(vcpu);
   }
   
   static int handle_invd(struct kvm_vcpu *vcpu)
@@@ -6479,8 -6456,8 +6479,8 @@@ static struct loaded_vmcs *nested_get_c
   
         if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
                 /* Recycle the least recently used VMCS. */
- -              item = list_entry(vmx->nested.vmcs02_pool.prev,
- -                      struct vmcs02_list, list);
+ +              item = list_last_entry(&vmx->nested.vmcs02_pool,
+ +                                     struct vmcs02_list, list);
                 item->vmptr = vmx->nested.current_vmptr;
                 list_move(&item->list, &vmx->nested.vmcs02_pool);
                 return &item->vmcs02;
@@@ -7267,7 -7244,7 +7267,7 @@@ static int handle_vmwrite(struct kvm_vc
         /* The value to write might be 32 or 64 bits, depending on L1's long
          * mode, and eventually we need to write that into a field of several
          * possible lengths. The code below first zero-extends the value to 64
-        * bit (field_value), and then copies only the approriate number of
+        * bit (field_value), and then copies only the appropriate number of
          * bits into the vmcs12 field.
          */
         u64 field_value = 0;
@@@ -7421,7 -7398,6 +7421,7 @@@ static int handle_invept(struct kvm_vcp
         if (!(types & (1UL << type))) {
                 nested_vmx_failValid(vcpu,
                                 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ +              skip_emulated_instruction(vcpu);
                 return 1;
         }
   
@@@ -7480,7 -7456,6 +7480,7 @@@ static int handle_invvpid(struct kvm_vc
         if (!(types & (1UL << type))) {
                 nested_vmx_failValid(vcpu,
                         VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ +              skip_emulated_instruction(vcpu);
                 return 1;
         }
   
@@@ -7497,17 -7472,12 +7497,17 @@@
         }
   
         switch (type) {
+ +      case VMX_VPID_EXTENT_SINGLE_CONTEXT:
+ +              /*
+ +               * Old versions of KVM use the single-context version so we
+ +               * have to support it; just treat it the same as all-context.
+ +               */
         case VMX_VPID_EXTENT_ALL_CONTEXT:
                 __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
                 nested_vmx_succeed(vcpu);
                 break;
         default:
- -              /* Trap single context invalidation invvpid calls */
+ +              /* Trap individual address invalidation invvpid calls */
                 BUG_ON(1);
                 break;
         }
@@@ -7803,13 -7773,6 +7803,13 @@@ static bool nested_vmx_exit_handled(str
                 else if (is_no_device(intr_info) &&
                          !(vmcs12->guest_cr0 & X86_CR0_TS))
                         return false;
+ +              else if (is_debug(intr_info) &&
+ +                       vcpu->guest_debug &
+ +                       (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
+ +                      return false;
+ +              else if (is_breakpoint(intr_info) &&
+ +                       vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
+ +                      return false;
                 return vmcs12->exception_bitmap &
                                 (1u << (intr_info & INTR_INFO_VECTOR_MASK));
         case EXIT_REASON_EXTERNAL_INTERRUPT:
@@@ -8414,7 -8377,6 +8414,7 @@@ static void vmx_complete_atomic_exit(st
   static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
   {
         u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ +      register void *__sp asm(_ASM_SP);
   
         /*
          * If external interrupt exists, IF bit is set in rflags/eflags on the
@@@ -8447,9 -8409,8 +8447,9 @@@
                         "call *%[entry]\n\t"
                         :
   #ifdef CONFIG_X86_64
- -                      [sp]"=&r"(tmp)
+ +                      [sp]"=&r"(tmp),
   #endif
+ +                      "+r"(__sp)
                         :
                         [entry]"r"(entry),
                         [ss]"i"(__KERNEL_DS),
@@@ -8650,9 -8611,6 +8650,9 @@@ static void __noclone vmx_vcpu_run(stru
         if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
                 vmx_set_interrupt_shadow(vcpu, 0);
   
+ +      if (vmx->guest_pkru_valid)
+ +              __write_pkru(vmx->guest_pkru);
+ +
         atomic_switch_perf_msrs(vmx);
         debugctlmsr = get_debugctlmsr();
   
@@@ -8792,20 -8750,6 +8792,20 @@@
   
         vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
   
+ +      /*
+ +       * eager fpu is enabled if PKEY is supported and CR4 is switched
+ +       * back on host, so it is safe to read guest PKRU from current
+ +       * XSAVE.
+ +       */
+ +      if (boot_cpu_has(X86_FEATURE_OSPKE)) {
+ +              vmx->guest_pkru = __read_pkru();
+ +              if (vmx->guest_pkru != vmx->host_pkru) {
+ +                      vmx->guest_pkru_valid = true;
+ +                      __write_pkru(vmx->host_pkru);
+ +              } else
+ +                      vmx->guest_pkru_valid = false;
+ +      }
+ +
         /*
          * the KVM_REQ_EVENT optimization bit is only on for one entry, and if
          * we did not inject a still-pending event to L1 now because of
@@@ -10333,7 -10277,7 +10333,7 @@@ static void prepare_vmcs12(struct kvm_v
         vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
         vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
         vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
- -      if (vmx_mpx_supported())
+ +      if (kvm_mpx_supported())
                 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
         if (nested_cpu_has_xsaves(vmcs12))
                 vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
@@@ -10841,26 -10785,13 +10841,26 @@@ static int vmx_update_pi_irte(struct kv
                  */
   
                 kvm_set_msi_irq(e, &irq);
- -              if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+ +              if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ +                      /*
+ +                       * Make sure the IRTE is in remapped mode if
+ +                       * we don't handle it in posted mode.
+ +                       */
+ +                      ret = irq_set_vcpu_affinity(host_irq, NULL);
+ +                      if (ret < 0) {
+ +                              printk(KERN_INFO
+ +                                 "failed to back to remapped mode, irq: %u\n",
+ +                                 host_irq);
+ +                              goto out;
+ +                      }
+ +
                         continue;
+ +              }
   
                 vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
                 vcpu_info.vector = irq.vector;
   
- -              trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+ +              trace_kvm_pi_irte_update(vcpu->vcpu_id, host_irq, e->gsi,
                                 vcpu_info.vector, vcpu_info.pi_desc_addr, set);
   
                 if (set)
@@@ -10930,9 -10861,6 +10930,9 @@@ static struct kvm_x86_ops vmx_x86_ops 
         .cache_reg = vmx_cache_reg,
         .get_rflags = vmx_get_rflags,
         .set_rflags = vmx_set_rflags,
+ +
+ +      .get_pkru = vmx_get_pkru,
+ +
         .fpu_activate = vmx_fpu_activate,
         .fpu_deactivate = vmx_fpu_deactivate,
   
diff --combined arch/x86/kvm/x86.c

index e260ccbc8f5574d8e587ebd7704d051ee873147a,4838d35c9641d6cee63da0e930224d7b1d446e8c..742d0f7d3556e143e219c4ea474e85605be33cde
--- 1/arch/x86/kvm/x86.c
--- 2/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@@ -123,9 -123,6 +123,9 @@@ module_param(tsc_tolerance_ppm, uint, S
   unsigned int __read_mostly lapic_timer_advance_ns = 0;
   module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
   
+ +static bool __read_mostly vector_hashing = true;
+ +module_param(vector_hashing, bool, S_IRUGO);
+ +
   static bool __read_mostly backwards_tsc_observed = false;
   
   #define KVM_NR_SHARED_MSRS 16
@@@ -723,7 -720,7 +723,7 @@@ int kvm_set_cr4(struct kvm_vcpu *vcpu, 
   {
         unsigned long old_cr4 = kvm_read_cr4(vcpu);
         unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- -                                 X86_CR4_SMEP | X86_CR4_SMAP;
+ +                                 X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
   
         if (cr4 & CR4_RESERVED_BITS)
                 return 1;
@@@ -740,9 -737,6 +740,9 @@@
         if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
                 return 1;
   
+ +      if (!guest_cpuid_has_pku(vcpu) && (cr4 & X86_CR4_PKE))
+ +              return 1;
+ +
         if (is_long_mode(vcpu)) {
                 if (!(cr4 & X86_CR4_PAE))
                         return 1;
@@@ -768,7 -762,7 +768,7 @@@
             (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
                 kvm_mmu_reset_context(vcpu);
   
- -      if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+ +      if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                 kvm_update_cpuid(vcpu);
   
         return 0;
@@@ -1202,11 -1196,17 +1202,11 @@@ static void kvm_write_wall_clock(struc
   
   static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
   {
- -      uint32_t quotient, remainder;
- -
- -      /* Don't try to replace with do_div(), this one calculates
- -       * "(dividend << 32) / divisor" */
- -      __asm__ ( "divl %4"
- -                : "=a" (quotient), "=d" (remainder)
- -                : "0" (0), "1" (dividend), "r" (divisor) );
- -      return quotient;
+ +      do_shl32_div32(dividend, divisor);
+ +      return dividend;
   }
   
- -static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
+ +static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
                                s8 *pshift, u32 *pmultiplier)
   {
         uint64_t scaled64;
@@@ -1214,8 -1214,8 +1214,8 @@@
         uint64_t tps64;
         uint32_t tps32;
   
- -      tps64 = base_khz * 1000LL;
- -      scaled64 = scaled_khz * 1000LL;
+ +      tps64 = base_hz;
+ +      scaled64 = scaled_hz;
         while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
                 tps64 >>= 1;
                 shift--;
@@@ -1233,8 -1233,8 +1233,8 @@@
         *pshift = shift;
         *pmultiplier = div_frac(scaled64, tps32);
   
- -      pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
- -               __func__, base_khz, scaled_khz, shift, *pmultiplier);
+ +      pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n",
+ +               __func__, base_hz, scaled_hz, shift, *pmultiplier);
   }
   
   #ifdef CONFIG_X86_64
@@@ -1293,23 -1293,23 +1293,23 @@@ static int set_tsc_khz(struct kvm_vcpu 
         return 0;
   }
   
- -static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
+ +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
   {
         u32 thresh_lo, thresh_hi;
         int use_scaling = 0;
   
         /* tsc_khz can be zero if TSC calibration fails */
- -      if (this_tsc_khz == 0) {
+ +      if (user_tsc_khz == 0) {
                 /* set tsc_scaling_ratio to a safe value */
                 vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
                 return -1;
         }
   
         /* Compute a scale to convert nanoseconds in TSC cycles */
- -      kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
+ +      kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
                            &vcpu->arch.virtual_tsc_shift,
                            &vcpu->arch.virtual_tsc_mult);
- -      vcpu->arch.virtual_tsc_khz = this_tsc_khz;
+ +      vcpu->arch.virtual_tsc_khz = user_tsc_khz;
   
         /*
          * Compute the variation in TSC rate which is acceptable
@@@ -1319,11 -1319,11 +1319,11 @@@
          */
         thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
         thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
- -      if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
- -              pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
+ +      if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
+ +              pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
                 use_scaling = 1;
         }
- -      return set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
+ +      return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
   }
   
   static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
@@@ -1562,7 -1562,7 +1562,7 @@@ static cycle_t read_tsc(void
   
         /*
          * GCC likes to generate cmov here, but this branch is extremely
-        * predictable (it's just a funciton of time and the likely is
+        * predictable (it's just a function of time and the likely is
          * very likely) and there's a data dependence, so force GCC
          * to generate a branch instead.  I don't barrier() because
          * we don't actually need a barrier, and if this function
@@@ -1716,7 -1716,7 +1716,7 @@@ static void kvm_gen_update_masterclock(
   
   static int kvm_guest_time_update(struct kvm_vcpu *v)
   {
- -      unsigned long flags, this_tsc_khz, tgt_tsc_khz;
+ +      unsigned long flags, tgt_tsc_khz;
         struct kvm_vcpu_arch *vcpu = &v->arch;
         struct kvm_arch *ka = &v->kvm->arch;
         s64 kernel_ns;
@@@ -1742,8 -1742,8 +1742,8 @@@
   
         /* Keep irq disabled to prevent changes to the clock */
         local_irq_save(flags);
- -      this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
- -      if (unlikely(this_tsc_khz == 0)) {
+ +      tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+ +      if (unlikely(tgt_tsc_khz == 0)) {
                 local_irq_restore(flags);
                 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
                 return 1;
@@@ -1778,14 -1778,13 +1778,14 @@@
         if (!vcpu->pv_time_enabled)
                 return 0;
   
- -      if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
- -              tgt_tsc_khz = kvm_has_tsc_control ?
- -                      vcpu->virtual_tsc_khz : this_tsc_khz;
- -              kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz,
+ +      if (kvm_has_tsc_control)
+ +              tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
+ +
+ +      if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
+ +              kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
                                    &vcpu->hv_clock.tsc_shift,
                                    &vcpu->hv_clock.tsc_to_system_mul);
- -              vcpu->hw_tsc_khz = this_tsc_khz;
+ +              vcpu->hw_tsc_khz = tgt_tsc_khz;
         }
   
         /* With all the info we got, fill in the values */
@@@ -2988,7 -2987,7 +2988,7 @@@ static int kvm_vcpu_ioctl_x86_set_vcpu_
         kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
   
         if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
- -          kvm_vcpu_has_lapic(vcpu))
+ +          lapic_in_kernel(vcpu))
                 vcpu->arch.apic->sipi_vector = events->sipi_vector;
   
         if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
@@@ -3001,7 -3000,7 +3001,7 @@@
                         vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
                 else
                         vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- -              if (kvm_vcpu_has_lapic(vcpu)) {
+ +              if (lapic_in_kernel(vcpu)) {
                         if (events->smi.latched_init)
                                 set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
                         else
@@@ -3241,7 -3240,7 +3241,7 @@@ long kvm_arch_vcpu_ioctl(struct file *f
         switch (ioctl) {
         case KVM_GET_LAPIC: {
                 r = -EINVAL;
- -              if (!vcpu->arch.apic)
+ +              if (!lapic_in_kernel(vcpu))
                         goto out;
                 u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
   
@@@ -3259,7 -3258,7 +3259,7 @@@
         }
         case KVM_SET_LAPIC: {
                 r = -EINVAL;
- -              if (!vcpu->arch.apic)
+ +              if (!lapic_in_kernel(vcpu))
                         goto out;
                 u.lapic = memdup_user(argp, sizeof(*u.lapic));
                 if (IS_ERR(u.lapic))
@@@ -3606,26 -3605,20 +3606,26 @@@ static int kvm_vm_ioctl_set_irqchip(str
   
   static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
   {
- -      mutex_lock(&kvm->arch.vpit->pit_state.lock);
- -      memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
- -      mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ +      struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
+ +
+ +      BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
+ +
+ +      mutex_lock(&kps->lock);
+ +      memcpy(ps, &kps->channels, sizeof(*ps));
+ +      mutex_unlock(&kps->lock);
         return 0;
   }
   
   static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
   {
         int i;
- -      mutex_lock(&kvm->arch.vpit->pit_state.lock);
- -      memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ +      struct kvm_pit *pit = kvm->arch.vpit;
+ +
+ +      mutex_lock(&pit->pit_state.lock);
+ +      memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
         for (i = 0; i < 3; i++)
- -              kvm_pit_load_count(kvm, i, ps->channels[i].count, 0);
- -      mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ +              kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
+ +      mutex_unlock(&pit->pit_state.lock);
         return 0;
   }
   
@@@ -3645,39 -3638,29 +3645,39 @@@ static int kvm_vm_ioctl_set_pit2(struc
         int start = 0;
         int i;
         u32 prev_legacy, cur_legacy;
- -      mutex_lock(&kvm->arch.vpit->pit_state.lock);
- -      prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
+ +      struct kvm_pit *pit = kvm->arch.vpit;
+ +
+ +      mutex_lock(&pit->pit_state.lock);
+ +      prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
         cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
         if (!prev_legacy && cur_legacy)
                 start = 1;
- -      memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
- -             sizeof(kvm->arch.vpit->pit_state.channels));
- -      kvm->arch.vpit->pit_state.flags = ps->flags;
+ +      memcpy(&pit->pit_state.channels, &ps->channels,
+ +             sizeof(pit->pit_state.channels));
+ +      pit->pit_state.flags = ps->flags;
         for (i = 0; i < 3; i++)
- -              kvm_pit_load_count(kvm, i, kvm->arch.vpit->pit_state.channels[i].count,
+ +              kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
                                    start && i == 0);
- -      mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ +      mutex_unlock(&pit->pit_state.lock);
         return 0;
   }
   
   static int kvm_vm_ioctl_reinject(struct kvm *kvm,
                                  struct kvm_reinject_control *control)
   {
- -      if (!kvm->arch.vpit)
+ +      struct kvm_pit *pit = kvm->arch.vpit;
+ +
+ +      if (!pit)
                 return -ENXIO;
- -      mutex_lock(&kvm->arch.vpit->pit_state.lock);
- -      kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
- -      mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ +
+ +      /* pit->pit_state.lock was overloaded to prevent userspace from getting
+ +       * an inconsistent state after running multiple KVM_REINJECT_CONTROL
+ +       * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
+ +       */
+ +      mutex_lock(&pit->pit_state.lock);
+ +      kvm_pit_set_reinject(pit, control->pit_reinject);
+ +      mutex_unlock(&pit->pit_state.lock);
+ +
         return 0;
   }
   
@@@ -4110,7 -4093,7 +4110,7 @@@ static int vcpu_mmio_write(struct kvm_v
   
         do {
                 n = min(len, 8);
- -              if (!(vcpu->arch.apic &&
+ +              if (!(lapic_in_kernel(vcpu) &&
                       !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
                     && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
                         break;
@@@ -4130,7 -4113,7 +4130,7 @@@ static int vcpu_mmio_read(struct kvm_vc
   
         do {
                 n = min(len, 8);
- -              if (!(vcpu->arch.apic &&
+ +              if (!(lapic_in_kernel(vcpu) &&
                       !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
                                          addr, n, v))
                     && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
@@@ -4329,14 -4312,9 +4329,14 @@@ static int vcpu_mmio_gva_to_gpa(struct 
         u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
                 | (write ? PFERR_WRITE_MASK : 0);
   
+ +      /*
+ +       * currently PKRU is only applied to ept enabled guest so
+ +       * there is no pkey in EPT page table for L1 guest or EPT
+ +       * shadow page table for L2 guest.
+ +       */
         if (vcpu_match_mmio_gva(vcpu, gva)
             && !permission_fault(vcpu, vcpu->arch.walk_mmu,
- -                               vcpu->arch.access, access)) {
+ +                               vcpu->arch.access, 0, access)) {
                 *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
                                         (gva & (PAGE_SIZE - 1));
                 trace_vcpu_match_mmio(gva, *gpa, write, false);
@@@ -4368,7 -4346,7 +4368,7 @@@ int emulator_write_phys(struct kvm_vcp
         ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
         if (ret < 0)
                 return 0;
- -      kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+ +      kvm_page_track_write(vcpu, gpa, val, bytes);
         return 1;
   }
   
@@@ -4626,7 -4604,7 +4626,7 @@@ static int emulator_cmpxchg_emulated(st
                 return X86EMUL_CMPXCHG_FAILED;
   
         kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
- -      kvm_mmu_pte_write(vcpu, gpa, new, bytes);
+ +      kvm_page_track_write(vcpu, gpa, new, bytes);
   
         return X86EMUL_CONTINUE;
   
@@@ -6032,7 -6010,7 +6032,7 @@@ static void update_cr8_intercept(struc
         if (!kvm_x86_ops->update_cr8_intercept)
                 return;
   
- -      if (!vcpu->arch.apic)
+ +      if (!lapic_in_kernel(vcpu))
                 return;
   
         if (vcpu->arch.apicv_active)
@@@ -6596,12 -6574,8 +6596,12 @@@ static int vcpu_enter_guest(struct kvm_
   
         srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
   
- -      /* We should set ->mode before check ->requests,
- -       * see the comment in make_all_cpus_request.
+ +      /*
+ +       * We should set ->mode before check ->requests,
+ +       * Please see the comment in kvm_make_all_cpus_request.
+ +       * This also orders the write to mode from any reads
+ +       * to the page tables done while the VCPU is running.
+ +       * Please see the comment in kvm_flush_remote_tlbs.
          */
         smp_mb__after_srcu_read_unlock();
   
@@@ -7064,7 -7038,7 +7064,7 @@@ int kvm_arch_vcpu_ioctl_get_mpstate(str
   int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
                                     struct kvm_mp_state *mp_state)
   {
- -      if (!kvm_vcpu_has_lapic(vcpu) &&
+ +      if (!lapic_in_kernel(vcpu) &&
             mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                 return -EINVAL;
   
@@@ -7135,7 -7109,7 +7135,7 @@@ int kvm_arch_vcpu_ioctl_set_sregs(struc
   
         mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
- -      if (sregs->cr4 & X86_CR4_OSXSAVE)
+ +      if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE))
                 kvm_update_cpuid(vcpu);
   
         idx = srcu_read_lock(&vcpu->kvm->srcu);
@@@ -7340,7 -7314,7 +7340,7 @@@ void kvm_put_guest_fpu(struct kvm_vcpu 
          * Every 255 times fpu_counter rolls over to 0; a guest that uses
          * the FPU in bursts will revert to loading it on demand.
          */
- -      if (!vcpu->arch.eager_fpu) {
+ +      if (!use_eager_fpu()) {
                 if (++vcpu->fpu_counter < 5)
                         kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
         }
@@@ -7619,7 -7593,6 +7619,7 @@@ bool kvm_vcpu_compatible(struct kvm_vcp
   }
   
   struct static_key kvm_no_apic_vcpu __read_mostly;
+ +EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
   
   int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
   {
@@@ -7751,9 -7724,6 +7751,9 @@@ int kvm_arch_init_vm(struct kvm *kvm, u
         INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
         INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
   
+ +      kvm_page_track_init(kvm);
+ +      kvm_mmu_init_vm(kvm);
+ +
         return 0;
   }
   
@@@ -7880,7 -7850,6 +7880,7 @@@ void kvm_arch_destroy_vm(struct kvm *kv
         kfree(kvm->arch.vioapic);
         kvm_free_vcpus(kvm);
         kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ +      kvm_mmu_uninit_vm(kvm);
   }
   
   void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@@ -7902,8 -7871,6 +7902,8 @@@
                         free->arch.lpage_info[i - 1] = NULL;
                 }
         }
+ +
+ +      kvm_page_track_free_memslot(free, dont);
   }
   
   int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
@@@ -7912,7 -7879,6 +7912,7 @@@
         int i;
   
         for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
+ +              struct kvm_lpage_info *linfo;
                 unsigned long ugfn;
                 int lpages;
                 int level = i + 1;
@@@ -7927,16 -7893,15 +7927,16 @@@
                 if (i == 0)
                         continue;
   
- -              slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
- -                                      sizeof(*slot->arch.lpage_info[i - 1]));
- -              if (!slot->arch.lpage_info[i - 1])
+ +              linfo = kvm_kvzalloc(lpages * sizeof(*linfo));
+ +              if (!linfo)
                         goto out_free;
   
+ +              slot->arch.lpage_info[i - 1] = linfo;
+ +
                 if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
- -                      slot->arch.lpage_info[i - 1][0].write_count = 1;
+ +                      linfo[0].disallow_lpage = 1;
                 if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
- -                      slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
+ +                      linfo[lpages - 1].disallow_lpage = 1;
                 ugfn = slot->userspace_addr >> PAGE_SHIFT;
                 /*
                  * If the gfn and userspace address are not aligned wrt each
@@@ -7948,13 -7913,10 +7948,13 @@@
                         unsigned long j;
   
                         for (j = 0; j < lpages; ++j)
- -                              slot->arch.lpage_info[i - 1][j].write_count = 1;
+ +                              linfo[j].disallow_lpage = 1;
                 }
         }
   
+ +      if (kvm_page_track_create_memslot(slot, npages))
+ +              goto out_free;
+ +
         return 0;
   
   out_free:
@@@ -8408,12 -8370,6 +8408,12 @@@ int kvm_arch_update_irqfd_routing(struc
         return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
   }
   
+ +bool kvm_vector_hashing_enabled(void)
+ +{
+ +      return vector_hashing;
+ +}
+ +EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
+ +
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
   EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --combined arch/x86/mm/mpx.c

index a0a0b9861902657733cf4fefb8d839a1dcb36b05,009679ae5065921b1b6b905c8c5fadb8b1395600..80476878eb4ca5c8ad56340bb52b2423995a7867
--- 1/arch/x86/mm/mpx.c
--- 2/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@@ -546,8 -546,8 +546,8 @@@ static int mpx_resolve_fault(long __use
         int nr_pages = 1;
         int force = 0;
   
- -      gup_ret = get_user_pages(current, current->mm, (unsigned long)addr,
- -                               nr_pages, write, force, NULL, NULL);
+ +      gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
+ +                      force, NULL, NULL);
         /*
          * get_user_pages() returns number of pages gotten.
          * 0 means we failed to fault in and get anything,
@@@ -728,14 -728,14 +728,14 @@@ static inline unsigned long bd_entry_vi
   
         /*
          * This covers 32-bit emulation as well as 32-bit kernels
-        * running on 64-bit harware.
+        * running on 64-bit hardware.
          */
         if (!is_64bit_mm(mm))
                 return (4ULL * GB) / MPX_BD_NR_ENTRIES_32;
   
         /*
          * 'x86_virt_bits' returns what the hardware is capable
-        * of, and returns the full >32-bit adddress space when
+        * of, and returns the full >32-bit address space when
          * running 32-bit kernels on 64-bit hardware.
          */
         virt_space = (1ULL << boot_cpu_data.x86_virt_bits);
diff --combined arch/x86/xen/enlighten.c

index 2379a5a88504e471d2f5b0f5180c39be930c8bb9,8381fb990c7faee0f0faa2aebddaf9fd3bfff6a2..880862c7d9ddba51e1b6964bc80dcf49d6a8b6ff
--- 1/arch/x86/xen/enlighten.c
--- 2/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@@ -32,7 -32,6 +32,7 @@@
   #include <linux/gfp.h>
   #include <linux/memblock.h>
   #include <linux/edd.h>
+ +#include <linux/frame.h>
   
   #ifdef CONFIG_KEXEC_CORE
   #include <linux/kexec.h>
@@@ -352,8 -351,8 +352,8 @@@ static void xen_cpuid(unsigned int *ax
         *cx &= maskecx;
         *cx |= setecx;
         *dx &= maskedx;
- -
   }
+ +STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */
   
   static bool __init xen_check_mwait(void)
   {
@@@ -962,7 -961,7 +962,7 @@@ static void xen_load_sp0(struct tss_str
         tss->x86_tss.sp0 = thread->sp0;
   }
   
- static void xen_set_iopl_mask(unsigned mask)
+ void xen_set_iopl_mask(unsigned mask)
   {
         struct physdev_set_iopl set_iopl;
   
diff --combined arch/x86/xen/xen-head.S

index de93b20fa0d28aaffaf91b327a7ab8a0bc1a70f0,dc6457017dec2ad3e508642be97d07c0b48adb4a..7f8d8abf4c1ab8b1ea114fee0365f1505b81aeea
--- 1/arch/x86/xen/xen-head.S
--- 2/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@@ -26,7 -26,7 +26,7 @@@
                       (1 << XENFEAT_auto_translated_physmap) | \
                       (1 << XENFEAT_supervisor_mode_kernel) | \
                       (1 << XENFEAT_hvm_callback_vector))
- /* The XENFEAT_writable_page_tables is not stricly neccessary as we set that
+ /* The XENFEAT_writable_page_tables is not stricly necessary as we set that
    * up regardless whether this CONFIG option is enabled or not, but it
    * clarifies what the right flags need to be.
    */
@@@ -38,18 -38,13 +38,18 @@@
         __INIT
   ENTRY(startup_xen)
         cld
- -#ifdef CONFIG_X86_32
- -      mov %esi,xen_start_info
- -      mov $init_thread_union+THREAD_SIZE,%esp
- -#else
- -      mov %rsi,xen_start_info
- -      mov $init_thread_union+THREAD_SIZE,%rsp
- -#endif
+ +
+ +      /* Clear .bss */
+ +      xor %eax,%eax
+ +      mov $__bss_start, %_ASM_DI
+ +      mov $__bss_stop, %_ASM_CX
+ +      sub %_ASM_DI, %_ASM_CX
+ +      shr $__ASM_SEL(2, 3), %_ASM_CX
+ +      rep __ASM_SIZE(stos)
+ +
+ +      mov %_ASM_SI, xen_start_info
+ +      mov $init_thread_union+THREAD_SIZE, %_ASM_SP
+ +
         jmp xen_start_kernel
   
         __FINIT
author	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Mar 2016 16:47:32 +0000 (09:47 -0700)
		1	2
arch/x86/include/asm/ftrace.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/amd.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/process_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/vmx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kvm/x86.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/mpx.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/enlighten.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/xen-head.S	patch \|	diff1 \|	diff2 \|	blob \| history