KVM: VMX: Handle single-step #DB for EMULTYPE_SKIP on EPT misconfig
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c030c96fc81a817f6e11e3b1580aa907b8bc63f7..ef98311ad15356741527704d2dfaf790dcd2cef6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -486,6 +486,35 @@ static int hv_remote_flush_tlb(struct kvm *kvm)
        return hv_remote_flush_tlb_with_range(kvm, NULL);
 }
 
+static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+       struct hv_enlightened_vmcs *evmcs;
+       struct hv_partition_assist_pg **p_hv_pa_pg =
+                       &vcpu->kvm->arch.hyperv.hv_pa_pg;
+       /*
+        * Synthetic VM-Exit is not enabled in current code and so all
+        * evmcs in a single VM share the same assist page.
+        */
+       if (!*p_hv_pa_pg) {
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               if (!*p_hv_pa_pg)
+                       return -ENOMEM;
+               pr_debug("KVM: Hyper-V: allocated PA_PG for %llx\n",
+                      (u64)&vcpu->kvm);
+       }
+
+       evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+       evmcs->partition_assist_page =
+               __pa(*p_hv_pa_pg);
+       evmcs->hv_vm_id = (u64)vcpu->kvm;
+       evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+       pr_debug("KVM: Hyper-V: enabled DIRECT flush for %llx\n",
+                (u64)vcpu->kvm);
+       return 0;
+}
+
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
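
For context, a minimal sketch of how the new callback is expected to be reached from common x86 code, assuming a per-vCPU capability (e.g. KVM_CAP_HYPERV_DIRECT_TLBFLUSH) handled in arch/x86/kvm/x86.c; the capability plumbing is an assumption, only the VMX callback above is part of this diff:

/*
 * Sketch only: assumed dispatch from the KVM_ENABLE_CAP handler in
 * arch/x86/kvm/x86.c; the helper and capability names are illustrative.
 */
static int enable_hv_direct_tlbflush(struct kvm_vcpu *vcpu)
{
        if (!kvm_x86_ops->enable_direct_tlbflush)
                return -ENOTTY;

        return kvm_x86_ops->enable_direct_tlbflush(vcpu);
}
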
 
 /*
@@ -1472,17 +1501,32 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
        return 0;
 }
 
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
        unsigned long rip;
 
-       rip = kvm_rip_read(vcpu);
-       rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       kvm_rip_write(vcpu, rip);
+       /*
+        * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+        * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+        * set when EPT misconfig occurs.  In practice, real hardware updates
+        * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+        * (namely Hyper-V) don't set it due to it being undefined behavior,
+        * i.e. we end up advancing IP with some random value.
+        */
+       if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+           to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+               rip = kvm_rip_read(vcpu);
+               rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               kvm_rip_write(vcpu, rip);
+       } else {
+               if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+                       return 0;
+       }
 
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
+
+       return 1;
 }
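
The switch from void to int only pays off if the common skip path checks the result. A hedged sketch of how kvm_skip_emulated_instruction() in arch/x86/kvm/x86.c is expected to consume it, covering the single-step #DB case named in the subject line (helper names such as kvm_vcpu_do_singlestep() are assumptions):

int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);

        /* The vendor callback can now fail, e.g. EMULTYPE_SKIP above. */
        if (unlikely(!kvm_x86_ops->skip_emulated_instruction(vcpu)))
                return 0;

        /* An instruction was skipped with TF set: raise the trap #DB. */
        if (unlikely(rflags & X86_EFLAGS_TF))
                return kvm_vcpu_do_singlestep(vcpu);

        return 1;
}
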
 
 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
@@ -1517,8 +1561,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
                int inc_eip = 0;
                if (kvm_exception_is_soft(nr))
                        inc_eip = vcpu->arch.event_exit_inst_len;
-               if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
                return;
        }
 
@@ -4026,7 +4069,7 @@ static void ept_set_mmio_spte_mask(void)
         * of an EPT paging-structure entry is 110b (write/execute).
         */
        kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
-                                  VMX_EPT_MISCONFIG_WX_VALUE);
+                                  VMX_EPT_MISCONFIG_WX_VALUE, 0);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -4152,6 +4195,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);
 
        if (!init_event) {
@@ -4266,8 +4310,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
                int inc_eip = 0;
                if (vcpu->arch.interrupt.soft)
                        inc_eip = vcpu->arch.event_exit_inst_len;
-               if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
                return;
        }
        intr = irq | INTR_INFO_VALID_MASK;
@@ -4303,8 +4346,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        vmx->loaded_vmcs->nmi_known_unmasked = false;
 
        if (vmx->rmode.vm86_active) {
-               if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
                return;
        }
 
@@ -4431,7 +4473,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
-               if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+               if (kvm_emulate_instruction(vcpu, 0)) {
                        if (vcpu->arch.halt_request) {
                                vcpu->arch.halt_request = 0;
                                return kvm_vcpu_halt(vcpu);
@@ -4482,7 +4524,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
        u32 intr_info, ex_no, error_code;
        unsigned long cr2, rip, dr6;
        u32 vect_info;
-       enum emulation_result er;
 
        vect_info = vmx->idt_vectoring_info;
        intr_info = vmx->exit_intr_info;
@@ -4499,13 +4540,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
        if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
                WARN_ON_ONCE(!enable_vmware_backdoor);
-               er = kvm_emulate_instruction(vcpu,
-                       EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
-               if (er == EMULATE_USER_EXIT)
-                       return 0;
-               else if (er != EMULATE_DONE)
+
+               /*
+                * VMware backdoor emulation on #GP interception only handles
+                * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+                * error code on #GP.
+                */
+               if (error_code) {
                        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-               return 1;
+                       return 1;
+               }
+               return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
        }
 
        /*
@@ -4547,7 +4592,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                        vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        if (is_icebp(intr_info))
-                               skip_emulated_instruction(vcpu);
+                               WARN_ON(!skip_emulated_instruction(vcpu));
 
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
@@ -4602,7 +4647,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
        ++vcpu->stat.io_exits;
 
        if (string)
-               return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+               return kvm_emulate_instruction(vcpu, 0);
 
        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
@@ -4676,7 +4721,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 static int handle_desc(struct kvm_vcpu *vcpu)
 {
        WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_cr(struct kvm_vcpu *vcpu)
@@ -4856,41 +4901,12 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu)
 {
-       u32 ecx = kvm_rcx_read(vcpu);
-       struct msr_data msr_info;
-
-       msr_info.index = ecx;
-       msr_info.host_initiated = false;
-       if (vmx_get_msr(vcpu, &msr_info)) {
-               trace_kvm_msr_read_ex(ecx);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       trace_kvm_msr_read(ecx, msr_info.data);
-
-       kvm_rax_write(vcpu, msr_info.data & -1u);
-       kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u);
-       return kvm_skip_emulated_instruction(vcpu);
+       return kvm_emulate_rdmsr(vcpu);
 }
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
-       struct msr_data msr;
-       u32 ecx = kvm_rcx_read(vcpu);
-       u64 data = kvm_read_edx_eax(vcpu);
-
-       msr.data = data;
-       msr.index = ecx;
-       msr.host_initiated = false;
-       if (kvm_set_msr(vcpu, &msr) != 0) {
-               trace_kvm_msr_write_ex(ecx, data);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       trace_kvm_msr_write(ecx, data);
-       return kvm_skip_emulated_instruction(vcpu);
+       return kvm_emulate_wrmsr(vcpu);
 }
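
The deleted handler bodies are not lost; they are expected to move into common helpers shared with SVM. A sketch of kvm_emulate_rdmsr(), reconstructed from the code removed above (the exact arch/x86/kvm/x86.c version, in particular whether it goes through kvm_get_msr(), may differ):

int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;

        if (kvm_get_msr(vcpu, ecx, &data)) {
                trace_kvm_msr_read_ex(ecx);
                kvm_inject_gp(vcpu, 0);
                return 1;
        }

        trace_kvm_msr_read(ecx, data);

        kvm_rax_write(vcpu, data & -1u);
        kvm_rdx_write(vcpu, (data >> 32) & -1u);
        return kvm_skip_emulated_instruction(vcpu);
}
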
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -4921,7 +4937,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -4988,7 +5004,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
                        return kvm_skip_emulated_instruction(vcpu);
                }
        }
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -5057,23 +5073,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
                       type != INTR_TYPE_EXT_INTR &&
                       type != INTR_TYPE_NMI_INTR))
-               skip_emulated_instruction(vcpu);
-
-       if (kvm_task_switch(vcpu, tss_selector,
-                           type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
-                           has_error_code, error_code) == EMULATE_FAIL) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return 0;
-       }
+               WARN_ON(!skip_emulated_instruction(vcpu));
 
        /*
         * TODO: What about debug traps on tss switch?
         *       Are we supposed to inject them and update dr6?
         */
-
-       return 1;
+       return kvm_task_switch(vcpu, tss_selector,
+                              type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+                              reason, has_error_code, error_code);
 }
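
handle_task_switch() can now return kvm_task_switch()'s result directly because the emulation-failure reporting moves into common code. A hedged sketch of the expected tail of kvm_task_switch() in arch/x86/kvm/x86.c, reconstructed from the error path removed above:

        /* Sketch: kvm_task_switch() now owns the internal-error exit. */
        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                   has_error_code, error_code);
        if (ret) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
                return 0;
        }
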
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu)
@@ -5132,21 +5140,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
-               /*
-                * Doing kvm_skip_emulated_instruction() depends on undefined
-                * behavior: Intel's manual doesn't mandate
-                * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
-                * occurs and while on real hardware it was observed to be set,
-                * other hypervisors (namely Hyper-V) don't set it, we end up
-                * advancing IP with some random value. Disable fast mmio when
-                * running nested and keep it for real hardware in hope that
-                * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
-                */
-               if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
-                       return kvm_skip_emulated_instruction(vcpu);
-               else
-                       return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
-                                                               EMULATE_DONE;
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
        return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -5165,8 +5159,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       enum emulation_result err = EMULATE_DONE;
-       int ret = 1;
        bool intr_window_requested;
        unsigned count = 130;
 
@@ -5187,71 +5179,67 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
                        return 1;
 
-               err = kvm_emulate_instruction(vcpu, 0);
-
-               if (err == EMULATE_USER_EXIT) {
-                       ++vcpu->stat.mmio_exits;
-                       ret = 0;
-                       goto out;
-               }
-
-               if (err != EMULATE_DONE)
-                       goto emulation_error;
+               if (!kvm_emulate_instruction(vcpu, 0))
+                       return 0;
 
                if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending)
-                       goto emulation_error;
+                   vcpu->arch.exception.pending) {
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror =
+                                               KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
+                       return 0;
+               }
 
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
-                       ret = kvm_vcpu_halt(vcpu);
-                       goto out;
+                       return kvm_vcpu_halt(vcpu);
                }
 
+               /*
+                * Note, return 1 and not 0, vcpu_run() is responsible for
+                * morphing the pending signal into the proper return code.
+                */
                if (signal_pending(current))
-                       goto out;
+                       return 1;
+
                if (need_resched())
                        schedule();
        }
 
-out:
-       return ret;
-
-emulation_error:
-       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-       vcpu->run->internal.ndata = 0;
-       return 0;
+       return 1;
 }
 
 static void grow_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int old = vmx->ple_window;
+       unsigned int old = vmx->ple_window;
 
        vmx->ple_window = __grow_ple_window(old, ple_window,
                                            ple_window_grow,
                                            ple_window_max);
 
-       if (vmx->ple_window != old)
+       if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
-
-       trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+               trace_kvm_ple_window_update(vcpu->vcpu_id,
+                                           vmx->ple_window, old);
+       }
 }
 
 static void shrink_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int old = vmx->ple_window;
+       unsigned int old = vmx->ple_window;
 
        vmx->ple_window = __shrink_ple_window(old, ple_window,
                                              ple_window_shrink,
                                              ple_window);
 
-       if (vmx->ple_window != old)
+       if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
-
-       trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+               trace_kvm_ple_window_update(vcpu->vcpu_id,
+                                           vmx->ple_window, old);
+       }
 }
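
Both call sites lean on common __grow_ple_window()/__shrink_ple_window() helpers; a sketch of the assumed grow-side arithmetic, where doing the math in u64 and returning unsigned int is what makes dropping the signed 'int' tracking above safe (the exact clamping is an assumption):

static inline unsigned int __grow_ple_window(unsigned int val,
                unsigned int base, unsigned int modifier, unsigned int max)
{
        u64 ret = val;

        if (modifier < 1)
                return base;

        if (modifier < base)
                ret *= modifier;
        else
                ret += modifier;

        return min(ret, (u64)max);
}
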
 
 /*
@@ -5887,8 +5875,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        else {
                vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                                exit_reason);
-               kvm_queue_exception(vcpu, UD_VECTOR);
-               return 1;
+               dump_vmcs();
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+               vcpu->run->internal.ndata = 1;
+               vcpu->run->internal.data[0] = exit_reason;
+               return 0;
        }
 }
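
Instead of injecting #UD into the guest, an unexpected exit reason now reaches userspace as an internal error. A minimal sketch of how a VMM's run loop might report it (field and suberror names follow the kvm_run ABI in <linux/kvm.h>; the reporting itself is illustrative):

#include <linux/kvm.h>
#include <stdio.h>

/* 'run' is the vCPU's mmap'ed struct kvm_run, checked after KVM_RUN. */
static int report_internal_error(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR)
                return 0;

        fprintf(stderr, "KVM internal error, suberror %u\n",
                run->internal.suberror);
        if (run->internal.suberror == KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON &&
            run->internal.ndata >= 1)
                fprintf(stderr, "  unexpected VM-exit reason 0x%llx\n",
                        (unsigned long long)run->internal.data[0]);
        return -1;
}
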
 
@@ -6522,6 +6515,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
                current_evmcs->hv_clean_fields |=
                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
+       if (static_branch_unlikely(&enable_evmcs))
+               current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
        if (vmx->host_debugctlmsr)
                update_debugctlmsr(vmx->host_debugctlmsr);
@@ -6589,6 +6585,7 @@ static struct kvm *vmx_vm_alloc(void)
 
 static void vmx_vm_free(struct kvm *kvm)
 {
+       kfree(kvm->arch.hyperv.hv_pa_pg);
        vfree(to_kvm_vmx(kvm));
 }
 
@@ -6615,6 +6612,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        unsigned long *msr_bitmap;
        int cpu;
 
+       BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
+               "struct kvm_vcpu must be at offset 0 for arch usercopy region");
+
        vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vmx)
                return ERR_PTR(-ENOMEM);
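
The BUILD_BUG_ON_MSG above guards an assumption made elsewhere: the vcpu slab's usercopy whitelist is expressed as offsets into struct kvm_vcpu, which is only valid if that struct sits at offset 0 of the vendor structure. A sketch of the assumed cache creation in virt/kvm/kvm_main.c (exact arguments and flags may differ):

        /* Sketch of the assumed whitelist setup in kvm_init(). */
        kvm_vcpu_cache = kmem_cache_create_usercopy("kvm_vcpu", vcpu_size,
                                        vcpu_align, SLAB_ACCOUNT,
                                        offsetof(struct kvm_vcpu, arch),
                                        sizeof_field(struct kvm_vcpu, arch),
                                        NULL);
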
@@ -7369,10 +7369,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 * irqbalance to make the interrupts single-CPU.
                 *
                 * We will support full lowest-priority interrupt later.
+                *
+                * In addition, we can only inject generic interrupts using
+                * the PI mechanism; refuse to route others through it.
                 */
 
                kvm_set_msi_irq(kvm, e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+                   !kvm_irq_is_postable(&irq)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
                         * we don't handle it in posted mode.
@@ -7474,6 +7478,11 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
        return false;
 }
 
+static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->nested.vmxon;
+}
+
 static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
@@ -7799,6 +7808,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .nested_enable_evmcs = NULL,
        .nested_get_evmcs_version = NULL,
        .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+       .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
 };
 
 static void vmx_cleanup_l1d_flush(void)
@@ -7835,6 +7845,7 @@ static void vmx_exit(void)
                        if (!vp_ap)
                                continue;
 
+                       vp_ap->nested_control.features.directhypercall = 0;
                        vp_ap->current_nested_vmcs = 0;
                        vp_ap->enlighten_vmentry = 0;
                }
@@ -7874,6 +7885,11 @@ static int __init vmx_init(void)
                        pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
                        static_branch_enable(&enable_evmcs);
                }
+
+               if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+                       vmx_x86_ops.enable_direct_tlbflush
+                               = hv_enable_direct_tlbflush;
+
        } else {
                enlightened_vmcs = false;
        }