KVM: VMX: Handle single-step #DB for EMULTYPE_SKIP on EPT misconfig
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c030c96fc81a817f6e11e3b1580aa907b8bc63f7..ef98311ad15356741527704d2dfaf790dcd2cef6 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -486,6 +486,35 @@ static int hv_remote_flush_tlb(struct kvm *kvm)
        return hv_remote_flush_tlb_with_range(kvm, NULL);
 }
 
+static int hv_enable_direct_tlbflush(struct kvm_vcpu *vcpu)
+{
+       struct hv_enlightened_vmcs *evmcs;
+       struct hv_partition_assist_pg **p_hv_pa_pg =
+                       &vcpu->kvm->arch.hyperv.hv_pa_pg;
+       /*
+        * Synthetic VM-Exit is not enabled in current code and so all
+        * evmcs in a single VM share the same assist page.
+        */
+       if (!*p_hv_pa_pg) {
+               *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+               if (!*p_hv_pa_pg)
+                       return -ENOMEM;
+               pr_debug("KVM: Hyper-V: allocated PA_PG for %llx\n",
+                      (u64)&vcpu->kvm);
+       }
+
+       evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
+
+       evmcs->partition_assist_page =
+               __pa(*p_hv_pa_pg);
+       evmcs->hv_vm_id = (u64)vcpu->kvm;
+       evmcs->hv_enlightenments_control.nested_flush_hypercall = 1;
+
+       pr_debug("KVM: Hyper-V: enabled DIRECT flush for %llx\n",
+                (u64)vcpu->kvm);
+       return 0;
+}
+
 #endif /* IS_ENABLED(CONFIG_HYPERV) */
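
For context, a minimal sketch of how the new callback is expected to be reached from common x86 code, assuming a per-vCPU capability (e.g. KVM_CAP_HYPERV_DIRECT_TLBFLUSH) handled in arch/x86/kvm/x86.c; the capability plumbing is an assumption, only the VMX callback above is part of this diff:

/*
 * Sketch only: assumed dispatch from the KVM_ENABLE_CAP handler in
 * arch/x86/kvm/x86.c; the helper and capability names are illustrative.
 */
static int enable_hv_direct_tlbflush(struct kvm_vcpu *vcpu)
{
        if (!kvm_x86_ops->enable_direct_tlbflush)
                return -ENOTTY;

        return kvm_x86_ops->enable_direct_tlbflush(vcpu);
}
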
 
 /*
@@ -1472,17 +1501,32 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
        return 0;
 }
 
-
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
        unsigned long rip;
 
-       rip = kvm_rip_read(vcpu);
-       rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
-       kvm_rip_write(vcpu, rip);
+       /*
+        * Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
+        * undefined behavior: Intel's SDM doesn't mandate the VMCS field be
+        * set when EPT misconfig occurs.  In practice, real hardware updates
+        * VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
+        * (namely Hyper-V) don't set it due to it being undefined behavior,
+        * i.e. we end up advancing IP with some random value.
+        */
+       if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
+           to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+               rip = kvm_rip_read(vcpu);
+               rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+               kvm_rip_write(vcpu, rip);
+       } else {
+               if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+                       return 0;
+       }
 
        /* skipping an emulated instruction also counts */
        vmx_set_interrupt_shadow(vcpu, 0);
+
+       return 1;
 }
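
The switch from void to int only pays off if the common skip path checks the result. A hedged sketch of how kvm_skip_emulated_instruction() in arch/x86/kvm/x86.c is expected to consume it, covering the single-step #DB case named in the subject line (helper names such as kvm_vcpu_do_singlestep() are assumptions):

int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
        unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);

        /* The vendor callback can now fail, e.g. EMULTYPE_SKIP above. */
        if (unlikely(!kvm_x86_ops->skip_emulated_instruction(vcpu)))
                return 0;

        /* An instruction was skipped with TF set: raise the trap #DB. */
        if (unlikely(rflags & X86_EFLAGS_TF))
                return kvm_vcpu_do_singlestep(vcpu);

        return 1;
}
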
 
 static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
@@ -1517,8 +1561,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu)
                int inc_eip = 0;
                if (kvm_exception_is_soft(nr))
                        inc_eip = vcpu->arch.event_exit_inst_len;
-               if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
                return;
        }
 
@@ -4026,7 +4069,7 @@ static void ept_set_mmio_spte_mask(void)
         * of an EPT paging-structure entry is 110b (write/execute).
         */
        kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
-                                  VMX_EPT_MISCONFIG_WX_VALUE);
+                                  VMX_EPT_MISCONFIG_WX_VALUE, 0);
 }
 
 #define VMX_XSS_EXIT_BITMAP 0
@@ -4152,6 +4195,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        vcpu->arch.microcode_version = 0x100000000ULL;
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
+       vmx->hv_deadline_tsc = -1;
        kvm_set_cr8(vcpu, 0);
 
        if (!init_event) {
@@ -4266,8 +4310,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
                int inc_eip = 0;
                if (vcpu->arch.interrupt.soft)
                        inc_eip = vcpu->arch.event_exit_inst_len;
-               if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
                return;
        }
        intr = irq | INTR_INFO_VALID_MASK;
@@ -4303,8 +4346,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        vmx->loaded_vmcs->nmi_known_unmasked = false;
 
        if (vmx->rmode.vm86_active) {
-               if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
-                       kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+               kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0);
                return;
        }
 
@@ -4431,7 +4473,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
         * Cause the #SS fault with 0 error code in VM86 mode.
         */
        if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) {
-               if (kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE) {
+               if (kvm_emulate_instruction(vcpu, 0)) {
                        if (vcpu->arch.halt_request) {
                                vcpu->arch.halt_request = 0;
                                return kvm_vcpu_halt(vcpu);
@@ -4482,7 +4524,6 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
        u32 intr_info, ex_no, error_code;
        unsigned long cr2, rip, dr6;
        u32 vect_info;
-       enum emulation_result er;
 
        vect_info = vmx->idt_vectoring_info;
        intr_info = vmx->exit_intr_info;
@@ -4499,13 +4540,17 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
 
        if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
                WARN_ON_ONCE(!enable_vmware_backdoor);
-               er = kvm_emulate_instruction(vcpu,
-                       EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
-               if (er == EMULATE_USER_EXIT)
-                       return 0;
-               else if (er != EMULATE_DONE)
+
+               /*
+                * VMware backdoor emulation on #GP interception only handles
+                * IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
+                * error code on #GP.
+                */
+               if (error_code) {
                        kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
-               return 1;
+                       return 1;
+               }
+               return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
        }
 
        /*
@@ -4547,7 +4592,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
                        vcpu->arch.dr6 &= ~DR_TRAP_BITS;
                        vcpu->arch.dr6 |= dr6 | DR6_RTM;
                        if (is_icebp(intr_info))
-                               skip_emulated_instruction(vcpu);
+                               WARN_ON(!skip_emulated_instruction(vcpu));
 
                        kvm_queue_exception(vcpu, DB_VECTOR);
                        return 1;
@@ -4602,7 +4647,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
        ++vcpu->stat.io_exits;
 
        if (string)
-               return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+               return kvm_emulate_instruction(vcpu, 0);
 
        port = exit_qualification >> 16;
        size = (exit_qualification & 7) + 1;
@@ -4676,7 +4721,7 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
 static int handle_desc(struct kvm_vcpu *vcpu)
 {
        WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP));
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_cr(struct kvm_vcpu *vcpu)
@@ -4856,41 +4901,12 @@ static int handle_cpuid(struct kvm_vcpu *vcpu)
 
 static int handle_rdmsr(struct kvm_vcpu *vcpu)
 {
-       u32 ecx = kvm_rcx_read(vcpu);
-       struct msr_data msr_info;
-
-       msr_info.index = ecx;
-       msr_info.host_initiated = false;
-       if (vmx_get_msr(vcpu, &msr_info)) {
-               trace_kvm_msr_read_ex(ecx);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       trace_kvm_msr_read(ecx, msr_info.data);
-
-       kvm_rax_write(vcpu, msr_info.data & -1u);
-       kvm_rdx_write(vcpu, (msr_info.data >> 32) & -1u);
-       return kvm_skip_emulated_instruction(vcpu);
+       return kvm_emulate_rdmsr(vcpu);
 }
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
-       struct msr_data msr;
-       u32 ecx = kvm_rcx_read(vcpu);
-       u64 data = kvm_read_edx_eax(vcpu);
-
-       msr.data = data;
-       msr.index = ecx;
-       msr.host_initiated = false;
-       if (kvm_set_msr(vcpu, &msr) != 0) {
-               trace_kvm_msr_write_ex(ecx, data);
-               kvm_inject_gp(vcpu, 0);
-               return 1;
-       }
-
-       trace_kvm_msr_write(ecx, data);
-       return kvm_skip_emulated_instruction(vcpu);
+       return kvm_emulate_wrmsr(vcpu);
 }
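
The deleted handler bodies are not lost; they are expected to move into common helpers shared with SVM. A sketch of kvm_emulate_rdmsr(), reconstructed from the code removed above (the exact arch/x86/kvm/x86.c version, in particular whether it goes through kvm_get_msr(), may differ):

int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
        u32 ecx = kvm_rcx_read(vcpu);
        u64 data;

        if (kvm_get_msr(vcpu, ecx, &data)) {
                trace_kvm_msr_read_ex(ecx);
                kvm_inject_gp(vcpu, 0);
                return 1;
        }

        trace_kvm_msr_read(ecx, data);

        kvm_rax_write(vcpu, data & -1u);
        kvm_rdx_write(vcpu, (data >> 32) & -1u);
        return kvm_skip_emulated_instruction(vcpu);
}
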
 
 static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
@@ -4921,7 +4937,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu)
 
 static int handle_invd(struct kvm_vcpu *vcpu)
 {
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_invlpg(struct kvm_vcpu *vcpu)
@@ -4988,7 +5004,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu)
                        return kvm_skip_emulated_instruction(vcpu);
                }
        }
-       return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+       return kvm_emulate_instruction(vcpu, 0);
 }
 
 static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
@@ -5057,23 +5073,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
        if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
                       type != INTR_TYPE_EXT_INTR &&
                       type != INTR_TYPE_NMI_INTR))
-               skip_emulated_instruction(vcpu);
-
-       if (kvm_task_switch(vcpu, tss_selector,
-                           type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason,
-                           has_error_code, error_code) == EMULATE_FAIL) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-               vcpu->run->internal.ndata = 0;
-               return 0;
-       }
+               WARN_ON(!skip_emulated_instruction(vcpu));
 
        /*
         * TODO: What about debug traps on tss switch?
         *       Are we supposed to inject them and update dr6?
         */
-
-       return 1;
+       return kvm_task_switch(vcpu, tss_selector,
+                              type == INTR_TYPE_SOFT_INTR ? idt_index : -1,
+                              reason, has_error_code, error_code);
 }
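
handle_task_switch() can now return kvm_task_switch()'s result directly because the emulation-failure reporting moves into common code. A hedged sketch of the expected tail of kvm_task_switch() in arch/x86/kvm/x86.c, reconstructed from the error path removed above:

        /* Sketch: kvm_task_switch() now owns the internal-error exit. */
        ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
                                   has_error_code, error_code);
        if (ret) {
                vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
                vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
                vcpu->run->internal.ndata = 0;
                return 0;
        }
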
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu)
@@ -5132,21 +5140,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        if (!is_guest_mode(vcpu) &&
            !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                trace_kvm_fast_mmio(gpa);
-               /*
-                * Doing kvm_skip_emulated_instruction() depends on undefined
-                * behavior: Intel's manual doesn't mandate
-                * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG
-                * occurs and while on real hardware it was observed to be set,
-                * other hypervisors (namely Hyper-V) don't set it, we end up
-                * advancing IP with some random value. Disable fast mmio when
-                * running nested and keep it for real hardware in hope that
-                * VM_EXIT_INSTRUCTION_LEN will always be set correctly.
-                */
-               if (!static_cpu_has(X86_FEATURE_HYPERVISOR))
-                       return kvm_skip_emulated_instruction(vcpu);
-               else
-                       return kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) ==
-                                                               EMULATE_DONE;
+               return kvm_skip_emulated_instruction(vcpu);
        }
 
        return kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0);
@@ -5165,8 +5159,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
 static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       enum emulation_result err = EMULATE_DONE;
-       int ret = 1;
        bool intr_window_requested;
        unsigned count = 130;
 
@@ -5187,71 +5179,67 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
                if (kvm_test_request(KVM_REQ_EVENT, vcpu))
                        return 1;
 
-               err = kvm_emulate_instruction(vcpu, 0);
-
-               if (err == EMULATE_USER_EXIT) {
-                       ++vcpu->stat.mmio_exits;
-                       ret = 0;
-                       goto out;
-               }
-
-               if (err != EMULATE_DONE)
-                       goto emulation_error;
+               if (!kvm_emulate_instruction(vcpu, 0))
+                       return 0;
 
                if (vmx->emulation_required && !vmx->rmode.vm86_active &&
-                   vcpu->arch.exception.pending)
-                       goto emulation_error;
+                   vcpu->arch.exception.pending) {
+                       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       vcpu->run->internal.suberror =
+                                               KVM_INTERNAL_ERROR_EMULATION;
+                       vcpu->run->internal.ndata = 0;
+                       return 0;
+               }
 
                if (vcpu->arch.halt_request) {
                        vcpu->arch.halt_request = 0;
-                       ret = kvm_vcpu_halt(vcpu);
-                       goto out;
+                       return kvm_vcpu_halt(vcpu);
                }
 
+               /*
+                * Note, return 1 and not 0, vcpu_run() is responsible for
+                * morphing the pending signal into the proper return code.
+                */
                if (signal_pending(current))
-                       goto out;
+                       return 1;
+
                if (need_resched())
                        schedule();
        }
 
-out:
-       return ret;
-
-emulation_error:
-       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-       vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-       vcpu->run->internal.ndata = 0;
-       return 0;
+       return 1;
 }
 
 static void grow_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int old = vmx->ple_window;
+       unsigned int old = vmx->ple_window;
 
        vmx->ple_window = __grow_ple_window(old, ple_window,
                                            ple_window_grow,
                                            ple_window_max);
 
-       if (vmx->ple_window != old)
+       if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
-
-       trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old);
+               trace_kvm_ple_window_update(vcpu->vcpu_id,
+                                           vmx->ple_window, old);
+       }
 }
 
 static void shrink_ple_window(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       int old = vmx->ple_window;
+       unsigned int old = vmx->ple_window;
 
        vmx->ple_window = __shrink_ple_window(old, ple_window,
                                              ple_window_shrink,
                                              ple_window);
 
-       if (vmx->ple_window != old)
+       if (vmx->ple_window != old) {
                vmx->ple_window_dirty = true;
-
-       trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old);
+               trace_kvm_ple_window_update(vcpu->vcpu_id,
+                                           vmx->ple_window, old);
+       }
 }
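
Both call sites lean on common __grow_ple_window()/__shrink_ple_window() helpers; a sketch of the assumed grow-side arithmetic, where doing the math in u64 and returning unsigned int is what makes dropping the signed 'int' tracking above safe (the exact clamping is an assumption):

static inline unsigned int __grow_ple_window(unsigned int val,
                unsigned int base, unsigned int modifier, unsigned int max)
{
        u64 ret = val;

        if (modifier < 1)
                return base;

        if (modifier < base)
                ret *= modifier;
        else
                ret += modifier;

        return min(ret, (u64)max);
}
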
 
 /*
@@ -5887,8 +5875,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
        else {
                vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
                                exit_reason);
-               kvm_queue_exception(vcpu, UD_VECTOR);
-               return 1;
+               dump_vmcs();
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror =
+                       KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+               vcpu->run->internal.ndata = 1;
+               vcpu->run->internal.data[0] = exit_reason;
+               return 0;
        }
 }
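
Instead of injecting #UD into the guest, an unexpected exit reason now reaches userspace as an internal error. A minimal sketch of how a VMM's run loop might report it (field and suberror names follow the kvm_run ABI in <linux/kvm.h>; the reporting itself is illustrative):

#include <linux/kvm.h>
#include <stdio.h>

/* 'run' is the vCPU's mmap'ed struct kvm_run, checked after KVM_RUN. */
static int report_internal_error(struct kvm_run *run)
{
        if (run->exit_reason != KVM_EXIT_INTERNAL_ERROR)
                return 0;

        fprintf(stderr, "KVM internal error, suberror %u\n",
                run->internal.suberror);
        if (run->internal.suberror == KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON &&
            run->internal.ndata >= 1)
                fprintf(stderr, "  unexpected VM-exit reason 0x%llx\n",
                        (unsigned long long)run->internal.data[0]);
        return -1;
}
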
 
@@ -6522,6 +6515,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
                current_evmcs->hv_clean_fields |=
                        HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
 
+       if (static_branch_unlikely(&enable_evmcs))
+               current_evmcs->hv_vp_id = vcpu->arch.hyperv.vp_index;
+
        /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
        if (vmx->host_debugctlmsr)
                update_debugctlmsr(vmx->host_debugctlmsr);
@@ -6589,6 +6585,7 @@ static struct kvm *vmx_vm_alloc(void)
 
 static void vmx_vm_free(struct kvm *kvm)
 {
+       kfree(kvm->arch.hyperv.hv_pa_pg);
        vfree(to_kvm_vmx(kvm));
 }
 
@@ -6615,6 +6612,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        unsigned long *msr_bitmap;
        int cpu;
 
+       BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
+               "struct kvm_vcpu must be at offset 0 for arch usercopy region");
+
        vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
        if (!vmx)
                return ERR_PTR(-ENOMEM);
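
The BUILD_BUG_ON_MSG above guards an assumption made elsewhere: the vcpu slab's usercopy whitelist is expressed as offsets into struct kvm_vcpu, which is only valid if that struct sits at offset 0 of the vendor structure. A sketch of the assumed cache creation in virt/kvm/kvm_main.c (exact arguments and flags may differ):

        /* Sketch of the assumed whitelist setup in kvm_init(). */
        kvm_vcpu_cache = kmem_cache_create_usercopy("kvm_vcpu", vcpu_size,
                                        vcpu_align, SLAB_ACCOUNT,
                                        offsetof(struct kvm_vcpu, arch),
                                        sizeof_field(struct kvm_vcpu, arch),
                                        NULL);
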
@@ -7369,10 +7369,14 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
                 * irqbalance to make the interrupts single-CPU.
                 *
                 * We will support full lowest-priority interrupt later.
+                *
+                * In addition, we can only inject generic interrupts using
+                * the PI mechanism; refuse to route others through it.
                 */
 
                kvm_set_msi_irq(kvm, e, &irq);
-               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+                   !kvm_irq_is_postable(&irq)) {
                        /*
                         * Make sure the IRTE is in remapped mode if
                         * we don't handle it in posted mode.
@@ -7474,6 +7478,11 @@ static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
        return false;
 }
 
+static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+       return to_vmx(vcpu)->nested.vmxon;
+}
+
 static __init int hardware_setup(void)
 {
        unsigned long host_bndcfgs;
@@ -7799,6 +7808,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
        .nested_enable_evmcs = NULL,
        .nested_get_evmcs_version = NULL,
        .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
+       .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
 };
 
 static void vmx_cleanup_l1d_flush(void)
@@ -7835,6 +7845,7 @@ static void vmx_exit(void)
                        if (!vp_ap)
                                continue;
 
+                       vp_ap->nested_control.features.directhypercall = 0;
                        vp_ap->current_nested_vmcs = 0;
                        vp_ap->enlighten_vmentry = 0;
                }
@@ -7874,6 +7885,11 @@ static int __init vmx_init(void)
                        pr_info("KVM: vmx: using Hyper-V Enlightened VMCS\n");
                        static_branch_enable(&enable_evmcs);
                }
+
+               if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
+                       vmx_x86_ops.enable_direct_tlbflush
+                               = hv_enable_direct_tlbflush;
+
        } else {
                enlightened_vmcs = false;
        }