]> asedeno.scripts.mit.edu Git - linux.git/blob - arch/x86/kvm/svm.c
Merge tag 'acpi-5.2-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafael...
[linux.git] / arch / x86 / kvm / svm.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * AMD SVM support
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8  *
9  * Authors:
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *   Avi Kivity   <avi@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #define pr_fmt(fmt) "SVM: " fmt
19
20 #include <linux/kvm_host.h>
21
22 #include "irq.h"
23 #include "mmu.h"
24 #include "kvm_cache_regs.h"
25 #include "x86.h"
26 #include "cpuid.h"
27 #include "pmu.h"
28
29 #include <linux/module.h>
30 #include <linux/mod_devicetable.h>
31 #include <linux/kernel.h>
32 #include <linux/vmalloc.h>
33 #include <linux/highmem.h>
34 #include <linux/sched.h>
35 #include <linux/trace_events.h>
36 #include <linux/slab.h>
37 #include <linux/amd-iommu.h>
38 #include <linux/hashtable.h>
39 #include <linux/frame.h>
40 #include <linux/psp-sev.h>
41 #include <linux/file.h>
42 #include <linux/pagemap.h>
43 #include <linux/swap.h>
44
45 #include <asm/apic.h>
46 #include <asm/perf_event.h>
47 #include <asm/tlbflush.h>
48 #include <asm/desc.h>
49 #include <asm/debugreg.h>
50 #include <asm/kvm_para.h>
51 #include <asm/irq_remapping.h>
52 #include <asm/spec-ctrl.h>
53
54 #include <asm/virtext.h>
55 #include "trace.h"
56
57 #define __ex(x) __kvm_handle_fault_on_reboot(x)
58
59 MODULE_AUTHOR("Qumranet");
60 MODULE_LICENSE("GPL");
61
/*
 * CPU match table for this module: one entry matching X86_FEATURE_SVM
 * followed by the empty terminator entry.  Exported via
 * MODULE_DEVICE_TABLE() under the x86cpu table type.
 */
static const struct x86_cpu_id svm_cpu_id[] = {
        X86_FEATURE_MATCH(X86_FEATURE_SVM),
        {}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
67
/*
 * NOTE(review): presumably page-allocation orders for the I/O and MSR
 * permission maps -- confirm at the allocation sites.
 */
#define IOPM_ALLOC_ORDER 2
#define MSRPM_ALLOC_ORDER 1

/* Segment descriptor type values. */
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

/*
 * SVM feature flags, one bit each.
 * NOTE(review): presumably these mirror the CPUID Fn8000_000A_EDX feature
 * bits -- confirm against the AMD APM.
 */
#define SVM_FEATURE_NPT            (1 <<  0)
#define SVM_FEATURE_LBRV           (1 <<  1)
#define SVM_FEATURE_SVML           (1 <<  2)
#define SVM_FEATURE_NRIP           (1 <<  3)
#define SVM_FEATURE_TSC_RATE       (1 <<  4)
#define SVM_FEATURE_VMCB_CLEAN     (1 <<  5)
#define SVM_FEATURE_FLUSH_ASID     (1 <<  6)
#define SVM_FEATURE_DECODE_ASSIST  (1 <<  7)
#define SVM_FEATURE_PAUSE_FILTER   (1 << 10)

/* NOTE(review): presumably the MSR index of the AVIC doorbell -- confirm. */
#define SVM_AVIC_DOORBELL       0xc001011b

/* Verdicts for handling an exit while a nested guest is running. */
#define NESTED_EXIT_HOST        0       /* Exit handled on host level */
#define NESTED_EXIT_DONE        1       /* Exit caused nested vmexit  */
#define NESTED_EXIT_CONTINUE    2       /* Further checks needed      */

/* Every bit above the low six is treated as reserved. */
#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))

/* TSC ratio limits: the upper 24 bits are reserved, the low 40 bits hold the value. */
#define TSC_RATIO_RSVD          0xffffff0000000000ULL
#define TSC_RATIO_MIN           0x0000000000000001ULL
#define TSC_RATIO_MAX           0x000000ffffffffffULL

/* Clears bits 63:52 and the low 12 bits, keeping bits 51:12. */
#define AVIC_HPA_MASK   ~((0xFFFULL << 52) | 0xFFF)

/*
 * 0xff is broadcast, so the max index allowed for physical APIC ID
 * table is 0xfe.  APIC IDs above 0xff are reserved.
 */
#define AVIC_MAX_PHYSICAL_ID_COUNT      255

/*
 * Field masks for unaccelerated AVIC accesses.
 * NOTE(review): presumably applied to the exit information of such an
 * access -- confirm at the users.
 */
#define AVIC_UNACCEL_ACCESS_WRITE_MASK          1
#define AVIC_UNACCEL_ACCESS_OFFSET_MASK         0xFF0
#define AVIC_UNACCEL_ACCESS_VECTOR_MASK         0xFFFFFFFF
107
/*
 * AVIC GATAG is encoded using VM and VCPU IDs: the VCPU ID lives in the
 * low AVIC_VCPU_ID_BITS bits and the VM ID in the AVIC_VM_ID_BITS bits
 * directly above it.
 */
#define AVIC_VCPU_ID_BITS               8
#define AVIC_VCPU_ID_MASK               ((1 << AVIC_VCPU_ID_BITS) - 1)

#define AVIC_VM_ID_BITS                 24
#define AVIC_VM_ID_NR                   (1 << AVIC_VM_ID_BITS)
#define AVIC_VM_ID_MASK                 ((1 << AVIC_VM_ID_BITS) - 1)

/*
 * Every macro argument is parenthesized so that an expression argument
 * (e.g. "a | b") is evaluated as a whole before being masked or shifted.
 *
 * AVIC_GATAG(x, y):        build a tag from VM ID x and VCPU ID y.
 * AVIC_GATAG_TO_VMID(x):   extract the VM ID from tag x.
 * AVIC_GATAG_TO_VCPUID(x): extract the VCPU ID from tag x.
 */
#define AVIC_GATAG(x, y)                ((((x) & AVIC_VM_ID_MASK) << AVIC_VCPU_ID_BITS) | \
                                                ((y) & AVIC_VCPU_ID_MASK))
#define AVIC_GATAG_TO_VMID(x)           (((x) >> AVIC_VCPU_ID_BITS) & AVIC_VM_ID_MASK)
#define AVIC_GATAG_TO_VCPUID(x)         ((x) & AVIC_VCPU_ID_MASK)
120
121 static bool erratum_383_found __read_mostly;
122
/*
 * MSR indices whose host values are kept per vcpu; the array length
 * (NR_HOST_SAVE_USER_MSRS) sizes vcpu_svm.host_user_msrs[].  The 64-bit
 * syscall/segment-base MSRs are only listed on CONFIG_X86_64 builds.
 */
static const u32 host_save_user_msrs[] = {
#ifdef CONFIG_X86_64
        MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
        MSR_FS_BASE,
#endif
        MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
        MSR_TSC_AUX,
};

#define NR_HOST_SAVE_USER_MSRS ARRAY_SIZE(host_save_user_msrs)
133
/* Per-VM SEV state; embedded in struct kvm_svm as 'sev_info'. */
struct kvm_sev_info {
        bool active;            /* SEV enabled guest */
        unsigned int asid;      /* ASID used for this guest */
        unsigned int handle;    /* SEV firmware handle */
        int fd;                 /* SEV device fd */
        unsigned long pages_locked; /* Number of pages locked */
        struct list_head regions_list;  /* List of registered regions */
};
142
/*
 * SVM-specific VM container.  The generic struct kvm is embedded as the
 * 'kvm' member, which lets to_kvm_svm() recover this structure from a
 * struct kvm pointer via container_of().
 */
struct kvm_svm {
        struct kvm kvm;

        /* Struct members for AVIC */
        u32 avic_vm_id;
        struct page *avic_logical_id_table_page;
        struct page *avic_physical_id_table_page;
        struct hlist_node hnode;

        /* SEV state of this VM */
        struct kvm_sev_info sev_info;
};
154
155 struct kvm_vcpu;
156
/*
 * Nested-virtualization state of a vcpu; embedded in struct vcpu_svm as
 * 'nested'.
 */
struct nested_state {
        /* VMCB holding the host-side control state while in guest mode */
        struct vmcb *hsave;
        u64 hsave_msr;
        u64 vm_cr_msr;
        u64 vmcb;

        /* These are the merged vectors */
        u32 *msrpm;

        /* gpa pointers to the real vectors */
        u64 vmcb_msrpm;
        u64 vmcb_iopm;

        /* A VMEXIT is required but not yet emulated */
        bool exit_required;

        /* cache for intercepts of the guest */
        u32 intercept_cr;
        u32 intercept_dr;
        u32 intercept_exceptions;
        u64 intercept;

        /* Nested Paging related state */
        u64 nested_cr3;
};
182
183 #define MSRPM_OFFSETS   16
184 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
185
186 /*
187  * Set osvw_len to higher value when updated Revision Guides
188  * are published and we know what the new status bits are
189  */
190 static uint64_t osvw_len = 4, osvw_status;
191
/*
 * SVM-specific vcpu container.  The generic struct kvm_vcpu is embedded as
 * the 'vcpu' member, which lets to_svm() recover this structure from a
 * struct kvm_vcpu pointer via container_of().
 */
struct vcpu_svm {
        struct kvm_vcpu vcpu;
        struct vmcb *vmcb;
        unsigned long vmcb_pa;
        struct svm_cpu_data *svm_data;
        uint64_t asid_generation;
        uint64_t sysenter_esp;
        uint64_t sysenter_eip;
        uint64_t tsc_aux;

        u64 msr_decfg;

        u64 next_rip;

        /* One slot per entry of host_save_user_msrs[] */
        u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
        struct {
                u16 fs;
                u16 gs;
                u16 ldt;
                u64 gs_base;
        } host;

        u64 spec_ctrl;
        /*
         * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be
         * translated into the appropriate L2_CFG bits on the host to
         * perform speculative control.
         */
        u64 virt_spec_ctrl;

        u32 *msrpm;

        ulong nmi_iret_rip;

        struct nested_state nested;

        bool nmi_singlestep;
        u64 nmi_singlestep_guest_rflags;

        /*
         * Set by svm_queue_exception() when it advances RIP itself for a
         * #BP injection: the number of bytes skipped and the linear
         * address (RIP + CS base) after the skip.
         */
        unsigned int3_injected;
        unsigned long int3_rip;

        /* cached guest cpuid flags for faster access */
        bool nrips_enabled      : 1;

        u32 ldr_reg;
        u32 dfr_reg;
        struct page *avic_backing_page;
        /* Entry read by avic_vcpu_is_running(); may be NULL */
        u64 *avic_physical_id_cache;
        bool avic_is_running;

        /*
         * Per-vcpu list of struct amd_svm_iommu_ir:
         * This is used mainly to store interrupt remapping information used
         * when updating the vcpu affinity. This avoids the need to scan for
         * IRTE and try to match ga_tag in the IOMMU driver.
         */
        struct list_head ir_list;
        spinlock_t ir_list_lock;

        /* which host CPU was used for running this vcpu */
        unsigned int last_cpu;
};
255
/*
 * List node wrapping an opaque pointer to the IOMMU driver's interrupt
 * remapping data, so that it can be linked into vcpu_svm.ir_list.
 * NOTE(review): this header previously said "struct amd_iommu_ir_data"
 * while the field comment says "struct amd_ir_data" -- confirm which name
 * the IOMMU driver actually uses.
 */
struct amd_svm_iommu_ir {
        struct list_head node;  /* Used by SVM for per-vcpu ir_list */
        void *data;             /* Storing pointer to struct amd_ir_data */
};
263
/* Field layout of a (32-bit) AVIC logical APIC ID table entry. */
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK    (0xFF)
#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT                 31
/*
 * Unsigned constant: shifting a signed 1 into bit 31 is undefined
 * behaviour and would sign-extend when widened to 64 bits.
 */
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK                (1U << 31)

/* Field layout of a (64-bit) AVIC physical APIC ID table entry. */
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK    (0xFFULL)
#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK        (0xFFFFFFFFFFULL << 12)
#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK          (1ULL << 62)
#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK               (1ULL << 63)
272
273 static DEFINE_PER_CPU(u64, current_tsc_ratio);
274 #define TSC_RATIO_DEFAULT       0x0100000000ULL
275
276 #define MSR_INVALID                     0xffffffffU
277
/*
 * Table of MSRs that get special treatment in the MSR permission map.
 * The last entry uses MSR_INVALID as its index (presumably the
 * end-of-table sentinel -- confirm at the iteration sites).
 *
 * NOTE(review): the table name suggests these MSRs are candidates for
 * direct (non-intercepted) guest access, which sits oddly with the
 * "intercept is always on" wording of the .always comment below -- confirm
 * against the code that consumes .always.
 */
static const struct svm_direct_access_msrs {
        u32 index;   /* Index of the MSR */
        bool always; /* True if intercept is always on */
} direct_access_msrs[] = {
        { .index = MSR_STAR,                            .always = true  },
        { .index = MSR_IA32_SYSENTER_CS,                .always = true  },
#ifdef CONFIG_X86_64
        { .index = MSR_GS_BASE,                         .always = true  },
        { .index = MSR_FS_BASE,                         .always = true  },
        { .index = MSR_KERNEL_GS_BASE,                  .always = true  },
        { .index = MSR_LSTAR,                           .always = true  },
        { .index = MSR_CSTAR,                           .always = true  },
        { .index = MSR_SYSCALL_MASK,                    .always = true  },
#endif
        { .index = MSR_IA32_SPEC_CTRL,                  .always = false },
        { .index = MSR_IA32_PRED_CMD,                   .always = false },
        { .index = MSR_IA32_LASTBRANCHFROMIP,           .always = false },
        { .index = MSR_IA32_LASTBRANCHTOIP,             .always = false },
        { .index = MSR_IA32_LASTINTFROMIP,              .always = false },
        { .index = MSR_IA32_LASTINTTOIP,                .always = false },
        { .index = MSR_INVALID,                         .always = false },
};
300
301 /* enable NPT for AMD64 and X86 with PAE */
302 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
303 static bool npt_enabled = true;
304 #else
305 static bool npt_enabled;
306 #endif
307
/*
 * These 2 parameters are used to configure the controls for Pause-Loop
 * Exiting:
 * pause_filter_count: On processors that support pause filtering (indicated
 *      by CPUID Fn8000_000A_EDX), the VMCB provides a 16-bit pause filter
 *      count value. On VMRUN this value is loaded into an internal counter.
 *      Each time a pause instruction is executed, this counter is decremented
 *      until it reaches zero, at which time a #VMEXIT is generated if pause
 *      intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *      Intercept Filtering for more details.
 *      This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *      pause filtering (indicated by CPUID Fn8000_000A_EDX), which places an
 *      upper bound on the amount of time a guest is allowed to execute in a
 *      pause loop. In this mode, a 16-bit pause filter threshold field is
 *      added in the VMCB. The threshold value is a cycle count that is used
 *      to reset the pause counter. As with simple pause filtering, VMRUN
 *      loads the pause count value from the VMCB into an internal counter.
 *      Then, on each pause instruction the hardware checks the elapsed number
 *      of cycles since the most recent pause instruction against the pause
 *      filter threshold. If the elapsed cycle count is greater than the pause
 *      filter threshold, then the internal pause count is reloaded from the
 *      VMCB and execution continues. If the elapsed cycle count is less than
 *      the pause filter threshold, then the internal pause count is
 *      decremented. If the count value is less than zero and PAUSE intercept
 *      is enabled, a #VMEXIT is triggered. If advanced pause filtering is
 *      supported and the pause filter threshold field is set to zero, the
 *      filter will operate in the simpler, count-only mode.
 */
337
/* Pause filter threshold; read-only (0444) module parameter. */
static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

/* Pause filter count; read-only (0444) module parameter. */
static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);
355
356 /* allow nested paging (virtualized MMU) for all guests */
357 static int npt = true;
358 module_param(npt, int, S_IRUGO);
359
360 /* allow nested virtualization in KVM/SVM */
361 static int nested = true;
362 module_param(nested, int, S_IRUGO);
363
364 /* enable / disable AVIC */
365 static int avic;
366 #ifdef CONFIG_X86_LOCAL_APIC
367 module_param(avic, int, S_IRUGO);
368 #endif
369
370 /* enable/disable Virtual VMLOAD VMSAVE */
371 static int vls = true;
372 module_param(vls, int, 0444);
373
374 /* enable/disable Virtual GIF */
375 static int vgif = true;
376 module_param(vgif, int, 0444);
377
378 /* enable/disable SEV support */
379 static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
380 module_param(sev, int, 0444);
381
382 static u8 rsm_ins_bytes[] = "\x0f\xaa";
383
384 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
385 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa);
386 static void svm_complete_interrupts(struct vcpu_svm *svm);
387
388 static int nested_svm_exit_handled(struct vcpu_svm *svm);
389 static int nested_svm_intercept(struct vcpu_svm *svm);
390 static int nested_svm_vmexit(struct vcpu_svm *svm);
391 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
392                                       bool has_error_code, u32 error_code);
393
/*
 * Bit positions within vmcb->control.clean.  mark_dirty() clears the bit
 * for one of these groups and mark_all_clean() sets all bits below
 * VMCB_DIRTY_MAX (minus the always-dirty ones).  The comment next to each
 * value lists the VMCB fields covered by that bit.
 */
enum {
        VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
                            pause filter count */
        VMCB_PERM_MAP,   /* IOPM Base and MSRPM Base */
        VMCB_ASID,       /* ASID */
        VMCB_INTR,       /* int_ctl, int_vector */
        VMCB_NPT,        /* npt_en, nCR3, gPAT */
        VMCB_CR,         /* CR0, CR3, CR4, EFER */
        VMCB_DR,         /* DR6, DR7 */
        VMCB_DT,         /* GDT, IDT */
        VMCB_SEG,        /* CS, DS, SS, ES, CPL */
        VMCB_CR2,        /* CR2 only */
        VMCB_LBR,        /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
        VMCB_AVIC,       /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE,
                          * AVIC PHYSICAL_TABLE pointer,
                          * AVIC LOGICAL_TABLE pointer
                          */
        VMCB_DIRTY_MAX,  /* Number of clean bits; not a bit itself */
};
413
414 /* TPR and CR2 are always written before VMRUN */
415 #define VMCB_ALWAYS_DIRTY_MASK  ((1U << VMCB_INTR) | (1U << VMCB_CR2))
416
417 #define VMCB_AVIC_APIC_BAR_MASK         0xFFFFFFFFFF000ULL
418
419 static unsigned int max_sev_asid;
420 static unsigned int min_sev_asid;
421 static unsigned long *sev_asid_bitmap;
422 #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT)
423
/*
 * Descriptor of one memory region tracked on a list.
 * NOTE(review): presumably linked into kvm_sev_info.regions_list for
 * registered SEV regions -- confirm at the users.
 */
struct enc_region {
        struct list_head list;  /* list linkage */
        unsigned long npages;   /* number of entries in 'pages' (presumably) */
        struct page **pages;
        unsigned long uaddr;
        unsigned long size;
};
431
432
433 static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm)
434 {
435         return container_of(kvm, struct kvm_svm, kvm);
436 }
437
438 static inline bool svm_sev_enabled(void)
439 {
440         return IS_ENABLED(CONFIG_KVM_AMD_SEV) ? max_sev_asid : 0;
441 }
442
443 static inline bool sev_guest(struct kvm *kvm)
444 {
445 #ifdef CONFIG_KVM_AMD_SEV
446         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
447
448         return sev->active;
449 #else
450         return false;
451 #endif
452 }
453
454 static inline int sev_get_asid(struct kvm *kvm)
455 {
456         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
457
458         return sev->asid;
459 }
460
461 static inline void mark_all_dirty(struct vmcb *vmcb)
462 {
463         vmcb->control.clean = 0;
464 }
465
466 static inline void mark_all_clean(struct vmcb *vmcb)
467 {
468         vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
469                                & ~VMCB_ALWAYS_DIRTY_MASK;
470 }
471
472 static inline void mark_dirty(struct vmcb *vmcb, int bit)
473 {
474         vmcb->control.clean &= ~(1 << bit);
475 }
476
477 static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
478 {
479         return container_of(vcpu, struct vcpu_svm, vcpu);
480 }
481
482 static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data)
483 {
484         svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK;
485         mark_dirty(svm->vmcb, VMCB_AVIC);
486 }
487
488 static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
489 {
490         struct vcpu_svm *svm = to_svm(vcpu);
491         u64 *entry = svm->avic_physical_id_cache;
492
493         if (!entry)
494                 return false;
495
496         return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
497 }
498
499 static void recalc_intercepts(struct vcpu_svm *svm)
500 {
501         struct vmcb_control_area *c, *h;
502         struct nested_state *g;
503
504         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
505
506         if (!is_guest_mode(&svm->vcpu))
507                 return;
508
509         c = &svm->vmcb->control;
510         h = &svm->nested.hsave->control;
511         g = &svm->nested;
512
513         c->intercept_cr = h->intercept_cr | g->intercept_cr;
514         c->intercept_dr = h->intercept_dr | g->intercept_dr;
515         c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
516         c->intercept = h->intercept | g->intercept;
517 }
518
519 static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
520 {
521         if (is_guest_mode(&svm->vcpu))
522                 return svm->nested.hsave;
523         else
524                 return svm->vmcb;
525 }
526
527 static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
528 {
529         struct vmcb *vmcb = get_host_vmcb(svm);
530
531         vmcb->control.intercept_cr |= (1U << bit);
532
533         recalc_intercepts(svm);
534 }
535
536 static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
537 {
538         struct vmcb *vmcb = get_host_vmcb(svm);
539
540         vmcb->control.intercept_cr &= ~(1U << bit);
541
542         recalc_intercepts(svm);
543 }
544
545 static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
546 {
547         struct vmcb *vmcb = get_host_vmcb(svm);
548
549         return vmcb->control.intercept_cr & (1U << bit);
550 }
551
552 static inline void set_dr_intercepts(struct vcpu_svm *svm)
553 {
554         struct vmcb *vmcb = get_host_vmcb(svm);
555
556         vmcb->control.intercept_dr = (1 << INTERCEPT_DR0_READ)
557                 | (1 << INTERCEPT_DR1_READ)
558                 | (1 << INTERCEPT_DR2_READ)
559                 | (1 << INTERCEPT_DR3_READ)
560                 | (1 << INTERCEPT_DR4_READ)
561                 | (1 << INTERCEPT_DR5_READ)
562                 | (1 << INTERCEPT_DR6_READ)
563                 | (1 << INTERCEPT_DR7_READ)
564                 | (1 << INTERCEPT_DR0_WRITE)
565                 | (1 << INTERCEPT_DR1_WRITE)
566                 | (1 << INTERCEPT_DR2_WRITE)
567                 | (1 << INTERCEPT_DR3_WRITE)
568                 | (1 << INTERCEPT_DR4_WRITE)
569                 | (1 << INTERCEPT_DR5_WRITE)
570                 | (1 << INTERCEPT_DR6_WRITE)
571                 | (1 << INTERCEPT_DR7_WRITE);
572
573         recalc_intercepts(svm);
574 }
575
576 static inline void clr_dr_intercepts(struct vcpu_svm *svm)
577 {
578         struct vmcb *vmcb = get_host_vmcb(svm);
579
580         vmcb->control.intercept_dr = 0;
581
582         recalc_intercepts(svm);
583 }
584
585 static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
586 {
587         struct vmcb *vmcb = get_host_vmcb(svm);
588
589         vmcb->control.intercept_exceptions |= (1U << bit);
590
591         recalc_intercepts(svm);
592 }
593
594 static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
595 {
596         struct vmcb *vmcb = get_host_vmcb(svm);
597
598         vmcb->control.intercept_exceptions &= ~(1U << bit);
599
600         recalc_intercepts(svm);
601 }
602
603 static inline void set_intercept(struct vcpu_svm *svm, int bit)
604 {
605         struct vmcb *vmcb = get_host_vmcb(svm);
606
607         vmcb->control.intercept |= (1ULL << bit);
608
609         recalc_intercepts(svm);
610 }
611
612 static inline void clr_intercept(struct vcpu_svm *svm, int bit)
613 {
614         struct vmcb *vmcb = get_host_vmcb(svm);
615
616         vmcb->control.intercept &= ~(1ULL << bit);
617
618         recalc_intercepts(svm);
619 }
620
621 static inline bool vgif_enabled(struct vcpu_svm *svm)
622 {
623         return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
624 }
625
626 static inline void enable_gif(struct vcpu_svm *svm)
627 {
628         if (vgif_enabled(svm))
629                 svm->vmcb->control.int_ctl |= V_GIF_MASK;
630         else
631                 svm->vcpu.arch.hflags |= HF_GIF_MASK;
632 }
633
634 static inline void disable_gif(struct vcpu_svm *svm)
635 {
636         if (vgif_enabled(svm))
637                 svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
638         else
639                 svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
640 }
641
642 static inline bool gif_set(struct vcpu_svm *svm)
643 {
644         if (vgif_enabled(svm))
645                 return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
646         else
647                 return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
648 }
649
650 static unsigned long iopm_base;
651
/*
 * Packed layout of an LDT/TSS descriptor, with the limit and base split
 * across several fields.  svm_hardware_enable() points
 * svm_cpu_data.tss_desc at the GDT_ENTRY_TSS slot of the current GDT
 * using this type.
 */
struct kvm_ldttss_desc {
        u16 limit0;
        u16 base0;
        unsigned base1:8, type:5, dpl:2, p:1;
        unsigned limit1:4, zero0:3, g:1, base2:8;
        u32 base3;
        u32 zero1;
} __attribute__((packed));
660
/*
 * Per-CPU SVM bookkeeping; a pointer to one instance is kept for each CPU
 * in the per-cpu variable svm_data.
 */
struct svm_cpu_data {
        int cpu;

        /* ASID allocator state, (re)initialized by svm_hardware_enable() */
        u64 asid_generation;
        u32 max_asid;
        u32 next_asid;
        u32 min_asid;
        /* Points at the GDT_ENTRY_TSS slot of this CPU's GDT */
        struct kvm_ldttss_desc *tss_desc;

        /* Page whose physical address is written to MSR_VM_HSAVE_PA */
        struct page *save_area;
        struct vmcb *current_vmcb;

        /* index = sev_asid, value = vmcb pointer */
        struct vmcb **sev_vmcbs;
};
676
677 static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
678
/*
 * First MSR index of each range covered by the MSR permission map, in the
 * order the ranges are laid out in the map.
 */
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
/* Bytes of permission map per range */
#define MSRS_RANGE_SIZE 2048
/* MSRs per range: each MSR takes two bits (4 MSRs per byte) */
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)
684
685 static u32 svm_msrpm_offset(u32 msr)
686 {
687         u32 offset;
688         int i;
689
690         for (i = 0; i < NUM_MSR_MAPS; i++) {
691                 if (msr < msrpm_ranges[i] ||
692                     msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
693                         continue;
694
695                 offset  = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
696                 offset += (i * MSRS_RANGE_SIZE);       /* add range offset */
697
698                 /* Now we have the u8 offset - but need the u32 offset */
699                 return offset / 4;
700         }
701
702         /* MSR not in any range */
703         return MSR_INVALID;
704 }
705
706 #define MAX_INST_SIZE 15
707
/*
 * Execute the CLGI instruction.  The instruction is wrapped in __ex(),
 * i.e. __kvm_handle_fault_on_reboot().
 */
static inline void clgi(void)
{
        asm volatile (__ex("clgi"));
}
712
/*
 * Execute the STGI instruction.  The instruction is wrapped in __ex(),
 * i.e. __kvm_handle_fault_on_reboot().
 */
static inline void stgi(void)
{
        asm volatile (__ex("stgi"));
}
717
/*
 * Execute the INVLPGA instruction with @addr passed in the "a" register
 * and @asid in the "c" register.  The instruction is wrapped in __ex(),
 * i.e. __kvm_handle_fault_on_reboot().
 */
static inline void invlpga(unsigned long addr, u32 asid)
{
        asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}
722
723 static int get_npt_level(struct kvm_vcpu *vcpu)
724 {
725 #ifdef CONFIG_X86_64
726         return PT64_ROOT_4LEVEL;
727 #else
728         return PT32E_ROOT_LEVEL;
729 #endif
730 }
731
732 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
733 {
734         vcpu->arch.efer = efer;
735         if (!npt_enabled && !(efer & EFER_LMA))
736                 efer &= ~EFER_LME;
737
738         to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
739         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
740 }
741
742 static int is_external_interrupt(u32 info)
743 {
744         info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
745         return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
746 }
747
748 static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
749 {
750         struct vcpu_svm *svm = to_svm(vcpu);
751         u32 ret = 0;
752
753         if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
754                 ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
755         return ret;
756 }
757
758 static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
759 {
760         struct vcpu_svm *svm = to_svm(vcpu);
761
762         if (mask == 0)
763                 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
764         else
765                 svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
766
767 }
768
769 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
770 {
771         struct vcpu_svm *svm = to_svm(vcpu);
772
773         if (svm->vmcb->control.next_rip != 0) {
774                 WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
775                 svm->next_rip = svm->vmcb->control.next_rip;
776         }
777
778         if (!svm->next_rip) {
779                 if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
780                                 EMULATE_DONE)
781                         printk(KERN_DEBUG "%s: NOP\n", __func__);
782                 return;
783         }
784         if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
785                 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
786                        __func__, kvm_rip_read(vcpu), svm->next_rip);
787
788         kvm_rip_write(vcpu, svm->next_rip);
789         svm_set_interrupt_shadow(vcpu, 0);
790 }
791
792 static void svm_queue_exception(struct kvm_vcpu *vcpu)
793 {
794         struct vcpu_svm *svm = to_svm(vcpu);
795         unsigned nr = vcpu->arch.exception.nr;
796         bool has_error_code = vcpu->arch.exception.has_error_code;
797         bool reinject = vcpu->arch.exception.injected;
798         u32 error_code = vcpu->arch.exception.error_code;
799
800         /*
801          * If we are within a nested VM we'd better #VMEXIT and let the guest
802          * handle the exception
803          */
804         if (!reinject &&
805             nested_svm_check_exception(svm, nr, has_error_code, error_code))
806                 return;
807
808         kvm_deliver_exception_payload(&svm->vcpu);
809
810         if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
811                 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
812
813                 /*
814                  * For guest debugging where we have to reinject #BP if some
815                  * INT3 is guest-owned:
816                  * Emulate nRIP by moving RIP forward. Will fail if injection
817                  * raises a fault that is not intercepted. Still better than
818                  * failing in all cases.
819                  */
820                 skip_emulated_instruction(&svm->vcpu);
821                 rip = kvm_rip_read(&svm->vcpu);
822                 svm->int3_rip = rip + svm->vmcb->save.cs.base;
823                 svm->int3_injected = rip - old_rip;
824         }
825
826         svm->vmcb->control.event_inj = nr
827                 | SVM_EVTINJ_VALID
828                 | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
829                 | SVM_EVTINJ_TYPE_EXEPT;
830         svm->vmcb->control.event_inj_err = error_code;
831 }
832
833 static void svm_init_erratum_383(void)
834 {
835         u32 low, high;
836         int err;
837         u64 val;
838
839         if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
840                 return;
841
842         /* Use _safe variants to not break nested virtualization */
843         val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
844         if (err)
845                 return;
846
847         val |= (1ULL << 47);
848
849         low  = lower_32_bits(val);
850         high = upper_32_bits(val);
851
852         native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);
853
854         erratum_383_found = true;
855 }
856
857 static void svm_init_osvw(struct kvm_vcpu *vcpu)
858 {
859         /*
860          * Guests should see errata 400 and 415 as fixed (assuming that
861          * HLT and IO instructions are intercepted).
862          */
863         vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
864         vcpu->arch.osvw.status = osvw_status & ~(6ULL);
865
866         /*
867          * By increasing VCPU's osvw.length to 3 we are telling the guest that
868          * all osvw.status bits inside that length, including bit 0 (which is
869          * reserved for erratum 298), are valid. However, if host processor's
870          * osvw_len is 0 then osvw_status[0] carries no information. We need to
871          * be conservative here and therefore we tell the guest that erratum 298
872          * is present (because we really don't know).
873          */
874         if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
875                 vcpu->arch.osvw.status |= 1;
876 }
877
878 static int has_svm(void)
879 {
880         const char *msg;
881
882         if (!cpu_has_svm(&msg)) {
883                 printk(KERN_INFO "has_svm: %s\n", msg);
884                 return 0;
885         }
886
887         return 1;
888 }
889
/*
 * Undo the per-CPU setup: restore the default TSC ratio when the CPU has
 * X86_FEATURE_TSCRATEMSR, then call cpu_svm_disable() and
 * amd_pmu_disable_virt().
 */
static void svm_hardware_disable(void)
{
        /* Make sure we clean up behind us */
        if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
                wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);

        cpu_svm_disable();

        amd_pmu_disable_virt();
}
900
901 static int svm_hardware_enable(void)
902 {
903
904         struct svm_cpu_data *sd;
905         uint64_t efer;
906         struct desc_struct *gdt;
907         int me = raw_smp_processor_id();
908
909         rdmsrl(MSR_EFER, efer);
910         if (efer & EFER_SVME)
911                 return -EBUSY;
912
913         if (!has_svm()) {
914                 pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me);
915                 return -EINVAL;
916         }
917         sd = per_cpu(svm_data, me);
918         if (!sd) {
919                 pr_err("%s: svm_data is NULL on %d\n", __func__, me);
920                 return -EINVAL;
921         }
922
923         sd->asid_generation = 1;
924         sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
925         sd->next_asid = sd->max_asid + 1;
926         sd->min_asid = max_sev_asid + 1;
927
928         gdt = get_current_gdt_rw();
929         sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
930
931         wrmsrl(MSR_EFER, efer | EFER_SVME);
932
933         wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
934
935         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
936                 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
937                 __this_cpu_write(current_tsc_ratio, TSC_RATIO_DEFAULT);
938         }
939
940
941         /*
942          * Get OSVW bits.
943          *
944          * Note that it is possible to have a system with mixed processor
945          * revisions and therefore different OSVW bits. If bits are not the same
946          * on different processors then choose the worst case (i.e. if erratum
947          * is present on one processor and not on another then assume that the
948          * erratum is present everywhere).
949          */
950         if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
951                 uint64_t len, status = 0;
952                 int err;
953
954                 len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
955                 if (!err)
956                         status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
957                                                       &err);
958
959                 if (err)
960                         osvw_status = osvw_len = 0;
961                 else {
962                         if (len < osvw_len)
963                                 osvw_len = len;
964                         osvw_status |= status;
965                         osvw_status &= (1ULL << osvw_len) - 1;
966                 }
967         } else
968                 osvw_status = osvw_len = 0;
969
970         svm_init_erratum_383();
971
972         amd_pmu_enable_virt();
973
974         return 0;
975 }
976
977 static void svm_cpu_uninit(int cpu)
978 {
979         struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
980
981         if (!sd)
982                 return;
983
984         per_cpu(svm_data, raw_smp_processor_id()) = NULL;
985         kfree(sd->sev_vmcbs);
986         __free_page(sd->save_area);
987         kfree(sd);
988 }
989
990 static int svm_cpu_init(int cpu)
991 {
992         struct svm_cpu_data *sd;
993         int r;
994
995         sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
996         if (!sd)
997                 return -ENOMEM;
998         sd->cpu = cpu;
999         r = -ENOMEM;
1000         sd->save_area = alloc_page(GFP_KERNEL);
1001         if (!sd->save_area)
1002                 goto err_1;
1003
1004         if (svm_sev_enabled()) {
1005                 r = -ENOMEM;
1006                 sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
1007                                               sizeof(void *),
1008                                               GFP_KERNEL);
1009                 if (!sd->sev_vmcbs)
1010                         goto err_1;
1011         }
1012
1013         per_cpu(svm_data, cpu) = sd;
1014
1015         return 0;
1016
1017 err_1:
1018         kfree(sd);
1019         return r;
1020
1021 }
1022
1023 static bool valid_msr_intercept(u32 index)
1024 {
1025         int i;
1026
1027         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
1028                 if (direct_access_msrs[i].index == index)
1029                         return true;
1030
1031         return false;
1032 }
1033
1034 static bool msr_write_intercepted(struct kvm_vcpu *vcpu, unsigned msr)
1035 {
1036         u8 bit_write;
1037         unsigned long tmp;
1038         u32 offset;
1039         u32 *msrpm;
1040
1041         msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm:
1042                                       to_svm(vcpu)->msrpm;
1043
1044         offset    = svm_msrpm_offset(msr);
1045         bit_write = 2 * (msr & 0x0f) + 1;
1046         tmp       = msrpm[offset];
1047
1048         BUG_ON(offset == MSR_INVALID);
1049
1050         return !!test_bit(bit_write,  &tmp);
1051 }
1052
1053 static void set_msr_interception(u32 *msrpm, unsigned msr,
1054                                  int read, int write)
1055 {
1056         u8 bit_read, bit_write;
1057         unsigned long tmp;
1058         u32 offset;
1059
1060         /*
1061          * If this warning triggers extend the direct_access_msrs list at the
1062          * beginning of the file
1063          */
1064         WARN_ON(!valid_msr_intercept(msr));
1065
1066         offset    = svm_msrpm_offset(msr);
1067         bit_read  = 2 * (msr & 0x0f);
1068         bit_write = 2 * (msr & 0x0f) + 1;
1069         tmp       = msrpm[offset];
1070
1071         BUG_ON(offset == MSR_INVALID);
1072
1073         read  ? clear_bit(bit_read,  &tmp) : set_bit(bit_read,  &tmp);
1074         write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);
1075
1076         msrpm[offset] = tmp;
1077 }
1078
1079 static void svm_vcpu_init_msrpm(u32 *msrpm)
1080 {
1081         int i;
1082
1083         memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
1084
1085         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
1086                 if (!direct_access_msrs[i].always)
1087                         continue;
1088
1089                 set_msr_interception(msrpm, direct_access_msrs[i].index, 1, 1);
1090         }
1091 }
1092
1093 static void add_msr_offset(u32 offset)
1094 {
1095         int i;
1096
1097         for (i = 0; i < MSRPM_OFFSETS; ++i) {
1098
1099                 /* Offset already in list? */
1100                 if (msrpm_offsets[i] == offset)
1101                         return;
1102
1103                 /* Slot used by another offset? */
1104                 if (msrpm_offsets[i] != MSR_INVALID)
1105                         continue;
1106
1107                 /* Add offset to list */
1108                 msrpm_offsets[i] = offset;
1109
1110                 return;
1111         }
1112
1113         /*
1114          * If this BUG triggers the msrpm_offsets table has an overflow. Just
1115          * increase MSRPM_OFFSETS in this case.
1116          */
1117         BUG();
1118 }
1119
1120 static void init_msrpm_offsets(void)
1121 {
1122         int i;
1123
1124         memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));
1125
1126         for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
1127                 u32 offset;
1128
1129                 offset = svm_msrpm_offset(direct_access_msrs[i].index);
1130                 BUG_ON(offset == MSR_INVALID);
1131
1132                 add_msr_offset(offset);
1133         }
1134 }
1135
1136 static void svm_enable_lbrv(struct vcpu_svm *svm)
1137 {
1138         u32 *msrpm = svm->msrpm;
1139
1140         svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
1141         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
1142         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
1143         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
1144         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
1145 }
1146
1147 static void svm_disable_lbrv(struct vcpu_svm *svm)
1148 {
1149         u32 *msrpm = svm->msrpm;
1150
1151         svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
1152         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
1153         set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
1154         set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
1155         set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
1156 }
1157
1158 static void disable_nmi_singlestep(struct vcpu_svm *svm)
1159 {
1160         svm->nmi_singlestep = false;
1161
1162         if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
1163                 /* Clear our flags if they were not set by the guest */
1164                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
1165                         svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
1166                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
1167                         svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
1168         }
1169 }
1170
1171 /* Note:
1172  * This hash table is used to map VM_ID to a struct kvm_svm,
1173  * when handling AMD IOMMU GALOG notification to schedule in
1174  * a particular vCPU.
1175  */
1176 #define SVM_VM_DATA_HASH_BITS   8
1177 static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
1178 static u32 next_vm_id = 0;
1179 static bool next_vm_id_wrapped = 0;
1180 static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
1181
1182 /* Note:
1183  * This function is called from IOMMU driver to notify
1184  * SVM to schedule in a particular vCPU of a particular VM.
1185  */
1186 static int avic_ga_log_notifier(u32 ga_tag)
1187 {
1188         unsigned long flags;
1189         struct kvm_svm *kvm_svm;
1190         struct kvm_vcpu *vcpu = NULL;
1191         u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag);
1192         u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag);
1193
1194         pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id);
1195
1196         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1197         hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) {
1198                 if (kvm_svm->avic_vm_id != vm_id)
1199                         continue;
1200                 vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id);
1201                 break;
1202         }
1203         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1204
1205         /* Note:
1206          * At this point, the IOMMU should have already set the pending
1207          * bit in the vAPIC backing page. So, we just need to schedule
1208          * in the vcpu.
1209          */
1210         if (vcpu)
1211                 kvm_vcpu_wake_up(vcpu);
1212
1213         return 0;
1214 }
1215
1216 static __init int sev_hardware_setup(void)
1217 {
1218         struct sev_user_data_status *status;
1219         int rc;
1220
1221         /* Maximum number of encrypted guests supported simultaneously */
1222         max_sev_asid = cpuid_ecx(0x8000001F);
1223
1224         if (!max_sev_asid)
1225                 return 1;
1226
1227         /* Minimum ASID value that should be used for SEV guest */
1228         min_sev_asid = cpuid_edx(0x8000001F);
1229
1230         /* Initialize SEV ASID bitmap */
1231         sev_asid_bitmap = bitmap_zalloc(max_sev_asid, GFP_KERNEL);
1232         if (!sev_asid_bitmap)
1233                 return 1;
1234
1235         status = kmalloc(sizeof(*status), GFP_KERNEL);
1236         if (!status)
1237                 return 1;
1238
1239         /*
1240          * Check SEV platform status.
1241          *
1242          * PLATFORM_STATUS can be called in any state, if we failed to query
1243          * the PLATFORM status then either PSP firmware does not support SEV
1244          * feature or SEV firmware is dead.
1245          */
1246         rc = sev_platform_status(status, NULL);
1247         if (rc)
1248                 goto err;
1249
1250         pr_info("SEV supported\n");
1251
1252 err:
1253         kfree(status);
1254         return rc;
1255 }
1256
1257 static void grow_ple_window(struct kvm_vcpu *vcpu)
1258 {
1259         struct vcpu_svm *svm = to_svm(vcpu);
1260         struct vmcb_control_area *control = &svm->vmcb->control;
1261         int old = control->pause_filter_count;
1262
1263         control->pause_filter_count = __grow_ple_window(old,
1264                                                         pause_filter_count,
1265                                                         pause_filter_count_grow,
1266                                                         pause_filter_count_max);
1267
1268         if (control->pause_filter_count != old)
1269                 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1270
1271         trace_kvm_ple_window_grow(vcpu->vcpu_id,
1272                                   control->pause_filter_count, old);
1273 }
1274
1275 static void shrink_ple_window(struct kvm_vcpu *vcpu)
1276 {
1277         struct vcpu_svm *svm = to_svm(vcpu);
1278         struct vmcb_control_area *control = &svm->vmcb->control;
1279         int old = control->pause_filter_count;
1280
1281         control->pause_filter_count =
1282                                 __shrink_ple_window(old,
1283                                                     pause_filter_count,
1284                                                     pause_filter_count_shrink,
1285                                                     pause_filter_count);
1286         if (control->pause_filter_count != old)
1287                 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1288
1289         trace_kvm_ple_window_shrink(vcpu->vcpu_id,
1290                                     control->pause_filter_count, old);
1291 }
1292
1293 static __init int svm_hardware_setup(void)
1294 {
1295         int cpu;
1296         struct page *iopm_pages;
1297         void *iopm_va;
1298         int r;
1299
1300         iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
1301
1302         if (!iopm_pages)
1303                 return -ENOMEM;
1304
1305         iopm_va = page_address(iopm_pages);
1306         memset(iopm_va, 0xff, PAGE_SIZE * (1 << IOPM_ALLOC_ORDER));
1307         iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
1308
1309         init_msrpm_offsets();
1310
1311         if (boot_cpu_has(X86_FEATURE_NX))
1312                 kvm_enable_efer_bits(EFER_NX);
1313
1314         if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
1315                 kvm_enable_efer_bits(EFER_FFXSR);
1316
1317         if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
1318                 kvm_has_tsc_control = true;
1319                 kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
1320                 kvm_tsc_scaling_ratio_frac_bits = 32;
1321         }
1322
1323         /* Check for pause filtering support */
1324         if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
1325                 pause_filter_count = 0;
1326                 pause_filter_thresh = 0;
1327         } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
1328                 pause_filter_thresh = 0;
1329         }
1330
1331         if (nested) {
1332                 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
1333                 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
1334         }
1335
1336         if (sev) {
1337                 if (boot_cpu_has(X86_FEATURE_SEV) &&
1338                     IS_ENABLED(CONFIG_KVM_AMD_SEV)) {
1339                         r = sev_hardware_setup();
1340                         if (r)
1341                                 sev = false;
1342                 } else {
1343                         sev = false;
1344                 }
1345         }
1346
1347         for_each_possible_cpu(cpu) {
1348                 r = svm_cpu_init(cpu);
1349                 if (r)
1350                         goto err;
1351         }
1352
1353         if (!boot_cpu_has(X86_FEATURE_NPT))
1354                 npt_enabled = false;
1355
1356         if (npt_enabled && !npt) {
1357                 printk(KERN_INFO "kvm: Nested Paging disabled\n");
1358                 npt_enabled = false;
1359         }
1360
1361         if (npt_enabled) {
1362                 printk(KERN_INFO "kvm: Nested Paging enabled\n");
1363                 kvm_enable_tdp();
1364         } else
1365                 kvm_disable_tdp();
1366
1367         if (avic) {
1368                 if (!npt_enabled ||
1369                     !boot_cpu_has(X86_FEATURE_AVIC) ||
1370                     !IS_ENABLED(CONFIG_X86_LOCAL_APIC)) {
1371                         avic = false;
1372                 } else {
1373                         pr_info("AVIC enabled\n");
1374
1375                         amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
1376                 }
1377         }
1378
1379         if (vls) {
1380                 if (!npt_enabled ||
1381                     !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
1382                     !IS_ENABLED(CONFIG_X86_64)) {
1383                         vls = false;
1384                 } else {
1385                         pr_info("Virtual VMLOAD VMSAVE supported\n");
1386                 }
1387         }
1388
1389         if (vgif) {
1390                 if (!boot_cpu_has(X86_FEATURE_VGIF))
1391                         vgif = false;
1392                 else
1393                         pr_info("Virtual GIF supported\n");
1394         }
1395
1396         return 0;
1397
1398 err:
1399         __free_pages(iopm_pages, IOPM_ALLOC_ORDER);
1400         iopm_base = 0;
1401         return r;
1402 }
1403
1404 static __exit void svm_hardware_unsetup(void)
1405 {
1406         int cpu;
1407
1408         if (svm_sev_enabled())
1409                 bitmap_free(sev_asid_bitmap);
1410
1411         for_each_possible_cpu(cpu)
1412                 svm_cpu_uninit(cpu);
1413
1414         __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
1415         iopm_base = 0;
1416 }
1417
1418 static void init_seg(struct vmcb_seg *seg)
1419 {
1420         seg->selector = 0;
1421         seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
1422                       SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
1423         seg->limit = 0xffff;
1424         seg->base = 0;
1425 }
1426
1427 static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
1428 {
1429         seg->selector = 0;
1430         seg->attrib = SVM_SELECTOR_P_MASK | type;
1431         seg->limit = 0xffff;
1432         seg->base = 0;
1433 }
1434
1435 static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
1436 {
1437         struct vcpu_svm *svm = to_svm(vcpu);
1438
1439         if (is_guest_mode(vcpu))
1440                 return svm->nested.hsave->control.tsc_offset;
1441
1442         return vcpu->arch.tsc_offset;
1443 }
1444
1445 static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1446 {
1447         struct vcpu_svm *svm = to_svm(vcpu);
1448         u64 g_tsc_offset = 0;
1449
1450         if (is_guest_mode(vcpu)) {
1451                 /* Write L1's TSC offset.  */
1452                 g_tsc_offset = svm->vmcb->control.tsc_offset -
1453                                svm->nested.hsave->control.tsc_offset;
1454                 svm->nested.hsave->control.tsc_offset = offset;
1455         }
1456
1457         trace_kvm_write_tsc_offset(vcpu->vcpu_id,
1458                                    svm->vmcb->control.tsc_offset - g_tsc_offset,
1459                                    offset);
1460
1461         svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
1462
1463         mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
1464         return svm->vmcb->control.tsc_offset;
1465 }
1466
1467 static void avic_init_vmcb(struct vcpu_svm *svm)
1468 {
1469         struct vmcb *vmcb = svm->vmcb;
1470         struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
1471         phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
1472         phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
1473         phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page));
1474
1475         vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
1476         vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
1477         vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK;
1478         vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT;
1479         vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
1480 }
1481
1482 static void init_vmcb(struct vcpu_svm *svm)
1483 {
1484         struct vmcb_control_area *control = &svm->vmcb->control;
1485         struct vmcb_save_area *save = &svm->vmcb->save;
1486
1487         svm->vcpu.arch.hflags = 0;
1488
1489         set_cr_intercept(svm, INTERCEPT_CR0_READ);
1490         set_cr_intercept(svm, INTERCEPT_CR3_READ);
1491         set_cr_intercept(svm, INTERCEPT_CR4_READ);
1492         set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1493         set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1494         set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
1495         if (!kvm_vcpu_apicv_active(&svm->vcpu))
1496                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
1497
1498         set_dr_intercepts(svm);
1499
1500         set_exception_intercept(svm, PF_VECTOR);
1501         set_exception_intercept(svm, UD_VECTOR);
1502         set_exception_intercept(svm, MC_VECTOR);
1503         set_exception_intercept(svm, AC_VECTOR);
1504         set_exception_intercept(svm, DB_VECTOR);
1505         /*
1506          * Guest access to VMware backdoor ports could legitimately
1507          * trigger #GP because of TSS I/O permission bitmap.
1508          * We intercept those #GP and allow access to them anyway
1509          * as VMware does.
1510          */
1511         if (enable_vmware_backdoor)
1512                 set_exception_intercept(svm, GP_VECTOR);
1513
1514         set_intercept(svm, INTERCEPT_INTR);
1515         set_intercept(svm, INTERCEPT_NMI);
1516         set_intercept(svm, INTERCEPT_SMI);
1517         set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
1518         set_intercept(svm, INTERCEPT_RDPMC);
1519         set_intercept(svm, INTERCEPT_CPUID);
1520         set_intercept(svm, INTERCEPT_INVD);
1521         set_intercept(svm, INTERCEPT_INVLPG);
1522         set_intercept(svm, INTERCEPT_INVLPGA);
1523         set_intercept(svm, INTERCEPT_IOIO_PROT);
1524         set_intercept(svm, INTERCEPT_MSR_PROT);
1525         set_intercept(svm, INTERCEPT_TASK_SWITCH);
1526         set_intercept(svm, INTERCEPT_SHUTDOWN);
1527         set_intercept(svm, INTERCEPT_VMRUN);
1528         set_intercept(svm, INTERCEPT_VMMCALL);
1529         set_intercept(svm, INTERCEPT_VMLOAD);
1530         set_intercept(svm, INTERCEPT_VMSAVE);
1531         set_intercept(svm, INTERCEPT_STGI);
1532         set_intercept(svm, INTERCEPT_CLGI);
1533         set_intercept(svm, INTERCEPT_SKINIT);
1534         set_intercept(svm, INTERCEPT_WBINVD);
1535         set_intercept(svm, INTERCEPT_XSETBV);
1536         set_intercept(svm, INTERCEPT_RSM);
1537
1538         if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
1539                 set_intercept(svm, INTERCEPT_MONITOR);
1540                 set_intercept(svm, INTERCEPT_MWAIT);
1541         }
1542
1543         if (!kvm_hlt_in_guest(svm->vcpu.kvm))
1544                 set_intercept(svm, INTERCEPT_HLT);
1545
1546         control->iopm_base_pa = __sme_set(iopm_base);
1547         control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
1548         control->int_ctl = V_INTR_MASKING_MASK;
1549
1550         init_seg(&save->es);
1551         init_seg(&save->ss);
1552         init_seg(&save->ds);
1553         init_seg(&save->fs);
1554         init_seg(&save->gs);
1555
1556         save->cs.selector = 0xf000;
1557         save->cs.base = 0xffff0000;
1558         /* Executable/Readable Code Segment */
1559         save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
1560                 SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
1561         save->cs.limit = 0xffff;
1562
1563         save->gdtr.limit = 0xffff;
1564         save->idtr.limit = 0xffff;
1565
1566         init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
1567         init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
1568
1569         svm_set_efer(&svm->vcpu, 0);
1570         save->dr6 = 0xffff0ff0;
1571         kvm_set_rflags(&svm->vcpu, 2);
1572         save->rip = 0x0000fff0;
1573         svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
1574
1575         /*
1576          * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
1577          * It also updates the guest-visible cr0 value.
1578          */
1579         svm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
1580         kvm_mmu_reset_context(&svm->vcpu);
1581
1582         save->cr4 = X86_CR4_PAE;
1583         /* rdx = ?? */
1584
1585         if (npt_enabled) {
1586                 /* Setup VMCB for Nested Paging */
1587                 control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
1588                 clr_intercept(svm, INTERCEPT_INVLPG);
1589                 clr_exception_intercept(svm, PF_VECTOR);
1590                 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
1591                 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
1592                 save->g_pat = svm->vcpu.arch.pat;
1593                 save->cr3 = 0;
1594                 save->cr4 = 0;
1595         }
1596         svm->asid_generation = 0;
1597
1598         svm->nested.vmcb = 0;
1599         svm->vcpu.arch.hflags = 0;
1600
1601         if (pause_filter_count) {
1602                 control->pause_filter_count = pause_filter_count;
1603                 if (pause_filter_thresh)
1604                         control->pause_filter_thresh = pause_filter_thresh;
1605                 set_intercept(svm, INTERCEPT_PAUSE);
1606         } else {
1607                 clr_intercept(svm, INTERCEPT_PAUSE);
1608         }
1609
1610         if (kvm_vcpu_apicv_active(&svm->vcpu))
1611                 avic_init_vmcb(svm);
1612
1613         /*
1614          * If hardware supports Virtual VMLOAD VMSAVE then enable it
1615          * in VMCB and clear intercepts to avoid #VMEXIT.
1616          */
1617         if (vls) {
1618                 clr_intercept(svm, INTERCEPT_VMLOAD);
1619                 clr_intercept(svm, INTERCEPT_VMSAVE);
1620                 svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
1621         }
1622
1623         if (vgif) {
1624                 clr_intercept(svm, INTERCEPT_STGI);
1625                 clr_intercept(svm, INTERCEPT_CLGI);
1626                 svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
1627         }
1628
1629         if (sev_guest(svm->vcpu.kvm)) {
1630                 svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE;
1631                 clr_exception_intercept(svm, UD_VECTOR);
1632         }
1633
1634         mark_all_dirty(svm->vmcb);
1635
1636         enable_gif(svm);
1637
1638 }
1639
1640 static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu,
1641                                        unsigned int index)
1642 {
1643         u64 *avic_physical_id_table;
1644         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
1645
1646         if (index >= AVIC_MAX_PHYSICAL_ID_COUNT)
1647                 return NULL;
1648
1649         avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page);
1650
1651         return &avic_physical_id_table[index];
1652 }
1653
1654 /**
1655  * Note:
1656  * AVIC hardware walks the nested page table to check permissions,
1657  * but does not use the SPA address specified in the leaf page
1658  * table entry since it uses  address in the AVIC_BACKING_PAGE pointer
1659  * field of the VMCB. Therefore, we set up the
1660  * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here.
1661  */
1662 static int avic_init_access_page(struct kvm_vcpu *vcpu)
1663 {
1664         struct kvm *kvm = vcpu->kvm;
1665         int ret = 0;
1666
1667         mutex_lock(&kvm->slots_lock);
1668         if (kvm->arch.apic_access_page_done)
1669                 goto out;
1670
1671         ret = __x86_set_memory_region(kvm,
1672                                       APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
1673                                       APIC_DEFAULT_PHYS_BASE,
1674                                       PAGE_SIZE);
1675         if (ret)
1676                 goto out;
1677
1678         kvm->arch.apic_access_page_done = true;
1679 out:
1680         mutex_unlock(&kvm->slots_lock);
1681         return ret;
1682 }
1683
1684 static int avic_init_backing_page(struct kvm_vcpu *vcpu)
1685 {
1686         int ret;
1687         u64 *entry, new_entry;
1688         int id = vcpu->vcpu_id;
1689         struct vcpu_svm *svm = to_svm(vcpu);
1690
1691         ret = avic_init_access_page(vcpu);
1692         if (ret)
1693                 return ret;
1694
1695         if (id >= AVIC_MAX_PHYSICAL_ID_COUNT)
1696                 return -EINVAL;
1697
1698         if (!svm->vcpu.arch.apic->regs)
1699                 return -EINVAL;
1700
1701         svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs);
1702
1703         /* Setting AVIC backing page address in the phy APIC ID table */
1704         entry = avic_get_physical_id_entry(vcpu, id);
1705         if (!entry)
1706                 return -EINVAL;
1707
1708         new_entry = READ_ONCE(*entry);
1709         new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
1710                               AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
1711                               AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
1712         WRITE_ONCE(*entry, new_entry);
1713
1714         svm->avic_physical_id_cache = entry;
1715
1716         return 0;
1717 }
1718
1719 static void __sev_asid_free(int asid)
1720 {
1721         struct svm_cpu_data *sd;
1722         int cpu, pos;
1723
1724         pos = asid - 1;
1725         clear_bit(pos, sev_asid_bitmap);
1726
1727         for_each_possible_cpu(cpu) {
1728                 sd = per_cpu(svm_data, cpu);
1729                 sd->sev_vmcbs[pos] = NULL;
1730         }
1731 }
1732
1733 static void sev_asid_free(struct kvm *kvm)
1734 {
1735         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1736
1737         __sev_asid_free(sev->asid);
1738 }
1739
1740 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
1741 {
1742         struct sev_data_decommission *decommission;
1743         struct sev_data_deactivate *data;
1744
1745         if (!handle)
1746                 return;
1747
1748         data = kzalloc(sizeof(*data), GFP_KERNEL);
1749         if (!data)
1750                 return;
1751
1752         /* deactivate handle */
1753         data->handle = handle;
1754         sev_guest_deactivate(data, NULL);
1755
1756         wbinvd_on_all_cpus();
1757         sev_guest_df_flush(NULL);
1758         kfree(data);
1759
1760         decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
1761         if (!decommission)
1762                 return;
1763
1764         /* decommission handle */
1765         decommission->handle = handle;
1766         sev_guest_decommission(decommission, NULL);
1767
1768         kfree(decommission);
1769 }
1770
1771 static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
1772                                     unsigned long ulen, unsigned long *n,
1773                                     int write)
1774 {
1775         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1776         unsigned long npages, npinned, size;
1777         unsigned long locked, lock_limit;
1778         struct page **pages;
1779         unsigned long first, last;
1780
1781         if (ulen == 0 || uaddr + ulen < uaddr)
1782                 return NULL;
1783
1784         /* Calculate number of pages. */
1785         first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
1786         last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
1787         npages = (last - first + 1);
1788
1789         locked = sev->pages_locked + npages;
1790         lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1791         if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
1792                 pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
1793                 return NULL;
1794         }
1795
1796         /* Avoid using vmalloc for smaller buffers. */
1797         size = npages * sizeof(struct page *);
1798         if (size > PAGE_SIZE)
1799                 pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
1800                                   PAGE_KERNEL);
1801         else
1802                 pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
1803
1804         if (!pages)
1805                 return NULL;
1806
1807         /* Pin the user virtual address. */
1808         npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
1809         if (npinned != npages) {
1810                 pr_err("SEV: Failure locking %lu pages.\n", npages);
1811                 goto err;
1812         }
1813
1814         *n = npages;
1815         sev->pages_locked = locked;
1816
1817         return pages;
1818
1819 err:
1820         if (npinned > 0)
1821                 release_pages(pages, npinned);
1822
1823         kvfree(pages);
1824         return NULL;
1825 }
1826
1827 static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
1828                              unsigned long npages)
1829 {
1830         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1831
1832         release_pages(pages, npages);
1833         kvfree(pages);
1834         sev->pages_locked -= npages;
1835 }
1836
1837 static void sev_clflush_pages(struct page *pages[], unsigned long npages)
1838 {
1839         uint8_t *page_virtual;
1840         unsigned long i;
1841
1842         if (npages == 0 || pages == NULL)
1843                 return;
1844
1845         for (i = 0; i < npages; i++) {
1846                 page_virtual = kmap_atomic(pages[i]);
1847                 clflush_cache_range(page_virtual, PAGE_SIZE);
1848                 kunmap_atomic(page_virtual);
1849         }
1850 }
1851
1852 static void __unregister_enc_region_locked(struct kvm *kvm,
1853                                            struct enc_region *region)
1854 {
1855         /*
1856          * The guest may change the memory encryption attribute from C=0 -> C=1
1857          * or vice versa for this memory range. Lets make sure caches are
1858          * flushed to ensure that guest data gets written into memory with
1859          * correct C-bit.
1860          */
1861         sev_clflush_pages(region->pages, region->npages);
1862
1863         sev_unpin_memory(kvm, region->pages, region->npages);
1864         list_del(&region->list);
1865         kfree(region);
1866 }
1867
1868 static struct kvm *svm_vm_alloc(void)
1869 {
1870         struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
1871                                             GFP_KERNEL_ACCOUNT | __GFP_ZERO,
1872                                             PAGE_KERNEL);
1873         return &kvm_svm->kvm;
1874 }
1875
/* Free the struct kvm_svm that contains @kvm. */
static void svm_vm_free(struct kvm *kvm)
{
	struct kvm_svm *kvm_svm = to_kvm_svm(kvm);

	vfree(kvm_svm);
}
1880
1881 static void sev_vm_destroy(struct kvm *kvm)
1882 {
1883         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
1884         struct list_head *head = &sev->regions_list;
1885         struct list_head *pos, *q;
1886
1887         if (!sev_guest(kvm))
1888                 return;
1889
1890         mutex_lock(&kvm->lock);
1891
1892         /*
1893          * if userspace was terminated before unregistering the memory regions
1894          * then lets unpin all the registered memory.
1895          */
1896         if (!list_empty(head)) {
1897                 list_for_each_safe(pos, q, head) {
1898                         __unregister_enc_region_locked(kvm,
1899                                 list_entry(pos, struct enc_region, list));
1900                 }
1901         }
1902
1903         mutex_unlock(&kvm->lock);
1904
1905         sev_unbind_asid(kvm, sev->handle);
1906         sev_asid_free(kvm);
1907 }
1908
1909 static void avic_vm_destroy(struct kvm *kvm)
1910 {
1911         unsigned long flags;
1912         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1913
1914         if (!avic)
1915                 return;
1916
1917         if (kvm_svm->avic_logical_id_table_page)
1918                 __free_page(kvm_svm->avic_logical_id_table_page);
1919         if (kvm_svm->avic_physical_id_table_page)
1920                 __free_page(kvm_svm->avic_physical_id_table_page);
1921
1922         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1923         hash_del(&kvm_svm->hnode);
1924         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1925 }
1926
/* Tear down the per-VM AVIC state first, then the per-VM SEV state. */
static void svm_vm_destroy(struct kvm *kvm)
{
	avic_vm_destroy(kvm);

	sev_vm_destroy(kvm);
}
1932
1933 static int avic_vm_init(struct kvm *kvm)
1934 {
1935         unsigned long flags;
1936         int err = -ENOMEM;
1937         struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
1938         struct kvm_svm *k2;
1939         struct page *p_page;
1940         struct page *l_page;
1941         u32 vm_id;
1942
1943         if (!avic)
1944                 return 0;
1945
1946         /* Allocating physical APIC ID table (4KB) */
1947         p_page = alloc_page(GFP_KERNEL_ACCOUNT);
1948         if (!p_page)
1949                 goto free_avic;
1950
1951         kvm_svm->avic_physical_id_table_page = p_page;
1952         clear_page(page_address(p_page));
1953
1954         /* Allocating logical APIC ID table (4KB) */
1955         l_page = alloc_page(GFP_KERNEL_ACCOUNT);
1956         if (!l_page)
1957                 goto free_avic;
1958
1959         kvm_svm->avic_logical_id_table_page = l_page;
1960         clear_page(page_address(l_page));
1961
1962         spin_lock_irqsave(&svm_vm_data_hash_lock, flags);
1963  again:
1964         vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK;
1965         if (vm_id == 0) { /* id is 1-based, zero is not okay */
1966                 next_vm_id_wrapped = 1;
1967                 goto again;
1968         }
1969         /* Is it still in use? Only possible if wrapped at least once */
1970         if (next_vm_id_wrapped) {
1971                 hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) {
1972                         if (k2->avic_vm_id == vm_id)
1973                                 goto again;
1974                 }
1975         }
1976         kvm_svm->avic_vm_id = vm_id;
1977         hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id);
1978         spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
1979
1980         return 0;
1981
1982 free_avic:
1983         avic_vm_destroy(kvm);
1984         return err;
1985 }
1986
1987 static inline int
1988 avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r)
1989 {
1990         int ret = 0;
1991         unsigned long flags;
1992         struct amd_svm_iommu_ir *ir;
1993         struct vcpu_svm *svm = to_svm(vcpu);
1994
1995         if (!kvm_arch_has_assigned_device(vcpu->kvm))
1996                 return 0;
1997
1998         /*
1999          * Here, we go through the per-vcpu ir_list to update all existing
2000          * interrupt remapping table entry targeting this vcpu.
2001          */
2002         spin_lock_irqsave(&svm->ir_list_lock, flags);
2003
2004         if (list_empty(&svm->ir_list))
2005                 goto out;
2006
2007         list_for_each_entry(ir, &svm->ir_list, node) {
2008                 ret = amd_iommu_update_ga(cpu, r, ir->data);
2009                 if (ret)
2010                         break;
2011         }
2012 out:
2013         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
2014         return ret;
2015 }
2016
2017 static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2018 {
2019         u64 entry;
2020         /* ID = 0xff (broadcast), ID > 0xff (reserved) */
2021         int h_physical_id = kvm_cpu_get_apicid(cpu);
2022         struct vcpu_svm *svm = to_svm(vcpu);
2023
2024         if (!kvm_vcpu_apicv_active(vcpu))
2025                 return;
2026
2027         if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
2028                 return;
2029
2030         entry = READ_ONCE(*(svm->avic_physical_id_cache));
2031         WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
2032
2033         entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
2034         entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
2035
2036         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
2037         if (svm->avic_is_running)
2038                 entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
2039
2040         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
2041         avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
2042                                         svm->avic_is_running);
2043 }
2044
2045 static void avic_vcpu_put(struct kvm_vcpu *vcpu)
2046 {
2047         u64 entry;
2048         struct vcpu_svm *svm = to_svm(vcpu);
2049
2050         if (!kvm_vcpu_apicv_active(vcpu))
2051                 return;
2052
2053         entry = READ_ONCE(*(svm->avic_physical_id_cache));
2054         if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
2055                 avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
2056
2057         entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
2058         WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
2059 }
2060
2061 /**
2062  * This function is called during VCPU halt/unhalt.
2063  */
2064 static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
2065 {
2066         struct vcpu_svm *svm = to_svm(vcpu);
2067
2068         svm->avic_is_running = is_run;
2069         if (is_run)
2070                 avic_vcpu_load(vcpu, vcpu->cpu);
2071         else
2072                 avic_vcpu_put(vcpu);
2073 }
2074
2075 static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
2076 {
2077         struct vcpu_svm *svm = to_svm(vcpu);
2078         u32 dummy;
2079         u32 eax = 1;
2080
2081         vcpu->arch.microcode_version = 0x01000065;
2082         svm->spec_ctrl = 0;
2083         svm->virt_spec_ctrl = 0;
2084
2085         if (!init_event) {
2086                 svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
2087                                            MSR_IA32_APICBASE_ENABLE;
2088                 if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
2089                         svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
2090         }
2091         init_vmcb(svm);
2092
2093         kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
2094         kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
2095
2096         if (kvm_vcpu_apicv_active(vcpu) && !init_event)
2097                 avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
2098 }
2099
2100 static int avic_init_vcpu(struct vcpu_svm *svm)
2101 {
2102         int ret;
2103
2104         if (!kvm_vcpu_apicv_active(&svm->vcpu))
2105                 return 0;
2106
2107         ret = avic_init_backing_page(&svm->vcpu);
2108         if (ret)
2109                 return ret;
2110
2111         INIT_LIST_HEAD(&svm->ir_list);
2112         spin_lock_init(&svm->ir_list_lock);
2113         svm->dfr_reg = APIC_DFR_FLAT;
2114
2115         return ret;
2116 }
2117
2118 static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
2119 {
2120         struct vcpu_svm *svm;
2121         struct page *page;
2122         struct page *msrpm_pages;
2123         struct page *hsave_page;
2124         struct page *nested_msrpm_pages;
2125         int err;
2126
2127         svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
2128         if (!svm) {
2129                 err = -ENOMEM;
2130                 goto out;
2131         }
2132
2133         svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
2134                                                      GFP_KERNEL_ACCOUNT);
2135         if (!svm->vcpu.arch.guest_fpu) {
2136                 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
2137                 err = -ENOMEM;
2138                 goto free_partial_svm;
2139         }
2140
2141         err = kvm_vcpu_init(&svm->vcpu, kvm, id);
2142         if (err)
2143                 goto free_svm;
2144
2145         err = -ENOMEM;
2146         page = alloc_page(GFP_KERNEL_ACCOUNT);
2147         if (!page)
2148                 goto uninit;
2149
2150         msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
2151         if (!msrpm_pages)
2152                 goto free_page1;
2153
2154         nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
2155         if (!nested_msrpm_pages)
2156                 goto free_page2;
2157
2158         hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
2159         if (!hsave_page)
2160                 goto free_page3;
2161
2162         err = avic_init_vcpu(svm);
2163         if (err)
2164                 goto free_page4;
2165
2166         /* We initialize this flag to true to make sure that the is_running
2167          * bit would be set the first time the vcpu is loaded.
2168          */
2169         svm->avic_is_running = true;
2170
2171         svm->nested.hsave = page_address(hsave_page);
2172
2173         svm->msrpm = page_address(msrpm_pages);
2174         svm_vcpu_init_msrpm(svm->msrpm);
2175
2176         svm->nested.msrpm = page_address(nested_msrpm_pages);
2177         svm_vcpu_init_msrpm(svm->nested.msrpm);
2178
2179         svm->vmcb = page_address(page);
2180         clear_page(svm->vmcb);
2181         svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
2182         svm->asid_generation = 0;
2183         init_vmcb(svm);
2184
2185         svm_init_osvw(&svm->vcpu);
2186
2187         return &svm->vcpu;
2188
2189 free_page4:
2190         __free_page(hsave_page);
2191 free_page3:
2192         __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
2193 free_page2:
2194         __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
2195 free_page1:
2196         __free_page(page);
2197 uninit:
2198         kvm_vcpu_uninit(&svm->vcpu);
2199 free_svm:
2200         kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
2201 free_partial_svm:
2202         kmem_cache_free(kvm_vcpu_cache, svm);
2203 out:
2204         return ERR_PTR(err);
2205 }
2206
2207 static void svm_clear_current_vmcb(struct vmcb *vmcb)
2208 {
2209         int i;
2210
2211         for_each_online_cpu(i)
2212                 cmpxchg(&per_cpu(svm_data, i)->current_vmcb, vmcb, NULL);
2213 }
2214
2215 static void svm_free_vcpu(struct kvm_vcpu *vcpu)
2216 {
2217         struct vcpu_svm *svm = to_svm(vcpu);
2218
2219         /*
2220          * The vmcb page can be recycled, causing a false negative in
2221          * svm_vcpu_load(). So, ensure that no logical CPU has this
2222          * vmcb page recorded as its current vmcb.
2223          */
2224         svm_clear_current_vmcb(svm->vmcb);
2225
2226         __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
2227         __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
2228         __free_page(virt_to_page(svm->nested.hsave));
2229         __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
2230         kvm_vcpu_uninit(vcpu);
2231         kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
2232         kmem_cache_free(kvm_vcpu_cache, svm);
2233 }
2234
2235 static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2236 {
2237         struct vcpu_svm *svm = to_svm(vcpu);
2238         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2239         int i;
2240
2241         if (unlikely(cpu != vcpu->cpu)) {
2242                 svm->asid_generation = 0;
2243                 mark_all_dirty(svm->vmcb);
2244         }
2245
2246 #ifdef CONFIG_X86_64
2247         rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
2248 #endif
2249         savesegment(fs, svm->host.fs);
2250         savesegment(gs, svm->host.gs);
2251         svm->host.ldt = kvm_read_ldt();
2252
2253         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
2254                 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
2255
2256         if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
2257                 u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio;
2258                 if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) {
2259                         __this_cpu_write(current_tsc_ratio, tsc_ratio);
2260                         wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio);
2261                 }
2262         }
2263         /* This assumes that the kernel never uses MSR_TSC_AUX */
2264         if (static_cpu_has(X86_FEATURE_RDTSCP))
2265                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
2266
2267         if (sd->current_vmcb != svm->vmcb) {
2268                 sd->current_vmcb = svm->vmcb;
2269                 indirect_branch_prediction_barrier();
2270         }
2271         avic_vcpu_load(vcpu, cpu);
2272 }
2273
2274 static void svm_vcpu_put(struct kvm_vcpu *vcpu)
2275 {
2276         struct vcpu_svm *svm = to_svm(vcpu);
2277         int i;
2278
2279         avic_vcpu_put(vcpu);
2280
2281         ++vcpu->stat.host_state_reload;
2282         kvm_load_ldt(svm->host.ldt);
2283 #ifdef CONFIG_X86_64
2284         loadsegment(fs, svm->host.fs);
2285         wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gsbase);
2286         load_gs_index(svm->host.gs);
2287 #else
2288 #ifdef CONFIG_X86_32_LAZY_GS
2289         loadsegment(gs, svm->host.gs);
2290 #endif
2291 #endif
2292         for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
2293                 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
2294 }
2295
2296 static void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
2297 {
2298         avic_set_running(vcpu, false);
2299 }
2300
2301 static void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
2302 {
2303         avic_set_running(vcpu, true);
2304 }
2305
2306 static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
2307 {
2308         struct vcpu_svm *svm = to_svm(vcpu);
2309         unsigned long rflags = svm->vmcb->save.rflags;
2310
2311         if (svm->nmi_singlestep) {
2312                 /* Hide our flags if they were not set by the guest */
2313                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
2314                         rflags &= ~X86_EFLAGS_TF;
2315                 if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
2316                         rflags &= ~X86_EFLAGS_RF;
2317         }
2318         return rflags;
2319 }
2320
2321 static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
2322 {
2323         if (to_svm(vcpu)->nmi_singlestep)
2324                 rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2325
2326        /*
2327         * Any change of EFLAGS.VM is accompanied by a reload of SS
2328         * (caused by either a task switch or an inter-privilege IRET),
2329         * so we do not need to update the CPL here.
2330         */
2331         to_svm(vcpu)->vmcb->save.rflags = rflags;
2332 }
2333
2334 static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
2335 {
2336         switch (reg) {
2337         case VCPU_EXREG_PDPTR:
2338                 BUG_ON(!npt_enabled);
2339                 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
2340                 break;
2341         default:
2342                 BUG();
2343         }
2344 }
2345
2346 static void svm_set_vintr(struct vcpu_svm *svm)
2347 {
2348         set_intercept(svm, INTERCEPT_VINTR);
2349 }
2350
2351 static void svm_clear_vintr(struct vcpu_svm *svm)
2352 {
2353         clr_intercept(svm, INTERCEPT_VINTR);
2354 }
2355
2356 static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
2357 {
2358         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
2359
2360         switch (seg) {
2361         case VCPU_SREG_CS: return &save->cs;
2362         case VCPU_SREG_DS: return &save->ds;
2363         case VCPU_SREG_ES: return &save->es;
2364         case VCPU_SREG_FS: return &save->fs;
2365         case VCPU_SREG_GS: return &save->gs;
2366         case VCPU_SREG_SS: return &save->ss;
2367         case VCPU_SREG_TR: return &save->tr;
2368         case VCPU_SREG_LDTR: return &save->ldtr;
2369         }
2370         BUG();
2371         return NULL;
2372 }
2373
2374 static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2375 {
2376         struct vmcb_seg *s = svm_seg(vcpu, seg);
2377
2378         return s->base;
2379 }
2380
2381 static void svm_get_segment(struct kvm_vcpu *vcpu,
2382                             struct kvm_segment *var, int seg)
2383 {
2384         struct vmcb_seg *s = svm_seg(vcpu, seg);
2385
2386         var->base = s->base;
2387         var->limit = s->limit;
2388         var->selector = s->selector;
2389         var->type = s->attrib & SVM_SELECTOR_TYPE_MASK;
2390         var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1;
2391         var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3;
2392         var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1;
2393         var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1;
2394         var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
2395         var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
2396
2397         /*
2398          * AMD CPUs circa 2014 track the G bit for all segments except CS.
2399          * However, the SVM spec states that the G bit is not observed by the
2400          * CPU, and some VMware virtual CPUs drop the G bit for all segments.
2401          * So let's synthesize a legal G bit for all segments, this helps
2402          * running KVM nested. It also helps cross-vendor migration, because
2403          * Intel's vmentry has a check on the 'G' bit.
2404          */
2405         var->g = s->limit > 0xfffff;
2406
2407         /*
2408          * AMD's VMCB does not have an explicit unusable field, so emulate it
2409          * for cross vendor migration purposes by "not present"
2410          */
2411         var->unusable = !var->present;
2412
2413         switch (seg) {
2414         case VCPU_SREG_TR:
2415                 /*
2416                  * Work around a bug where the busy flag in the tr selector
2417                  * isn't exposed
2418                  */
2419                 var->type |= 0x2;
2420                 break;
2421         case VCPU_SREG_DS:
2422         case VCPU_SREG_ES:
2423         case VCPU_SREG_FS:
2424         case VCPU_SREG_GS:
2425                 /*
2426                  * The accessed bit must always be set in the segment
2427                  * descriptor cache, although it can be cleared in the
2428                  * descriptor, the cached bit always remains at 1. Since
2429                  * Intel has a check on this, set it here to support
2430                  * cross-vendor migration.
2431                  */
2432                 if (!var->unusable)
2433                         var->type |= 0x1;
2434                 break;
2435         case VCPU_SREG_SS:
2436                 /*
2437                  * On AMD CPUs sometimes the DB bit in the segment
2438                  * descriptor is left as 1, although the whole segment has
2439                  * been made unusable. Clear it here to pass an Intel VMX
2440                  * entry check when cross vendor migrating.
2441                  */
2442                 if (var->unusable)
2443                         var->db = 0;
2444                 /* This is symmetric with svm_set_segment() */
2445                 var->dpl = to_svm(vcpu)->vmcb->save.cpl;
2446                 break;
2447         }
2448 }
2449
2450 static int svm_get_cpl(struct kvm_vcpu *vcpu)
2451 {
2452         struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
2453
2454         return save->cpl;
2455 }
2456
2457 static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2458 {
2459         struct vcpu_svm *svm = to_svm(vcpu);
2460
2461         dt->size = svm->vmcb->save.idtr.limit;
2462         dt->address = svm->vmcb->save.idtr.base;
2463 }
2464
2465 static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2466 {
2467         struct vcpu_svm *svm = to_svm(vcpu);
2468
2469         svm->vmcb->save.idtr.limit = dt->size;
2470         svm->vmcb->save.idtr.base = dt->address ;
2471         mark_dirty(svm->vmcb, VMCB_DT);
2472 }
2473
2474 static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2475 {
2476         struct vcpu_svm *svm = to_svm(vcpu);
2477
2478         dt->size = svm->vmcb->save.gdtr.limit;
2479         dt->address = svm->vmcb->save.gdtr.base;
2480 }
2481
2482 static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
2483 {
2484         struct vcpu_svm *svm = to_svm(vcpu);
2485
2486         svm->vmcb->save.gdtr.limit = dt->size;
2487         svm->vmcb->save.gdtr.base = dt->address ;
2488         mark_dirty(svm->vmcb, VMCB_DT);
2489 }
2490
/* Intentionally empty: this implementation has nothing to do here. */
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
2494
/* Intentionally empty: this implementation has nothing to do here. */
static void svm_decache_cr3(struct kvm_vcpu *vcpu)
{
}
2498
/* Intentionally empty: this implementation has nothing to do here. */
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
2502
2503 static void update_cr0_intercept(struct vcpu_svm *svm)
2504 {
2505         ulong gcr0 = svm->vcpu.arch.cr0;
2506         u64 *hcr0 = &svm->vmcb->save.cr0;
2507
2508         *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
2509                 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
2510
2511         mark_dirty(svm->vmcb, VMCB_CR);
2512
2513         if (gcr0 == *hcr0) {
2514                 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
2515                 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2516         } else {
2517                 set_cr_intercept(svm, INTERCEPT_CR0_READ);
2518                 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
2519         }
2520 }
2521
2522 static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
2523 {
2524         struct vcpu_svm *svm = to_svm(vcpu);
2525
2526 #ifdef CONFIG_X86_64
2527         if (vcpu->arch.efer & EFER_LME) {
2528                 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
2529                         vcpu->arch.efer |= EFER_LMA;
2530                         svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
2531                 }
2532
2533                 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
2534                         vcpu->arch.efer &= ~EFER_LMA;
2535                         svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
2536                 }
2537         }
2538 #endif
2539         vcpu->arch.cr0 = cr0;
2540
2541         if (!npt_enabled)
2542                 cr0 |= X86_CR0_PG | X86_CR0_WP;
2543
2544         /*
2545          * re-enable caching here because the QEMU bios
2546          * does not do it - this results in some delay at
2547          * reboot
2548          */
2549         if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
2550                 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
2551         svm->vmcb->save.cr0 = cr0;
2552         mark_dirty(svm->vmcb, VMCB_CR);
2553         update_cr0_intercept(svm);
2554 }
2555
2556 static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
2557 {
2558         unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
2559         unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
2560
2561         if (cr4 & X86_CR4_VMXE)
2562                 return 1;
2563
2564         if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
2565                 svm_flush_tlb(vcpu, true);
2566
2567         vcpu->arch.cr4 = cr4;
2568         if (!npt_enabled)
2569                 cr4 |= X86_CR4_PAE;
2570         cr4 |= host_cr4_mce;
2571         to_svm(vcpu)->vmcb->save.cr4 = cr4;
2572         mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
2573         return 0;
2574 }
2575
2576 static void svm_set_segment(struct kvm_vcpu *vcpu,
2577                             struct kvm_segment *var, int seg)
2578 {
2579         struct vcpu_svm *svm = to_svm(vcpu);
2580         struct vmcb_seg *s = svm_seg(vcpu, seg);
2581
2582         s->base = var->base;
2583         s->limit = var->limit;
2584         s->selector = var->selector;
2585         s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK);
2586         s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT;
2587         s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT;
2588         s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT;
2589         s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT;
2590         s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT;
2591         s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT;
2592         s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT;
2593
2594         /*
2595          * This is always accurate, except if SYSRET returned to a segment
2596          * with SS.DPL != 3.  Intel does not have this quirk, and always
2597          * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it
2598          * would entail passing the CPL to userspace and back.
2599          */
2600         if (seg == VCPU_SREG_SS)
2601                 /* This is symmetric with svm_get_segment() */
2602                 svm->vmcb->save.cpl = (var->dpl & 3);
2603
2604         mark_dirty(svm->vmcb, VMCB_SEG);
2605 }
2606
2607 static void update_bp_intercept(struct kvm_vcpu *vcpu)
2608 {
2609         struct vcpu_svm *svm = to_svm(vcpu);
2610
2611         clr_exception_intercept(svm, BP_VECTOR);
2612
2613         if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
2614                 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2615                         set_exception_intercept(svm, BP_VECTOR);
2616         } else
2617                 vcpu->guest_debug = 0;
2618 }
2619
2620 static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
2621 {
2622         if (sd->next_asid > sd->max_asid) {
2623                 ++sd->asid_generation;
2624                 sd->next_asid = sd->min_asid;
2625                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
2626         }
2627
2628         svm->asid_generation = sd->asid_generation;
2629         svm->vmcb->control.asid = sd->next_asid++;
2630
2631         mark_dirty(svm->vmcb, VMCB_ASID);
2632 }
2633
2634 static u64 svm_get_dr6(struct kvm_vcpu *vcpu)
2635 {
2636         return to_svm(vcpu)->vmcb->save.dr6;
2637 }
2638
2639 static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value)
2640 {
2641         struct vcpu_svm *svm = to_svm(vcpu);
2642
2643         svm->vmcb->save.dr6 = value;
2644         mark_dirty(svm->vmcb, VMCB_DR);
2645 }
2646
2647 static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
2648 {
2649         struct vcpu_svm *svm = to_svm(vcpu);
2650
2651         get_debugreg(vcpu->arch.db[0], 0);
2652         get_debugreg(vcpu->arch.db[1], 1);
2653         get_debugreg(vcpu->arch.db[2], 2);
2654         get_debugreg(vcpu->arch.db[3], 3);
2655         vcpu->arch.dr6 = svm_get_dr6(vcpu);
2656         vcpu->arch.dr7 = svm->vmcb->save.dr7;
2657
2658         vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
2659         set_dr_intercepts(svm);
2660 }
2661
2662 static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
2663 {
2664         struct vcpu_svm *svm = to_svm(vcpu);
2665
2666         svm->vmcb->save.dr7 = value;
2667         mark_dirty(svm->vmcb, VMCB_DR);
2668 }
2669
2670 static int pf_interception(struct vcpu_svm *svm)
2671 {
2672         u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
2673         u64 error_code = svm->vmcb->control.exit_info_1;
2674
2675         return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
2676                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2677                         svm->vmcb->control.insn_bytes : NULL,
2678                         svm->vmcb->control.insn_len);
2679 }
2680
2681 static int npf_interception(struct vcpu_svm *svm)
2682 {
2683         u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2);
2684         u64 error_code = svm->vmcb->control.exit_info_1;
2685
2686         trace_kvm_page_fault(fault_address, error_code);
2687         return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
2688                         static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
2689                         svm->vmcb->control.insn_bytes : NULL,
2690                         svm->vmcb->control.insn_len);
2691 }
2692
2693 static int db_interception(struct vcpu_svm *svm)
2694 {
2695         struct kvm_run *kvm_run = svm->vcpu.run;
2696         struct kvm_vcpu *vcpu = &svm->vcpu;
2697
2698         if (!(svm->vcpu.guest_debug &
2699               (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
2700                 !svm->nmi_singlestep) {
2701                 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
2702                 return 1;
2703         }
2704
2705         if (svm->nmi_singlestep) {
2706                 disable_nmi_singlestep(svm);
2707                 /* Make sure we check for pending NMIs upon entry */
2708                 kvm_make_request(KVM_REQ_EVENT, vcpu);
2709         }
2710
2711         if (svm->vcpu.guest_debug &
2712             (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) {
2713                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2714                 kvm_run->debug.arch.pc =
2715                         svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2716                 kvm_run->debug.arch.exception = DB_VECTOR;
2717                 return 0;
2718         }
2719
2720         return 1;
2721 }
2722
2723 static int bp_interception(struct vcpu_svm *svm)
2724 {
2725         struct kvm_run *kvm_run = svm->vcpu.run;
2726
2727         kvm_run->exit_reason = KVM_EXIT_DEBUG;
2728         kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
2729         kvm_run->debug.arch.exception = BP_VECTOR;
2730         return 0;
2731 }
2732
2733 static int ud_interception(struct vcpu_svm *svm)
2734 {
2735         return handle_ud(&svm->vcpu);
2736 }
2737
2738 static int ac_interception(struct vcpu_svm *svm)
2739 {
2740         kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0);
2741         return 1;
2742 }
2743
2744 static int gp_interception(struct vcpu_svm *svm)
2745 {
2746         struct kvm_vcpu *vcpu = &svm->vcpu;
2747         u32 error_code = svm->vmcb->control.exit_info_1;
2748         int er;
2749
2750         WARN_ON_ONCE(!enable_vmware_backdoor);
2751
2752         er = kvm_emulate_instruction(vcpu,
2753                 EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
2754         if (er == EMULATE_USER_EXIT)
2755                 return 0;
2756         else if (er != EMULATE_DONE)
2757                 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
2758         return 1;
2759 }
2760
2761 static bool is_erratum_383(void)
2762 {
2763         int err, i;
2764         u64 value;
2765
2766         if (!erratum_383_found)
2767                 return false;
2768
2769         value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2770         if (err)
2771                 return false;
2772
2773         /* Bit 62 may or may not be set for this mce */
2774         value &= ~(1ULL << 62);
2775
2776         if (value != 0xb600000000010015ULL)
2777                 return false;
2778
2779         /* Clear MCi_STATUS registers */
2780         for (i = 0; i < 6; ++i)
2781                 native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2782
2783         value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2784         if (!err) {
2785                 u32 low, high;
2786
2787                 value &= ~(1ULL << 2);
2788                 low    = lower_32_bits(value);
2789                 high   = upper_32_bits(value);
2790
2791                 native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high);
2792         }
2793
2794         /* Flush tlb to evict multi-match entries */
2795         __flush_tlb_all();
2796
2797         return true;
2798 }
2799
2800 static void svm_handle_mce(struct vcpu_svm *svm)
2801 {
2802         if (is_erratum_383()) {
2803                 /*
2804                  * Erratum 383 triggered. Guest state is corrupt so kill the
2805                  * guest.
2806                  */
2807                 pr_err("KVM: Guest triggered AMD Erratum 383\n");
2808
2809                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
2810
2811                 return;
2812         }
2813
2814         /*
2815          * On an #MC intercept the MCE handler is not called automatically in
2816          * the host. So do it by hand here.
2817          */
2818         asm volatile (
2819                 "int $0x12\n");
2820         /* not sure if we ever come back to this point */
2821
2822         return;
2823 }
2824
/*
 * #MC intercept handler: performs no work of its own and returns 1.
 * NOTE(review): the actual machine-check processing appears to live in
 * svm_handle_mce(), called from elsewhere - confirm against the caller.
 */
static int mc_interception(struct vcpu_svm *svm)
{
	return 1;
}
2829
2830 static int shutdown_interception(struct vcpu_svm *svm)
2831 {
2832         struct kvm_run *kvm_run = svm->vcpu.run;
2833
2834         /*
2835          * VMCB is undefined after a SHUTDOWN intercept
2836          * so reinitialize it.
2837          */
2838         clear_page(svm->vmcb);
2839         init_vmcb(svm);
2840
2841         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
2842         return 0;
2843 }
2844
2845 static int io_interception(struct vcpu_svm *svm)
2846 {
2847         struct kvm_vcpu *vcpu = &svm->vcpu;
2848         u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
2849         int size, in, string;
2850         unsigned port;
2851
2852         ++svm->vcpu.stat.io_exits;
2853         string = (io_info & SVM_IOIO_STR_MASK) != 0;
2854         in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
2855         if (string)
2856                 return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
2857
2858         port = io_info >> 16;
2859         size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
2860         svm->next_rip = svm->vmcb->control.exit_info_2;
2861
2862         return kvm_fast_pio(&svm->vcpu, size, port, in);
2863 }
2864
/* NMI intercept handler: nothing to do at this point, returns 1. */
static int nmi_interception(struct vcpu_svm *svm)
{
	return 1;
}
2869
2870 static int intr_interception(struct vcpu_svm *svm)
2871 {
2872         ++svm->vcpu.stat.irq_exits;
2873         return 1;
2874 }
2875
/* Generic "do nothing" intercept handler: ignores the exit and returns 1. */
static int nop_on_interception(struct vcpu_svm *svm)
{
	return 1;
}
2880
2881 static int halt_interception(struct vcpu_svm *svm)
2882 {
2883         svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
2884         return kvm_emulate_halt(&svm->vcpu);
2885 }
2886
2887 static int vmmcall_interception(struct vcpu_svm *svm)
2888 {
2889         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2890         return kvm_emulate_hypercall(&svm->vcpu);
2891 }
2892
2893 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
2894 {
2895         struct vcpu_svm *svm = to_svm(vcpu);
2896
2897         return svm->nested.nested_cr3;
2898 }
2899
2900 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
2901 {
2902         struct vcpu_svm *svm = to_svm(vcpu);
2903         u64 cr3 = svm->nested.nested_cr3;
2904         u64 pdpte;
2905         int ret;
2906
2907         ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
2908                                        offset_in_page(cr3) + index * 8, 8);
2909         if (ret)
2910                 return 0;
2911         return pdpte;
2912 }
2913
2914 static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
2915                                    unsigned long root)
2916 {
2917         struct vcpu_svm *svm = to_svm(vcpu);
2918
2919         svm->vmcb->control.nested_cr3 = __sme_set(root);
2920         mark_dirty(svm->vmcb, VMCB_NPT);
2921 }
2922
2923 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
2924                                        struct x86_exception *fault)
2925 {
2926         struct vcpu_svm *svm = to_svm(vcpu);
2927
2928         if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
2929                 /*
2930                  * TODO: track the cause of the nested page fault, and
2931                  * correctly fill in the high bits of exit_info_1.
2932                  */
2933                 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
2934                 svm->vmcb->control.exit_code_hi = 0;
2935                 svm->vmcb->control.exit_info_1 = (1ULL << 32);
2936                 svm->vmcb->control.exit_info_2 = fault->address;
2937         }
2938
2939         svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
2940         svm->vmcb->control.exit_info_1 |= fault->error_code;
2941
2942         /*
2943          * The present bit is always zero for page structure faults on real
2944          * hardware.
2945          */
2946         if (svm->vmcb->control.exit_info_1 & (2ULL << 32))
2947                 svm->vmcb->control.exit_info_1 &= ~1;
2948
2949         nested_svm_vmexit(svm);
2950 }
2951
2952 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
2953 {
2954         WARN_ON(mmu_is_nested(vcpu));
2955
2956         vcpu->arch.mmu = &vcpu->arch.guest_mmu;
2957         kvm_init_shadow_mmu(vcpu);
2958         vcpu->arch.mmu->set_cr3           = nested_svm_set_tdp_cr3;
2959         vcpu->arch.mmu->get_cr3           = nested_svm_get_tdp_cr3;
2960         vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
2961         vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
2962         vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
2963         reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
2964         vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
2965 }
2966
2967 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
2968 {
2969         vcpu->arch.mmu = &vcpu->arch.root_mmu;
2970         vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
2971 }
2972
2973 static int nested_svm_check_permissions(struct vcpu_svm *svm)
2974 {
2975         if (!(svm->vcpu.arch.efer & EFER_SVME) ||
2976             !is_paging(&svm->vcpu)) {
2977                 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2978                 return 1;
2979         }
2980
2981         if (svm->vmcb->save.cpl) {
2982                 kvm_inject_gp(&svm->vcpu, 0);
2983                 return 1;
2984         }
2985
2986         return 0;
2987 }
2988
2989 static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
2990                                       bool has_error_code, u32 error_code)
2991 {
2992         int vmexit;
2993
2994         if (!is_guest_mode(&svm->vcpu))
2995                 return 0;
2996
2997         vmexit = nested_svm_intercept(svm);
2998         if (vmexit != NESTED_EXIT_DONE)
2999                 return 0;
3000
3001         svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
3002         svm->vmcb->control.exit_code_hi = 0;
3003         svm->vmcb->control.exit_info_1 = error_code;
3004
3005         /*
3006          * EXITINFO2 is undefined for all exception intercepts other
3007          * than #PF.
3008          */
3009         if (svm->vcpu.arch.exception.nested_apf)
3010                 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
3011         else if (svm->vcpu.arch.exception.has_payload)
3012                 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
3013         else
3014                 svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
3015
3016         svm->nested.exit_required = true;
3017         return vmexit;
3018 }
3019
3020 /* This function returns true if it is save to enable the irq window */
3021 static inline bool nested_svm_intr(struct vcpu_svm *svm)
3022 {
3023         if (!is_guest_mode(&svm->vcpu))
3024                 return true;
3025
3026         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
3027                 return true;
3028
3029         if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
3030                 return false;
3031
3032         /*
3033          * if vmexit was already requested (by intercepted exception
3034          * for instance) do not overwrite it with "external interrupt"
3035          * vmexit.
3036          */
3037         if (svm->nested.exit_required)
3038                 return false;
3039
3040         svm->vmcb->control.exit_code   = SVM_EXIT_INTR;
3041         svm->vmcb->control.exit_info_1 = 0;
3042         svm->vmcb->control.exit_info_2 = 0;
3043
3044         if (svm->nested.intercept & 1ULL) {
3045                 /*
3046                  * The #vmexit can't be emulated here directly because this
3047                  * code path runs with irqs and preemption disabled. A
3048                  * #vmexit emulation might sleep. Only signal request for
3049                  * the #vmexit here.
3050                  */
3051                 svm->nested.exit_required = true;
3052                 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
3053                 return false;
3054         }
3055
3056         return true;
3057 }
3058
3059 /* This function returns true if it is save to enable the nmi window */
3060 static inline bool nested_svm_nmi(struct vcpu_svm *svm)
3061 {
3062         if (!is_guest_mode(&svm->vcpu))
3063                 return true;
3064
3065         if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
3066                 return true;
3067
3068         svm->vmcb->control.exit_code = SVM_EXIT_NMI;
3069         svm->nested.exit_required = true;
3070
3071         return false;
3072 }
3073
3074 static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
3075 {
3076         struct page *page;
3077
3078         might_sleep();
3079
3080         page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
3081         if (is_error_page(page))
3082                 goto error;
3083
3084         *_page = page;
3085
3086         return kmap(page);
3087
3088 error:
3089         kvm_inject_gp(&svm->vcpu, 0);
3090
3091         return NULL;
3092 }
3093
/*
 * Undo nested_svm_map(): drop the kernel mapping of @page and release the
 * page via kvm_release_page_dirty().
 */
static void nested_svm_unmap(struct page *page)
{
	kunmap(page);
	kvm_release_page_dirty(page);
}
3099
3100 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
3101 {
3102         unsigned port, size, iopm_len;
3103         u16 val, mask;
3104         u8 start_bit;
3105         u64 gpa;
3106
3107         if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT)))
3108                 return NESTED_EXIT_HOST;
3109
3110         port = svm->vmcb->control.exit_info_1 >> 16;
3111         size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
3112                 SVM_IOIO_SIZE_SHIFT;
3113         gpa  = svm->nested.vmcb_iopm + (port / 8);
3114         start_bit = port % 8;
3115         iopm_len = (start_bit + size > 8) ? 2 : 1;
3116         mask = (0xf >> (4 - size)) << start_bit;
3117         val = 0;
3118
3119         if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
3120                 return NESTED_EXIT_DONE;
3121
3122         return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
3123 }
3124
3125 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
3126 {
3127         u32 offset, msr, value;
3128         int write, mask;
3129
3130         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
3131                 return NESTED_EXIT_HOST;
3132
3133         msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
3134         offset = svm_msrpm_offset(msr);
3135         write  = svm->vmcb->control.exit_info_1 & 1;
3136         mask   = 1 << ((2 * (msr & 0xf)) + write);
3137
3138         if (offset == MSR_INVALID)
3139                 return NESTED_EXIT_DONE;
3140
3141         /* Offset is in 32 bit units but need in 8 bit units */
3142         offset *= 4;
3143
3144         if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.vmcb_msrpm + offset, &value, 4))
3145                 return NESTED_EXIT_DONE;
3146
3147         return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
3148 }
3149
3150 /* DB exceptions for our internal use must not cause vmexit */
3151 static int nested_svm_intercept_db(struct vcpu_svm *svm)
3152 {
3153         unsigned long dr6;
3154
3155         /* if we're not singlestepping, it's not ours */
3156         if (!svm->nmi_singlestep)
3157                 return NESTED_EXIT_DONE;
3158
3159         /* if it's not a singlestep exception, it's not ours */
3160         if (kvm_get_dr(&svm->vcpu, 6, &dr6))
3161                 return NESTED_EXIT_DONE;
3162         if (!(dr6 & DR6_BS))
3163                 return NESTED_EXIT_DONE;
3164
3165         /* if the guest is singlestepping, it should get the vmexit */
3166         if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) {
3167                 disable_nmi_singlestep(svm);
3168                 return NESTED_EXIT_DONE;
3169         }
3170
3171         /* it's ours, the nested hypervisor must not see this one */
3172         return NESTED_EXIT_HOST;
3173 }
3174
3175 static int nested_svm_exit_special(struct vcpu_svm *svm)
3176 {
3177         u32 exit_code = svm->vmcb->control.exit_code;
3178
3179         switch (exit_code) {
3180         case SVM_EXIT_INTR:
3181         case SVM_EXIT_NMI:
3182         case SVM_EXIT_EXCP_BASE + MC_VECTOR:
3183                 return NESTED_EXIT_HOST;
3184         case SVM_EXIT_NPF:
3185                 /* For now we are always handling NPFs when using them */
3186                 if (npt_enabled)
3187                         return NESTED_EXIT_HOST;
3188                 break;
3189         case SVM_EXIT_EXCP_BASE + PF_VECTOR:
3190                 /* When we're shadowing, trap PFs, but not async PF */
3191                 if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
3192                         return NESTED_EXIT_HOST;
3193                 break;
3194         default:
3195                 break;
3196         }
3197
3198         return NESTED_EXIT_CONTINUE;
3199 }
3200
3201 /*
3202  * If this function returns true, this #vmexit was already handled
3203  */
3204 static int nested_svm_intercept(struct vcpu_svm *svm)
3205 {
3206         u32 exit_code = svm->vmcb->control.exit_code;
3207         int vmexit = NESTED_EXIT_HOST;
3208
3209         switch (exit_code) {
3210         case SVM_EXIT_MSR:
3211                 vmexit = nested_svm_exit_handled_msr(svm);
3212                 break;
3213         case SVM_EXIT_IOIO:
3214                 vmexit = nested_svm_intercept_ioio(svm);
3215                 break;
3216         case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
3217                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
3218                 if (svm->nested.intercept_cr & bit)
3219                         vmexit = NESTED_EXIT_DONE;
3220                 break;
3221         }
3222         case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
3223                 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
3224                 if (svm->nested.intercept_dr & bit)
3225                         vmexit = NESTED_EXIT_DONE;
3226                 break;
3227         }
3228         case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
3229                 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
3230                 if (svm->nested.intercept_exceptions & excp_bits) {
3231                         if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR)
3232                                 vmexit = nested_svm_intercept_db(svm);
3233                         else
3234                                 vmexit = NESTED_EXIT_DONE;
3235                 }
3236                 /* async page fault always cause vmexit */
3237                 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
3238                          svm->vcpu.arch.exception.nested_apf != 0)
3239                         vmexit = NESTED_EXIT_DONE;
3240                 break;
3241         }
3242         case SVM_EXIT_ERR: {
3243                 vmexit = NESTED_EXIT_DONE;
3244                 break;
3245         }
3246         default: {
3247                 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
3248                 if (svm->nested.intercept & exit_bits)
3249                         vmexit = NESTED_EXIT_DONE;
3250         }
3251         }
3252
3253         return vmexit;
3254 }
3255
3256 static int nested_svm_exit_handled(struct vcpu_svm *svm)
3257 {
3258         int vmexit;
3259
3260         vmexit = nested_svm_intercept(svm);
3261
3262         if (vmexit == NESTED_EXIT_DONE)
3263                 nested_svm_vmexit(svm);
3264
3265         return vmexit;
3266 }
3267
3268 static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *from_vmcb)
3269 {
3270         struct vmcb_control_area *dst  = &dst_vmcb->control;
3271         struct vmcb_control_area *from = &from_vmcb->control;
3272
3273         dst->intercept_cr         = from->intercept_cr;
3274         dst->intercept_dr         = from->intercept_dr;
3275         dst->intercept_exceptions = from->intercept_exceptions;
3276         dst->intercept            = from->intercept;
3277         dst->iopm_base_pa         = from->iopm_base_pa;
3278         dst->msrpm_base_pa        = from->msrpm_base_pa;
3279         dst->tsc_offset           = from->tsc_offset;
3280         dst->asid                 = from->asid;
3281         dst->tlb_ctl              = from->tlb_ctl;
3282         dst->int_ctl              = from->int_ctl;
3283         dst->int_vector           = from->int_vector;
3284         dst->int_state            = from->int_state;
3285         dst->exit_code            = from->exit_code;
3286         dst->exit_code_hi         = from->exit_code_hi;
3287         dst->exit_info_1          = from->exit_info_1;
3288         dst->exit_info_2          = from->exit_info_2;
3289         dst->exit_int_info        = from->exit_int_info;
3290         dst->exit_int_info_err    = from->exit_int_info_err;
3291         dst->nested_ctl           = from->nested_ctl;
3292         dst->event_inj            = from->event_inj;
3293         dst->event_inj_err        = from->event_inj_err;
3294         dst->nested_cr3           = from->nested_cr3;
3295         dst->virt_ext              = from->virt_ext;
3296         dst->pause_filter_count   = from->pause_filter_count;
3297         dst->pause_filter_thresh  = from->pause_filter_thresh;
3298 }
3299
3300 static int nested_svm_vmexit(struct vcpu_svm *svm)
3301 {
3302         struct vmcb *nested_vmcb;
3303         struct vmcb *hsave = svm->nested.hsave;
3304         struct vmcb *vmcb = svm->vmcb;
3305         struct page *page;
3306
3307         trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
3308                                        vmcb->control.exit_info_1,
3309                                        vmcb->control.exit_info_2,
3310                                        vmcb->control.exit_int_info,
3311                                        vmcb->control.exit_int_info_err,
3312                                        KVM_ISA_SVM);
3313
3314         nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
3315         if (!nested_vmcb)
3316                 return 1;
3317
3318         /* Exit Guest-Mode */
3319         leave_guest_mode(&svm->vcpu);
3320         svm->nested.vmcb = 0;
3321
3322         /* Give the current vmcb to the guest */
3323         disable_gif(svm);
3324
3325         nested_vmcb->save.es     = vmcb->save.es;
3326         nested_vmcb->save.cs     = vmcb->save.cs;
3327         nested_vmcb->save.ss     = vmcb->save.ss;
3328         nested_vmcb->save.ds     = vmcb->save.ds;
3329         nested_vmcb->save.gdtr   = vmcb->save.gdtr;
3330         nested_vmcb->save.idtr   = vmcb->save.idtr;
3331         nested_vmcb->save.efer   = svm->vcpu.arch.efer;
3332         nested_vmcb->save.cr0    = kvm_read_cr0(&svm->vcpu);
3333         nested_vmcb->save.cr3    = kvm_read_cr3(&svm->vcpu);
3334         nested_vmcb->save.cr2    = vmcb->save.cr2;
3335         nested_vmcb->save.cr4    = svm->vcpu.arch.cr4;
3336         nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
3337         nested_vmcb->save.rip    = vmcb->save.rip;
3338         nested_vmcb->save.rsp    = vmcb->save.rsp;
3339         nested_vmcb->save.rax    = vmcb->save.rax;
3340         nested_vmcb->save.dr7    = vmcb->save.dr7;
3341         nested_vmcb->save.dr6    = vmcb->save.dr6;
3342         nested_vmcb->save.cpl    = vmcb->save.cpl;
3343
3344         nested_vmcb->control.int_ctl           = vmcb->control.int_ctl;
3345         nested_vmcb->control.int_vector        = vmcb->control.int_vector;
3346         nested_vmcb->control.int_state         = vmcb->control.int_state;
3347         nested_vmcb->control.exit_code         = vmcb->control.exit_code;
3348         nested_vmcb->control.exit_code_hi      = vmcb->control.exit_code_hi;
3349         nested_vmcb->control.exit_info_1       = vmcb->control.exit_info_1;
3350         nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
3351         nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
3352         nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
3353
3354         if (svm->nrips_enabled)
3355                 nested_vmcb->control.next_rip  = vmcb->control.next_rip;
3356
3357         /*
3358          * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
3359          * to make sure that we do not lose injected events. So check event_inj
3360          * here and copy it to exit_int_info if it is valid.
3361          * Exit_int_info and event_inj can't be both valid because the case
3362          * below only happens on a VMRUN instruction intercept which has
3363          * no valid exit_int_info set.
3364          */
3365         if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
3366                 struct vmcb_control_area *nc = &nested_vmcb->control;
3367
3368                 nc->exit_int_info     = vmcb->control.event_inj;
3369                 nc->exit_int_info_err = vmcb->control.event_inj_err;
3370         }
3371
3372         nested_vmcb->control.tlb_ctl           = 0;
3373         nested_vmcb->control.event_inj         = 0;
3374         nested_vmcb->control.event_inj_err     = 0;
3375
3376         nested_vmcb->control.pause_filter_count =
3377                 svm->vmcb->control.pause_filter_count;
3378         nested_vmcb->control.pause_filter_thresh =
3379                 svm->vmcb->control.pause_filter_thresh;
3380
3381         /* We always set V_INTR_MASKING and remember the old value in hflags */
3382         if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
3383                 nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
3384
3385         /* Restore the original control entries */
3386         copy_vmcb_control_area(vmcb, hsave);
3387
3388         svm->vcpu.arch.tsc_offset = svm->vmcb->control.tsc_offset;
3389         kvm_clear_exception_queue(&svm->vcpu);
3390         kvm_clear_interrupt_queue(&svm->vcpu);
3391
3392         svm->nested.nested_cr3 = 0;
3393
3394         /* Restore selected save entries */
3395         svm->vmcb->save.es = hsave->save.es;
3396         svm->vmcb->save.cs = hsave->save.cs;
3397         svm->vmcb->save.ss = hsave->save.ss;
3398         svm->vmcb->save.ds = hsave->save.ds;
3399         svm->vmcb->save.gdtr = hsave->save.gdtr;
3400         svm->vmcb->save.idtr = hsave->save.idtr;
3401         kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
3402         svm_set_efer(&svm->vcpu, hsave->save.efer);
3403         svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
3404         svm_set_cr4(&svm->vcpu, hsave->save.cr4);
3405         if (npt_enabled) {
3406                 svm->vmcb->save.cr3 = hsave->save.cr3;
3407                 svm->vcpu.arch.cr3 = hsave->save.cr3;
3408         } else {
3409                 (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
3410         }
3411         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
3412         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
3413         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
3414         svm->vmcb->save.dr7 = 0;
3415         svm->vmcb->save.cpl = 0;
3416         svm->vmcb->control.exit_int_info = 0;
3417
3418         mark_all_dirty(svm->vmcb);
3419
3420         nested_svm_unmap(page);
3421
3422         nested_svm_uninit_mmu_context(&svm->vcpu);
3423         kvm_mmu_reset_context(&svm->vcpu);
3424         kvm_mmu_load(&svm->vcpu);
3425
3426         /*
3427          * Drop what we picked up for L2 via svm_complete_interrupts() so it
3428          * doesn't end up in L1.
3429          */
3430         svm->vcpu.arch.nmi_injected = false;
3431         kvm_clear_exception_queue(&svm->vcpu);
3432         kvm_clear_interrupt_queue(&svm->vcpu);
3433
3434         return 0;
3435 }
3436
3437 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
3438 {
3439         /*
3440          * This function merges the msr permission bitmaps of kvm and the
3441          * nested vmcb. It is optimized in that it only merges the parts where
3442          * the kvm msr permission bitmap may contain zero bits
3443          */
3444         int i;
3445
3446         if (!(svm->nested.intercept & (1ULL << INTERCEPT_MSR_PROT)))
3447                 return true;
3448
3449         for (i = 0; i < MSRPM_OFFSETS; i++) {
3450                 u32 value, p;
3451                 u64 offset;
3452
3453                 if (msrpm_offsets[i] == 0xffffffff)
3454                         break;
3455
3456                 p      = msrpm_offsets[i];
3457                 offset = svm->nested.vmcb_msrpm + (p * 4);
3458
3459                 if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
3460                         return false;
3461
3462                 svm->nested.msrpm[p] = svm->msrpm[p] | value;
3463         }
3464
3465         svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
3466
3467         return true;
3468 }
3469
3470 static bool nested_vmcb_checks(struct vmcb *vmcb)
3471 {
3472         if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
3473                 return false;
3474
3475         if (vmcb->control.asid == 0)
3476                 return false;
3477
3478         if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) &&
3479             !npt_enabled)
3480                 return false;
3481
3482         return true;
3483 }
3484
3485 static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
3486                                  struct vmcb *nested_vmcb, struct page *page)
3487 {
3488         if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
3489                 svm->vcpu.arch.hflags |= HF_HIF_MASK;
3490         else
3491                 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
3492
3493         if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
3494                 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
3495                 nested_svm_init_mmu_context(&svm->vcpu);
3496         }
3497
3498         /* Load the nested guest state */
3499         svm->vmcb->save.es = nested_vmcb->save.es;
3500         svm->vmcb->save.cs = nested_vmcb->save.cs;
3501         svm->vmcb->save.ss = nested_vmcb->save.ss;
3502         svm->vmcb->save.ds = nested_vmcb->save.ds;
3503         svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
3504         svm->vmcb->save.idtr = nested_vmcb->save.idtr;
3505         kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
3506         svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
3507         svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
3508         svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
3509         if (npt_enabled) {
3510                 svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
3511                 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
3512         } else
3513                 (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
3514
3515         /* Guest paging mode is active - reset mmu */
3516         kvm_mmu_reset_context(&svm->vcpu);
3517
3518         svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
3519         kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
3520         kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
3521         kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
3522
3523         /* In case we don't even reach vcpu_run, the fields are not updated */
3524         svm->vmcb->save.rax = nested_vmcb->save.rax;
3525         svm->vmcb->save.rsp = nested_vmcb->save.rsp;
3526         svm->vmcb->save.rip = nested_vmcb->save.rip;
3527         svm->vmcb->save.dr7 = nested_vmcb->save.dr7;
3528         svm->vmcb->save.dr6 = nested_vmcb->save.dr6;
3529         svm->vmcb->save.cpl = nested_vmcb->save.cpl;
3530
3531         svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL;
3532         svm->nested.vmcb_iopm  = nested_vmcb->control.iopm_base_pa  & ~0x0fffULL;
3533
3534         /* cache intercepts */
3535         svm->nested.intercept_cr         = nested_vmcb->control.intercept_cr;
3536         svm->nested.intercept_dr         = nested_vmcb->control.intercept_dr;
3537         svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
3538         svm->nested.intercept            = nested_vmcb->control.intercept;
3539
3540         svm_flush_tlb(&svm->vcpu, true);
3541         svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
3542         if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
3543                 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
3544         else
3545                 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
3546
3547         if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
3548                 /* We only want the cr8 intercept bits of the guest */
3549                 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
3550                 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
3551         }
3552
3553         /* We don't want to see VMMCALLs from a nested guest */
3554         clr_intercept(svm, INTERCEPT_VMMCALL);
3555
3556         svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset;
3557         svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
3558
3559         svm->vmcb->control.virt_ext = nested_vmcb->control.virt_ext;
3560         svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
3561         svm->vmcb->control.int_state = nested_vmcb->control.int_state;
3562         svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
3563         svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
3564
3565         svm->vmcb->control.pause_filter_count =
3566                 nested_vmcb->control.pause_filter_count;
3567         svm->vmcb->control.pause_filter_thresh =
3568                 nested_vmcb->control.pause_filter_thresh;
3569
3570         nested_svm_unmap(page);
3571
3572         /* Enter Guest-Mode */
3573         enter_guest_mode(&svm->vcpu);
3574
3575         /*
3576          * Merge guest and host intercepts - must be called  with vcpu in
3577          * guest-mode to take affect here
3578          */
3579         recalc_intercepts(svm);
3580
3581         svm->nested.vmcb = vmcb_gpa;
3582
3583         enable_gif(svm);
3584
3585         mark_all_dirty(svm->vmcb);
3586 }
3587
3588 static bool nested_svm_vmrun(struct vcpu_svm *svm)
3589 {
3590         struct vmcb *nested_vmcb;
3591         struct vmcb *hsave = svm->nested.hsave;
3592         struct vmcb *vmcb = svm->vmcb;
3593         struct page *page;
3594         u64 vmcb_gpa;
3595
3596         vmcb_gpa = svm->vmcb->save.rax;
3597
3598         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3599         if (!nested_vmcb)
3600                 return false;
3601
3602         if (!nested_vmcb_checks(nested_vmcb)) {
3603                 nested_vmcb->control.exit_code    = SVM_EXIT_ERR;
3604                 nested_vmcb->control.exit_code_hi = 0;
3605                 nested_vmcb->control.exit_info_1  = 0;
3606                 nested_vmcb->control.exit_info_2  = 0;
3607
3608                 nested_svm_unmap(page);
3609
3610                 return false;
3611         }
3612
3613         trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
3614                                nested_vmcb->save.rip,
3615                                nested_vmcb->control.int_ctl,
3616                                nested_vmcb->control.event_inj,
3617                                nested_vmcb->control.nested_ctl);
3618
3619         trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
3620                                     nested_vmcb->control.intercept_cr >> 16,
3621                                     nested_vmcb->control.intercept_exceptions,
3622                                     nested_vmcb->control.intercept);
3623
3624         /* Clear internal status */
3625         kvm_clear_exception_queue(&svm->vcpu);
3626         kvm_clear_interrupt_queue(&svm->vcpu);
3627
3628         /*
3629          * Save the old vmcb, so we don't need to pick what we save, but can
3630          * restore everything when a VMEXIT occurs
3631          */
3632         hsave->save.es     = vmcb->save.es;
3633         hsave->save.cs     = vmcb->save.cs;
3634         hsave->save.ss     = vmcb->save.ss;
3635         hsave->save.ds     = vmcb->save.ds;
3636         hsave->save.gdtr   = vmcb->save.gdtr;
3637         hsave->save.idtr   = vmcb->save.idtr;
3638         hsave->save.efer   = svm->vcpu.arch.efer;
3639         hsave->save.cr0    = kvm_read_cr0(&svm->vcpu);
3640         hsave->save.cr4    = svm->vcpu.arch.cr4;
3641         hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
3642         hsave->save.rip    = kvm_rip_read(&svm->vcpu);
3643         hsave->save.rsp    = vmcb->save.rsp;
3644         hsave->save.rax    = vmcb->save.rax;
3645         if (npt_enabled)
3646                 hsave->save.cr3    = vmcb->save.cr3;
3647         else
3648                 hsave->save.cr3    = kvm_read_cr3(&svm->vcpu);
3649
3650         copy_vmcb_control_area(hsave, vmcb);
3651
3652         enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
3653
3654         return true;
3655 }
3656
3657 static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
3658 {
3659         to_vmcb->save.fs = from_vmcb->save.fs;
3660         to_vmcb->save.gs = from_vmcb->save.gs;
3661         to_vmcb->save.tr = from_vmcb->save.tr;
3662         to_vmcb->save.ldtr = from_vmcb->save.ldtr;
3663         to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
3664         to_vmcb->save.star = from_vmcb->save.star;
3665         to_vmcb->save.lstar = from_vmcb->save.lstar;
3666         to_vmcb->save.cstar = from_vmcb->save.cstar;
3667         to_vmcb->save.sfmask = from_vmcb->save.sfmask;
3668         to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
3669         to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
3670         to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
3671 }
3672
3673 static int vmload_interception(struct vcpu_svm *svm)
3674 {
3675         struct vmcb *nested_vmcb;
3676         struct page *page;
3677         int ret;
3678
3679         if (nested_svm_check_permissions(svm))
3680                 return 1;
3681
3682         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3683         if (!nested_vmcb)
3684                 return 1;
3685
3686         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3687         ret = kvm_skip_emulated_instruction(&svm->vcpu);
3688
3689         nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
3690         nested_svm_unmap(page);
3691
3692         return ret;
3693 }
3694
3695 static int vmsave_interception(struct vcpu_svm *svm)
3696 {
3697         struct vmcb *nested_vmcb;
3698         struct page *page;
3699         int ret;
3700
3701         if (nested_svm_check_permissions(svm))
3702                 return 1;
3703
3704         nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
3705         if (!nested_vmcb)
3706                 return 1;
3707
3708         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3709         ret = kvm_skip_emulated_instruction(&svm->vcpu);
3710
3711         nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
3712         nested_svm_unmap(page);
3713
3714         return ret;
3715 }
3716
3717 static int vmrun_interception(struct vcpu_svm *svm)
3718 {
3719         if (nested_svm_check_permissions(svm))
3720                 return 1;
3721
3722         /* Save rip after vmrun instruction */
3723         kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
3724
3725         if (!nested_svm_vmrun(svm))
3726                 return 1;
3727
3728         if (!nested_svm_vmrun_msrpm(svm))
3729                 goto failed;
3730
3731         return 1;
3732
3733 failed:
3734
3735         svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
3736         svm->vmcb->control.exit_code_hi = 0;
3737         svm->vmcb->control.exit_info_1  = 0;
3738         svm->vmcb->control.exit_info_2  = 0;
3739
3740         nested_svm_vmexit(svm);
3741
3742         return 1;
3743 }
3744
3745 static int stgi_interception(struct vcpu_svm *svm)
3746 {
3747         int ret;
3748
3749         if (nested_svm_check_permissions(svm))
3750                 return 1;
3751
3752         /*
3753          * If VGIF is enabled, the STGI intercept is only added to
3754          * detect the opening of the SMI/NMI window; remove it now.
3755          */
3756         if (vgif_enabled(svm))
3757                 clr_intercept(svm, INTERCEPT_STGI);
3758
3759         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3760         ret = kvm_skip_emulated_instruction(&svm->vcpu);
3761         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3762
3763         enable_gif(svm);
3764
3765         return ret;
3766 }
3767
3768 static int clgi_interception(struct vcpu_svm *svm)
3769 {
3770         int ret;
3771
3772         if (nested_svm_check_permissions(svm))
3773                 return 1;
3774
3775         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3776         ret = kvm_skip_emulated_instruction(&svm->vcpu);
3777
3778         disable_gif(svm);
3779
3780         /* After a CLGI no interrupts should come */
3781         if (!kvm_vcpu_apicv_active(&svm->vcpu)) {
3782                 svm_clear_vintr(svm);
3783                 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3784                 mark_dirty(svm->vmcb, VMCB_INTR);
3785         }
3786
3787         return ret;
3788 }
3789
3790 static int invlpga_interception(struct vcpu_svm *svm)
3791 {
3792         struct kvm_vcpu *vcpu = &svm->vcpu;
3793
3794         trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
3795                           kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3796
3797         /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
3798         kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3799
3800         svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3801         return kvm_skip_emulated_instruction(&svm->vcpu);
3802 }
3803
3804 static int skinit_interception(struct vcpu_svm *svm)
3805 {
3806         trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
3807
3808         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
3809         return 1;
3810 }
3811
3812 static int wbinvd_interception(struct vcpu_svm *svm)
3813 {
3814         return kvm_emulate_wbinvd(&svm->vcpu);
3815 }
3816
3817 static int xsetbv_interception(struct vcpu_svm *svm)
3818 {
3819         u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
3820         u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
3821
3822         if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
3823                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
3824                 return kvm_skip_emulated_instruction(&svm->vcpu);
3825         }
3826
3827         return 1;
3828 }
3829
3830 static int task_switch_interception(struct vcpu_svm *svm)
3831 {
3832         u16 tss_selector;
3833         int reason;
3834         int int_type = svm->vmcb->control.exit_int_info &
3835                 SVM_EXITINTINFO_TYPE_MASK;
3836         int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
3837         uint32_t type =
3838                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
3839         uint32_t idt_v =
3840                 svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
3841         bool has_error_code = false;
3842         u32 error_code = 0;
3843
3844         tss_selector = (u16)svm->vmcb->control.exit_info_1;
3845
3846         if (svm->vmcb->control.exit_info_2 &
3847             (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
3848                 reason = TASK_SWITCH_IRET;
3849         else if (svm->vmcb->control.exit_info_2 &
3850                  (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
3851                 reason = TASK_SWITCH_JMP;
3852         else if (idt_v)
3853                 reason = TASK_SWITCH_GATE;
3854         else
3855                 reason = TASK_SWITCH_CALL;
3856
3857         if (reason == TASK_SWITCH_GATE) {
3858                 switch (type) {
3859                 case SVM_EXITINTINFO_TYPE_NMI:
3860                         svm->vcpu.arch.nmi_injected = false;
3861                         break;
3862                 case SVM_EXITINTINFO_TYPE_EXEPT:
3863                         if (svm->vmcb->control.exit_info_2 &
3864                             (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) {
3865                                 has_error_code = true;
3866                                 error_code =
3867                                         (u32)svm->vmcb->control.exit_info_2;
3868                         }
3869                         kvm_clear_exception_queue(&svm->vcpu);
3870                         break;
3871                 case SVM_EXITINTINFO_TYPE_INTR:
3872                         kvm_clear_interrupt_queue(&svm->vcpu);
3873                         break;
3874                 default:
3875                         break;
3876                 }
3877         }
3878
3879         if (reason != TASK_SWITCH_GATE ||
3880             int_type == SVM_EXITINTINFO_TYPE_SOFT ||
3881             (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
3882              (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
3883                 skip_emulated_instruction(&svm->vcpu);
3884
3885         if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
3886                 int_vec = -1;
3887
3888         if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
3889                                 has_error_code, error_code) == EMULATE_FAIL) {
3890                 svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3891                 svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3892                 svm->vcpu.run->internal.ndata = 0;
3893                 return 0;
3894         }
3895         return 1;
3896 }
3897
3898 static int cpuid_interception(struct vcpu_svm *svm)
3899 {
3900         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
3901         return kvm_emulate_cpuid(&svm->vcpu);
3902 }
3903
3904 static int iret_interception(struct vcpu_svm *svm)
3905 {
3906         ++svm->vcpu.stat.nmi_window_exits;
3907         clr_intercept(svm, INTERCEPT_IRET);
3908         svm->vcpu.arch.hflags |= HF_IRET_MASK;
3909         svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
3910         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3911         return 1;
3912 }
3913
3914 static int invlpg_interception(struct vcpu_svm *svm)
3915 {
3916         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3917                 return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3918
3919         kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
3920         return kvm_skip_emulated_instruction(&svm->vcpu);
3921 }
3922
3923 static int emulate_on_interception(struct vcpu_svm *svm)
3924 {
3925         return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
3926 }
3927
3928 static int rsm_interception(struct vcpu_svm *svm)
3929 {
3930         return kvm_emulate_instruction_from_buffer(&svm->vcpu,
3931                                         rsm_ins_bytes, 2) == EMULATE_DONE;
3932 }
3933
3934 static int rdpmc_interception(struct vcpu_svm *svm)
3935 {
3936         int err;
3937
3938         if (!static_cpu_has(X86_FEATURE_NRIPS))
3939                 return emulate_on_interception(svm);
3940
3941         err = kvm_rdpmc(&svm->vcpu);
3942         return kvm_complete_insn_gp(&svm->vcpu, err);
3943 }
3944
3945 static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
3946                                             unsigned long val)
3947 {
3948         unsigned long cr0 = svm->vcpu.arch.cr0;
3949         bool ret = false;
3950         u64 intercept;
3951
3952         intercept = svm->nested.intercept;
3953
3954         if (!is_guest_mode(&svm->vcpu) ||
3955             (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
3956                 return false;
3957
3958         cr0 &= ~SVM_CR0_SELECTIVE_MASK;
3959         val &= ~SVM_CR0_SELECTIVE_MASK;
3960
3961         if (cr0 ^ val) {
3962                 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
3963                 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
3964         }
3965
3966         return ret;
3967 }
3968
3969 #define CR_VALID (1ULL << 63)
3970
3971 static int cr_interception(struct vcpu_svm *svm)
3972 {
3973         int reg, cr;
3974         unsigned long val;
3975         int err;
3976
3977         if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
3978                 return emulate_on_interception(svm);
3979
3980         if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
3981                 return emulate_on_interception(svm);
3982
3983         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
3984         if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
3985                 cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
3986         else
3987                 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
3988
3989         err = 0;
3990         if (cr >= 16) { /* mov to cr */
3991                 cr -= 16;
3992                 val = kvm_register_read(&svm->vcpu, reg);
3993                 switch (cr) {
3994                 case 0:
3995                         if (!check_selective_cr0_intercepted(svm, val))
3996                                 err = kvm_set_cr0(&svm->vcpu, val);
3997                         else
3998                                 return 1;
3999
4000                         break;
4001                 case 3:
4002                         err = kvm_set_cr3(&svm->vcpu, val);
4003                         break;
4004                 case 4:
4005                         err = kvm_set_cr4(&svm->vcpu, val);
4006                         break;
4007                 case 8:
4008                         err = kvm_set_cr8(&svm->vcpu, val);
4009                         break;
4010                 default:
4011                         WARN(1, "unhandled write to CR%d", cr);
4012                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
4013                         return 1;
4014                 }
4015         } else { /* mov from cr */
4016                 switch (cr) {
4017                 case 0:
4018                         val = kvm_read_cr0(&svm->vcpu);
4019                         break;
4020                 case 2:
4021                         val = svm->vcpu.arch.cr2;
4022                         break;
4023                 case 3:
4024                         val = kvm_read_cr3(&svm->vcpu);
4025                         break;
4026                 case 4:
4027                         val = kvm_read_cr4(&svm->vcpu);
4028                         break;
4029                 case 8:
4030                         val = kvm_get_cr8(&svm->vcpu);
4031                         break;
4032                 default:
4033                         WARN(1, "unhandled read from CR%d", cr);
4034                         kvm_queue_exception(&svm->vcpu, UD_VECTOR);
4035                         return 1;
4036                 }
4037                 kvm_register_write(&svm->vcpu, reg, val);
4038         }
4039         return kvm_complete_insn_gp(&svm->vcpu, err);
4040 }
4041
4042 static int dr_interception(struct vcpu_svm *svm)
4043 {
4044         int reg, dr;
4045         unsigned long val;
4046
4047         if (svm->vcpu.guest_debug == 0) {
4048                 /*
4049                  * No more DR vmexits; force a reload of the debug registers
4050                  * and reenter on this instruction.  The next vmexit will
4051                  * retrieve the full state of the debug registers.
4052                  */
4053                 clr_dr_intercepts(svm);
4054                 svm->vcpu.arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT;
4055                 return 1;
4056         }
4057
4058         if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
4059                 return emulate_on_interception(svm);
4060
4061         reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
4062         dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
4063
4064         if (dr >= 16) { /* mov to DRn */
4065                 if (!kvm_require_dr(&svm->vcpu, dr - 16))
4066                         return 1;
4067                 val = kvm_register_read(&svm->vcpu, reg);
4068                 kvm_set_dr(&svm->vcpu, dr - 16, val);
4069         } else {
4070                 if (!kvm_require_dr(&svm->vcpu, dr))
4071                         return 1;
4072                 kvm_get_dr(&svm->vcpu, dr, &val);
4073                 kvm_register_write(&svm->vcpu, reg, val);
4074         }
4075
4076         return kvm_skip_emulated_instruction(&svm->vcpu);
4077 }
4078
4079 static int cr8_write_interception(struct vcpu_svm *svm)
4080 {
4081         struct kvm_run *kvm_run = svm->vcpu.run;
4082         int r;
4083
4084         u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
4085         /* instruction emulation calls kvm_set_cr8() */
4086         r = cr_interception(svm);
4087         if (lapic_in_kernel(&svm->vcpu))
4088                 return r;
4089         if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
4090                 return r;
4091         kvm_run->exit_reason = KVM_EXIT_SET_TPR;
4092         return 0;
4093 }
4094
4095 static int svm_get_msr_feature(struct kvm_msr_entry *msr)
4096 {
4097         msr->data = 0;
4098
4099         switch (msr->index) {
4100         case MSR_F10H_DECFG:
4101                 if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
4102                         msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
4103                 break;
4104         default:
4105                 return 1;
4106         }
4107
4108         return 0;
4109 }
4110
4111 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
4112 {
4113         struct vcpu_svm *svm = to_svm(vcpu);
4114
4115         switch (msr_info->index) {
4116         case MSR_STAR:
4117                 msr_info->data = svm->vmcb->save.star;
4118                 break;
4119 #ifdef CONFIG_X86_64
4120         case MSR_LSTAR:
4121                 msr_info->data = svm->vmcb->save.lstar;
4122                 break;
4123         case MSR_CSTAR:
4124                 msr_info->data = svm->vmcb->save.cstar;
4125                 break;
4126         case MSR_KERNEL_GS_BASE:
4127                 msr_info->data = svm->vmcb->save.kernel_gs_base;
4128                 break;
4129         case MSR_SYSCALL_MASK:
4130                 msr_info->data = svm->vmcb->save.sfmask;
4131                 break;
4132 #endif
4133         case MSR_IA32_SYSENTER_CS:
4134                 msr_info->data = svm->vmcb->save.sysenter_cs;
4135                 break;
4136         case MSR_IA32_SYSENTER_EIP:
4137                 msr_info->data = svm->sysenter_eip;
4138                 break;
4139         case MSR_IA32_SYSENTER_ESP:
4140                 msr_info->data = svm->sysenter_esp;
4141                 break;
4142         case MSR_TSC_AUX:
4143                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
4144                         return 1;
4145                 msr_info->data = svm->tsc_aux;
4146                 break;
4147         /*
4148          * Nobody will change the following 5 values in the VMCB so we can
4149          * safely return them on rdmsr. They will always be 0 until LBRV is
4150          * implemented.
4151          */
4152         case MSR_IA32_DEBUGCTLMSR:
4153                 msr_info->data = svm->vmcb->save.dbgctl;
4154                 break;
4155         case MSR_IA32_LASTBRANCHFROMIP:
4156                 msr_info->data = svm->vmcb->save.br_from;
4157                 break;
4158         case MSR_IA32_LASTBRANCHTOIP:
4159                 msr_info->data = svm->vmcb->save.br_to;
4160                 break;
4161         case MSR_IA32_LASTINTFROMIP:
4162                 msr_info->data = svm->vmcb->save.last_excp_from;
4163                 break;
4164         case MSR_IA32_LASTINTTOIP:
4165                 msr_info->data = svm->vmcb->save.last_excp_to;
4166                 break;
4167         case MSR_VM_HSAVE_PA:
4168                 msr_info->data = svm->nested.hsave_msr;
4169                 break;
4170         case MSR_VM_CR:
4171                 msr_info->data = svm->nested.vm_cr_msr;
4172                 break;
4173         case MSR_IA32_SPEC_CTRL:
4174                 if (!msr_info->host_initiated &&
4175                     !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
4176                     !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
4177                         return 1;
4178
4179                 msr_info->data = svm->spec_ctrl;
4180                 break;
4181         case MSR_AMD64_VIRT_SPEC_CTRL:
4182                 if (!msr_info->host_initiated &&
4183                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
4184                         return 1;
4185
4186                 msr_info->data = svm->virt_spec_ctrl;
4187                 break;
4188         case MSR_F15H_IC_CFG: {
4189
4190                 int family, model;
4191
4192                 family = guest_cpuid_family(vcpu);
4193                 model  = guest_cpuid_model(vcpu);
4194
4195                 if (family < 0 || model < 0)
4196                         return kvm_get_msr_common(vcpu, msr_info);
4197
4198                 msr_info->data = 0;
4199
4200                 if (family == 0x15 &&
4201                     (model >= 0x2 && model < 0x20))
4202                         msr_info->data = 0x1E;
4203                 }
4204                 break;
4205         case MSR_F10H_DECFG:
4206                 msr_info->data = svm->msr_decfg;
4207                 break;
4208         default:
4209                 return kvm_get_msr_common(vcpu, msr_info);
4210         }
4211         return 0;
4212 }
4213
4214 static int rdmsr_interception(struct vcpu_svm *svm)
4215 {
4216         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
4217         struct msr_data msr_info;
4218
4219         msr_info.index = ecx;
4220         msr_info.host_initiated = false;
4221         if (svm_get_msr(&svm->vcpu, &msr_info)) {
4222                 trace_kvm_msr_read_ex(ecx);
4223                 kvm_inject_gp(&svm->vcpu, 0);
4224                 return 1;
4225         } else {
4226                 trace_kvm_msr_read(ecx, msr_info.data);
4227
4228                 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
4229                                    msr_info.data & 0xffffffff);
4230                 kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
4231                                    msr_info.data >> 32);
4232                 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
4233                 return kvm_skip_emulated_instruction(&svm->vcpu);
4234         }
4235 }
4236
4237 static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
4238 {
4239         struct vcpu_svm *svm = to_svm(vcpu);
4240         int svm_dis, chg_mask;
4241
4242         if (data & ~SVM_VM_CR_VALID_MASK)
4243                 return 1;
4244
4245         chg_mask = SVM_VM_CR_VALID_MASK;
4246
4247         if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK)
4248                 chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK);
4249
4250         svm->nested.vm_cr_msr &= ~chg_mask;
4251         svm->nested.vm_cr_msr |= (data & chg_mask);
4252
4253         svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK;
4254
4255         /* check for svm_disable while efer.svme is set */
4256         if (svm_dis && (vcpu->arch.efer & EFER_SVME))
4257                 return 1;
4258
4259         return 0;
4260 }
4261
4262 static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
4263 {
4264         struct vcpu_svm *svm = to_svm(vcpu);
4265
4266         u32 ecx = msr->index;
4267         u64 data = msr->data;
4268         switch (ecx) {
4269         case MSR_IA32_CR_PAT:
4270                 if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
4271                         return 1;
4272                 vcpu->arch.pat = data;
4273                 svm->vmcb->save.g_pat = data;
4274                 mark_dirty(svm->vmcb, VMCB_NPT);
4275                 break;
4276         case MSR_IA32_SPEC_CTRL:
4277                 if (!msr->host_initiated &&
4278                     !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
4279                     !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
4280                         return 1;
4281
4282                 /* The STIBP bit doesn't fault even if it's not advertised */
4283                 if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
4284                         return 1;
4285
4286                 svm->spec_ctrl = data;
4287
4288                 if (!data)
4289                         break;
4290
4291                 /*
4292                  * For non-nested:
4293                  * When it's written (to non-zero) for the first time, pass
4294                  * it through.
4295                  *
4296                  * For nested:
4297                  * The handling of the MSR bitmap for L2 guests is done in
4298                  * nested_svm_vmrun_msrpm.
4299                  * We update the L1 MSR bit as well since it will end up
4300                  * touching the MSR anyway now.
4301                  */
4302                 set_msr_interception(svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
4303                 break;
4304         case MSR_IA32_PRED_CMD:
4305                 if (!msr->host_initiated &&
4306                     !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
4307                         return 1;
4308
4309                 if (data & ~PRED_CMD_IBPB)
4310                         return 1;
4311
4312                 if (!data)
4313                         break;
4314
4315                 wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
4316                 if (is_guest_mode(vcpu))
4317                         break;
4318                 set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
4319                 break;
4320         case MSR_AMD64_VIRT_SPEC_CTRL:
4321                 if (!msr->host_initiated &&
4322                     !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD))
4323                         return 1;
4324
4325                 if (data & ~SPEC_CTRL_SSBD)
4326                         return 1;
4327
4328                 svm->virt_spec_ctrl = data;
4329                 break;
4330         case MSR_STAR:
4331                 svm->vmcb->save.star = data;
4332                 break;
4333 #ifdef CONFIG_X86_64
4334         case MSR_LSTAR:
4335                 svm->vmcb->save.lstar = data;
4336                 break;
4337         case MSR_CSTAR:
4338                 svm->vmcb->save.cstar = data;
4339                 break;
4340         case MSR_KERNEL_GS_BASE:
4341                 svm->vmcb->save.kernel_gs_base = data;
4342                 break;
4343         case MSR_SYSCALL_MASK:
4344                 svm->vmcb->save.sfmask = data;
4345                 break;
4346 #endif
4347         case MSR_IA32_SYSENTER_CS:
4348                 svm->vmcb->save.sysenter_cs = data;
4349                 break;
4350         case MSR_IA32_SYSENTER_EIP:
4351                 svm->sysenter_eip = data;
4352                 svm->vmcb->save.sysenter_eip = data;
4353                 break;
4354         case MSR_IA32_SYSENTER_ESP:
4355                 svm->sysenter_esp = data;
4356                 svm->vmcb->save.sysenter_esp = data;
4357                 break;
4358         case MSR_TSC_AUX:
4359                 if (!boot_cpu_has(X86_FEATURE_RDTSCP))
4360                         return 1;
4361
4362                 /*
4363                  * This is rare, so we update the MSR here instead of using
4364                  * direct_access_msrs.  Doing that would require a rdmsr in
4365                  * svm_vcpu_put.
4366                  */
4367                 svm->tsc_aux = data;
4368                 wrmsrl(MSR_TSC_AUX, svm->tsc_aux);
4369                 break;
4370         case MSR_IA32_DEBUGCTLMSR:
4371                 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
4372                         vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
4373                                     __func__, data);
4374                         break;
4375                 }
4376                 if (data & DEBUGCTL_RESERVED_BITS)
4377                         return 1;
4378
4379                 svm->vmcb->save.dbgctl = data;
4380                 mark_dirty(svm->vmcb, VMCB_LBR);
4381                 if (data & (1ULL<<0))
4382                         svm_enable_lbrv(svm);
4383                 else
4384                         svm_disable_lbrv(svm);
4385                 break;
4386         case MSR_VM_HSAVE_PA:
4387                 svm->nested.hsave_msr = data;
4388                 break;
4389         case MSR_VM_CR:
4390                 return svm_set_vm_cr(vcpu, data);
4391         case MSR_VM_IGNNE:
4392                 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
4393                 break;
4394         case MSR_F10H_DECFG: {
4395                 struct kvm_msr_entry msr_entry;
4396
4397                 msr_entry.index = msr->index;
4398                 if (svm_get_msr_feature(&msr_entry))
4399                         return 1;
4400
4401                 /* Check the supported bits */
4402                 if (data & ~msr_entry.data)
4403                         return 1;
4404
4405                 /* Don't allow the guest to change a bit, #GP */
4406                 if (!msr->host_initiated && (data ^ msr_entry.data))
4407                         return 1;
4408
4409                 svm->msr_decfg = data;
4410                 break;
4411         }
4412         case MSR_IA32_APICBASE:
4413                 if (kvm_vcpu_apicv_active(vcpu))
4414                         avic_update_vapic_bar(to_svm(vcpu), data);
4415                 /* Fall through */
4416         default:
4417                 return kvm_set_msr_common(vcpu, msr);
4418         }
4419         return 0;
4420 }
4421
4422 static int wrmsr_interception(struct vcpu_svm *svm)
4423 {
4424         struct msr_data msr;
4425         u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
4426         u64 data = kvm_read_edx_eax(&svm->vcpu);
4427
4428         msr.data = data;
4429         msr.index = ecx;
4430         msr.host_initiated = false;
4431
4432         svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
4433         if (kvm_set_msr(&svm->vcpu, &msr)) {
4434                 trace_kvm_msr_write_ex(ecx, data);
4435                 kvm_inject_gp(&svm->vcpu, 0);
4436                 return 1;
4437         } else {
4438                 trace_kvm_msr_write(ecx, data);
4439                 return kvm_skip_emulated_instruction(&svm->vcpu);
4440         }
4441 }
4442
4443 static int msr_interception(struct vcpu_svm *svm)
4444 {
4445         if (svm->vmcb->control.exit_info_1)
4446                 return wrmsr_interception(svm);
4447         else
4448                 return rdmsr_interception(svm);
4449 }
4450
4451 static int interrupt_window_interception(struct vcpu_svm *svm)
4452 {
4453         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
4454         svm_clear_vintr(svm);
4455         svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
4456         mark_dirty(svm->vmcb, VMCB_INTR);
4457         ++svm->vcpu.stat.irq_window_exits;
4458         return 1;
4459 }
4460
4461 static int pause_interception(struct vcpu_svm *svm)
4462 {
4463         struct kvm_vcpu *vcpu = &svm->vcpu;
4464         bool in_kernel = (svm_get_cpl(vcpu) == 0);
4465
4466         if (pause_filter_thresh)
4467                 grow_ple_window(vcpu);
4468
4469         kvm_vcpu_on_spin(vcpu, in_kernel);
4470         return 1;
4471 }
4472
4473 static int nop_interception(struct vcpu_svm *svm)
4474 {
4475         return kvm_skip_emulated_instruction(&(svm->vcpu));
4476 }
4477
4478 static int monitor_interception(struct vcpu_svm *svm)
4479 {
4480         printk_once(KERN_WARNING "kvm: MONITOR instruction emulated as NOP!\n");
4481         return nop_interception(svm);
4482 }
4483
4484 static int mwait_interception(struct vcpu_svm *svm)
4485 {
4486         printk_once(KERN_WARNING "kvm: MWAIT instruction emulated as NOP!\n");
4487         return nop_interception(svm);
4488 }
4489
/*
 * Failure causes for an AVIC incomplete-IPI exit.  The handler in this file
 * compares these against the upper 32 bits of exit_info_2, so the numeric
 * values are spelled out explicitly.
 */
enum avic_ipi_failure_cause {
	AVIC_IPI_FAILURE_INVALID_INT_TYPE	= 0,
	AVIC_IPI_FAILURE_TARGET_NOT_RUNNING	= 1,
	AVIC_IPI_FAILURE_INVALID_TARGET		= 2,
	AVIC_IPI_FAILURE_INVALID_BACKING_PAGE	= 3,
};
4496
4497 static int avic_incomplete_ipi_interception(struct vcpu_svm *svm)
4498 {
4499         u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
4500         u32 icrl = svm->vmcb->control.exit_info_1;
4501         u32 id = svm->vmcb->control.exit_info_2 >> 32;
4502         u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
4503         struct kvm_lapic *apic = svm->vcpu.arch.apic;
4504
4505         trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index);
4506
4507         switch (id) {
4508         case AVIC_IPI_FAILURE_INVALID_INT_TYPE:
4509                 /*
4510                  * AVIC hardware handles the generation of
4511                  * IPIs when the specified Message Type is Fixed
4512                  * (also known as fixed delivery mode) and
4513                  * the Trigger Mode is edge-triggered. The hardware
4514                  * also supports self and broadcast delivery modes
4515                  * specified via the Destination Shorthand(DSH)
4516                  * field of the ICRL. Logical and physical APIC ID
4517                  * formats are supported. All other IPI types cause
4518                  * a #VMEXIT, which needs to emulated.
4519                  */
4520                 kvm_lapic_reg_write(apic, APIC_ICR2, icrh);
4521                 kvm_lapic_reg_write(apic, APIC_ICR, icrl);
4522                 break;
4523         case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: {
4524                 int i;
4525                 struct kvm_vcpu *vcpu;
4526                 struct kvm *kvm = svm->vcpu.kvm;
4527                 struct kvm_lapic *apic = svm->vcpu.arch.apic;
4528
4529                 /*
4530                  * At this point, we expect that the AVIC HW has already
4531                  * set the appropriate IRR bits on the valid target
4532                  * vcpus. So, we just need to kick the appropriate vcpu.
4533                  */
4534                 kvm_for_each_vcpu(i, vcpu, kvm) {
4535                         bool m = kvm_apic_match_dest(vcpu, apic,
4536                                                      icrl & KVM_APIC_SHORT_MASK,
4537                                                      GET_APIC_DEST_FIELD(icrh),
4538                                                      icrl & KVM_APIC_DEST_MASK);
4539
4540                         if (m && !avic_vcpu_is_running(vcpu))
4541                                 kvm_vcpu_wake_up(vcpu);
4542                 }
4543                 break;
4544         }
4545         case AVIC_IPI_FAILURE_INVALID_TARGET:
4546                 WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
4547                           index, svm->vcpu.vcpu_id, icrh, icrl);
4548                 break;
4549         case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
4550                 WARN_ONCE(1, "Invalid backing page\n");
4551                 break;
4552         default:
4553                 pr_err("Unknown IPI interception\n");
4554         }
4555
4556         return 1;
4557 }
4558
4559 static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
4560 {
4561         struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
4562         int index;
4563         u32 *logical_apic_id_table;
4564         int dlid = GET_APIC_LOGICAL_ID(ldr);
4565
4566         if (!dlid)
4567                 return NULL;
4568
4569         if (flat) { /* flat */
4570                 index = ffs(dlid) - 1;
4571                 if (index > 7)
4572                         return NULL;
4573         } else { /* cluster */
4574                 int cluster = (dlid & 0xf0) >> 4;
4575                 int apic = ffs(dlid & 0x0f) - 1;
4576
4577                 if ((apic < 0) || (apic > 7) ||
4578                     (cluster >= 0xf))
4579                         return NULL;
4580                 index = (cluster << 2) + apic;
4581         }
4582
4583         logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page);
4584
4585         return &logical_apic_id_table[index];
4586 }
4587
4588 static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
4589 {
4590         bool flat;
4591         u32 *entry, new_entry;
4592
4593         flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT;
4594         entry = avic_get_logical_id_entry(vcpu, ldr, flat);
4595         if (!entry)
4596                 return -EINVAL;
4597
4598         new_entry = READ_ONCE(*entry);
4599         new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
4600         new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
4601         new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
4602         WRITE_ONCE(*entry, new_entry);
4603
4604         return 0;
4605 }
4606
4607 static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
4608 {
4609         struct vcpu_svm *svm = to_svm(vcpu);
4610         bool flat = svm->dfr_reg == APIC_DFR_FLAT;
4611         u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
4612
4613         if (entry)
4614                 clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
4615 }
4616
4617 static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
4618 {
4619         int ret = 0;
4620         struct vcpu_svm *svm = to_svm(vcpu);
4621         u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
4622
4623         if (ldr == svm->ldr_reg)
4624                 return 0;
4625
4626         avic_invalidate_logical_id_entry(vcpu);
4627
4628         if (ldr)
4629                 ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
4630
4631         if (!ret)
4632                 svm->ldr_reg = ldr;
4633
4634         return ret;
4635 }
4636
4637 static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
4638 {
4639         u64 *old, *new;
4640         struct vcpu_svm *svm = to_svm(vcpu);
4641         u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
4642         u32 id = (apic_id_reg >> 24) & 0xff;
4643
4644         if (vcpu->vcpu_id == id)
4645                 return 0;
4646
4647         old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id);
4648         new = avic_get_physical_id_entry(vcpu, id);
4649         if (!new || !old)
4650                 return 1;
4651
4652         /* We need to move physical_id_entry to new offset */
4653         *new = *old;
4654         *old = 0ULL;
4655         to_svm(vcpu)->avic_physical_id_cache = new;
4656
4657         /*
4658          * Also update the guest physical APIC ID in the logical
4659          * APIC ID table entry if already setup the LDR.
4660          */
4661         if (svm->ldr_reg)
4662                 avic_handle_ldr_update(vcpu);
4663
4664         return 0;
4665 }
4666
4667 static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
4668 {
4669         struct vcpu_svm *svm = to_svm(vcpu);
4670         u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
4671
4672         if (svm->dfr_reg == dfr)
4673                 return;
4674
4675         avic_invalidate_logical_id_entry(vcpu);
4676         svm->dfr_reg = dfr;
4677 }
4678
4679 static int avic_unaccel_trap_write(struct vcpu_svm *svm)
4680 {
4681         struct kvm_lapic *apic = svm->vcpu.arch.apic;
4682         u32 offset = svm->vmcb->control.exit_info_1 &
4683                                 AVIC_UNACCEL_ACCESS_OFFSET_MASK;
4684
4685         switch (offset) {
4686         case APIC_ID:
4687                 if (avic_handle_apic_id_update(&svm->vcpu))
4688                         return 0;
4689                 break;
4690         case APIC_LDR:
4691                 if (avic_handle_ldr_update(&svm->vcpu))
4692                         return 0;
4693                 break;
4694         case APIC_DFR:
4695                 avic_handle_dfr_update(&svm->vcpu);
4696                 break;
4697         default:
4698                 break;
4699         }
4700
4701         kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset));
4702
4703         return 1;
4704 }
4705
4706 static bool is_avic_unaccelerated_access_trap(u32 offset)
4707 {
4708         bool ret = false;
4709
4710         switch (offset) {
4711         case APIC_ID:
4712         case APIC_EOI:
4713         case APIC_RRR:
4714         case APIC_LDR:
4715         case APIC_DFR:
4716         case APIC_SPIV:
4717         case APIC_ESR:
4718         case APIC_ICR:
4719         case APIC_LVTT:
4720         case APIC_LVTTHMR:
4721         case APIC_LVTPC:
4722         case APIC_LVT0:
4723         case APIC_LVT1:
4724         case APIC_LVTERR:
4725         case APIC_TMICT:
4726         case APIC_TDCR:
4727                 ret = true;
4728                 break;
4729         default:
4730                 break;
4731         }
4732         return ret;
4733 }
4734
4735 static int avic_unaccelerated_access_interception(struct vcpu_svm *svm)
4736 {
4737         int ret = 0;
4738         u32 offset = svm->vmcb->control.exit_info_1 &
4739                      AVIC_UNACCEL_ACCESS_OFFSET_MASK;
4740         u32 vector = svm->vmcb->control.exit_info_2 &
4741                      AVIC_UNACCEL_ACCESS_VECTOR_MASK;
4742         bool write = (svm->vmcb->control.exit_info_1 >> 32) &
4743                      AVIC_UNACCEL_ACCESS_WRITE_MASK;
4744         bool trap = is_avic_unaccelerated_access_trap(offset);
4745
4746         trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset,
4747                                             trap, write, vector);
4748         if (trap) {
4749                 /* Handling Trap */
4750                 WARN_ONCE(!write, "svm: Handling trap read.\n");
4751                 ret = avic_unaccel_trap_write(svm);
4752         } else {
4753                 /* Handling Fault */
4754                 ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
4755         }
4756
4757         return ret;
4758 }
4759
4760 static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
4761         [SVM_EXIT_READ_CR0]                     = cr_interception,
4762         [SVM_EXIT_READ_CR3]                     = cr_interception,
4763         [SVM_EXIT_READ_CR4]                     = cr_interception,
4764         [SVM_EXIT_READ_CR8]                     = cr_interception,
4765         [SVM_EXIT_CR0_SEL_WRITE]                = cr_interception,
4766         [SVM_EXIT_WRITE_CR0]                    = cr_interception,
4767         [SVM_EXIT_WRITE_CR3]                    = cr_interception,
4768         [SVM_EXIT_WRITE_CR4]                    = cr_interception,
4769         [SVM_EXIT_WRITE_CR8]                    = cr8_write_interception,
4770         [SVM_EXIT_READ_DR0]                     = dr_interception,
4771         [SVM_EXIT_READ_DR1]                     = dr_interception,
4772         [SVM_EXIT_READ_DR2]                     = dr_interception,
4773         [SVM_EXIT_READ_DR3]                     = dr_interception,
4774         [SVM_EXIT_READ_DR4]                     = dr_interception,
4775         [SVM_EXIT_READ_DR5]                     = dr_interception,
4776         [SVM_EXIT_READ_DR6]                     = dr_interception,
4777         [SVM_EXIT_READ_DR7]                     = dr_interception,
4778         [SVM_EXIT_WRITE_DR0]                    = dr_interception,
4779         [SVM_EXIT_WRITE_DR1]                    = dr_interception,
4780         [SVM_EXIT_WRITE_DR2]                    = dr_interception,
4781         [SVM_EXIT_WRITE_DR3]                    = dr_interception,
4782         [SVM_EXIT_WRITE_DR4]                    = dr_interception,
4783         [SVM_EXIT_WRITE_DR5]                    = dr_interception,
4784         [SVM_EXIT_WRITE_DR6]                    = dr_interception,
4785         [SVM_EXIT_WRITE_DR7]                    = dr_interception,
4786         [SVM_EXIT_EXCP_BASE + DB_VECTOR]        = db_interception,
4787         [SVM_EXIT_EXCP_BASE + BP_VECTOR]        = bp_interception,
4788         [SVM_EXIT_EXCP_BASE + UD_VECTOR]        = ud_interception,
4789         [SVM_EXIT_EXCP_BASE + PF_VECTOR]        = pf_interception,
4790         [SVM_EXIT_EXCP_BASE + MC_VECTOR]        = mc_interception,
4791         [SVM_EXIT_EXCP_BASE + AC_VECTOR]        = ac_interception,
4792         [SVM_EXIT_EXCP_BASE + GP_VECTOR]        = gp_interception,
4793         [SVM_EXIT_INTR]                         = intr_interception,
4794         [SVM_EXIT_NMI]                          = nmi_interception,
4795         [SVM_EXIT_SMI]                          = nop_on_interception,
4796         [SVM_EXIT_INIT]                         = nop_on_interception,
4797         [SVM_EXIT_VINTR]                        = interrupt_window_interception,
4798         [SVM_EXIT_RDPMC]                        = rdpmc_interception,
4799         [SVM_EXIT_CPUID]                        = cpuid_interception,
4800         [SVM_EXIT_IRET]                         = iret_interception,
4801         [SVM_EXIT_INVD]                         = emulate_on_interception,
4802         [SVM_EXIT_PAUSE]                        = pause_interception,
4803         [SVM_EXIT_HLT]                          = halt_interception,
4804         [SVM_EXIT_INVLPG]                       = invlpg_interception,
4805         [SVM_EXIT_INVLPGA]                      = invlpga_interception,
4806         [SVM_EXIT_IOIO]                         = io_interception,
4807         [SVM_EXIT_MSR]                          = msr_interception,
4808         [SVM_EXIT_TASK_SWITCH]                  = task_switch_interception,
4809         [SVM_EXIT_SHUTDOWN]                     = shutdown_interception,
4810         [SVM_EXIT_VMRUN]                        = vmrun_interception,
4811         [SVM_EXIT_VMMCALL]                      = vmmcall_interception,
4812         [SVM_EXIT_VMLOAD]                       = vmload_interception,
4813         [SVM_EXIT_VMSAVE]                       = vmsave_interception,
4814         [SVM_EXIT_STGI]                         = stgi_interception,
4815         [SVM_EXIT_CLGI]                         = clgi_interception,
4816         [SVM_EXIT_SKINIT]                       = skinit_interception,
4817         [SVM_EXIT_WBINVD]                       = wbinvd_interception,
4818         [SVM_EXIT_MONITOR]                      = monitor_interception,
4819         [SVM_EXIT_MWAIT]                        = mwait_interception,
4820         [SVM_EXIT_XSETBV]                       = xsetbv_interception,
4821         [SVM_EXIT_NPF]                          = npf_interception,
4822         [SVM_EXIT_RSM]                          = rsm_interception,
4823         [SVM_EXIT_AVIC_INCOMPLETE_IPI]          = avic_incomplete_ipi_interception,
4824         [SVM_EXIT_AVIC_UNACCELERATED_ACCESS]    = avic_unaccelerated_access_interception,
4825 };
4826
4827 static void dump_vmcb(struct kvm_vcpu *vcpu)
4828 {
4829         struct vcpu_svm *svm = to_svm(vcpu);
4830         struct vmcb_control_area *control = &svm->vmcb->control;
4831         struct vmcb_save_area *save = &svm->vmcb->save;
4832
4833         pr_err("VMCB Control Area:\n");
4834         pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
4835         pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
4836         pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
4837         pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
4838         pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
4839         pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
4840         pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
4841         pr_err("%-20s%d\n", "pause filter threshold:",
4842                control->pause_filter_thresh);
4843         pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
4844         pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
4845         pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
4846         pr_err("%-20s%d\n", "asid:", control->asid);
4847         pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
4848         pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
4849         pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
4850         pr_err("%-20s%08x\n", "int_state:", control->int_state);
4851         pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
4852         pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
4853         pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
4854         pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
4855         pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
4856         pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
4857         pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
4858         pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar);
4859         pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
4860         pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
4861         pr_err("%-20s%lld\n", "virt_ext:", control->virt_ext);
4862         pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
4863         pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page);
4864         pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id);
4865         pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id);
4866         pr_err("VMCB State Save Area:\n");
4867         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4868                "es:",
4869                save->es.selector, save->es.attrib,
4870                save->es.limit, save->es.base);
4871         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4872                "cs:",
4873                save->cs.selector, save->cs.attrib,
4874                save->cs.limit, save->cs.base);
4875         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4876                "ss:",
4877                save->ss.selector, save->ss.attrib,
4878                save->ss.limit, save->ss.base);
4879         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4880                "ds:",
4881                save->ds.selector, save->ds.attrib,
4882                save->ds.limit, save->ds.base);
4883         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4884                "fs:",
4885                save->fs.selector, save->fs.attrib,
4886                save->fs.limit, save->fs.base);
4887         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4888                "gs:",
4889                save->gs.selector, save->gs.attrib,
4890                save->gs.limit, save->gs.base);
4891         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4892                "gdtr:",
4893                save->gdtr.selector, save->gdtr.attrib,
4894                save->gdtr.limit, save->gdtr.base);
4895         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4896                "ldtr:",
4897                save->ldtr.selector, save->ldtr.attrib,
4898                save->ldtr.limit, save->ldtr.base);
4899         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4900                "idtr:",
4901                save->idtr.selector, save->idtr.attrib,
4902                save->idtr.limit, save->idtr.base);
4903         pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
4904                "tr:",
4905                save->tr.selector, save->tr.attrib,
4906                save->tr.limit, save->tr.base);
4907         pr_err("cpl:            %d                efer:         %016llx\n",
4908                 save->cpl, save->efer);
4909         pr_err("%-15s %016llx %-13s %016llx\n",
4910                "cr0:", save->cr0, "cr2:", save->cr2);
4911         pr_err("%-15s %016llx %-13s %016llx\n",
4912                "cr3:", save->cr3, "cr4:", save->cr4);
4913         pr_err("%-15s %016llx %-13s %016llx\n",
4914                "dr6:", save->dr6, "dr7:", save->dr7);
4915         pr_err("%-15s %016llx %-13s %016llx\n",
4916                "rip:", save->rip, "rflags:", save->rflags);
4917         pr_err("%-15s %016llx %-13s %016llx\n",
4918                "rsp:", save->rsp, "rax:", save->rax);
4919         pr_err("%-15s %016llx %-13s %016llx\n",
4920                "star:", save->star, "lstar:", save->lstar);
4921         pr_err("%-15s %016llx %-13s %016llx\n",
4922                "cstar:", save->cstar, "sfmask:", save->sfmask);
4923         pr_err("%-15s %016llx %-13s %016llx\n",
4924                "kernel_gs_base:", save->kernel_gs_base,
4925                "sysenter_cs:", save->sysenter_cs);
4926         pr_err("%-15s %016llx %-13s %016llx\n",
4927                "sysenter_esp:", save->sysenter_esp,
4928                "sysenter_eip:", save->sysenter_eip);
4929         pr_err("%-15s %016llx %-13s %016llx\n",
4930                "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
4931         pr_err("%-15s %016llx %-13s %016llx\n",
4932                "br_from:", save->br_from, "br_to:", save->br_to);
4933         pr_err("%-15s %016llx %-13s %016llx\n",
4934                "excp_from:", save->last_excp_from,
4935                "excp_to:", save->last_excp_to);
4936 }
4937
4938 static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
4939 {
4940         struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
4941
4942         *info1 = control->exit_info_1;
4943         *info2 = control->exit_info_2;
4944 }
4945
4946 static int handle_exit(struct kvm_vcpu *vcpu)
4947 {
4948         struct vcpu_svm *svm = to_svm(vcpu);
4949         struct kvm_run *kvm_run = vcpu->run;
4950         u32 exit_code = svm->vmcb->control.exit_code;
4951
4952         trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
4953
4954         if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
4955                 vcpu->arch.cr0 = svm->vmcb->save.cr0;
4956         if (npt_enabled)
4957                 vcpu->arch.cr3 = svm->vmcb->save.cr3;
4958
4959         if (unlikely(svm->nested.exit_required)) {
4960                 nested_svm_vmexit(svm);
4961                 svm->nested.exit_required = false;
4962
4963                 return 1;
4964         }
4965
4966         if (is_guest_mode(vcpu)) {
4967                 int vmexit;
4968
4969                 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
4970                                         svm->vmcb->control.exit_info_1,
4971                                         svm->vmcb->control.exit_info_2,
4972                                         svm->vmcb->control.exit_int_info,
4973                                         svm->vmcb->control.exit_int_info_err,
4974                                         KVM_ISA_SVM);
4975
4976                 vmexit = nested_svm_exit_special(svm);
4977
4978                 if (vmexit == NESTED_EXIT_CONTINUE)
4979                         vmexit = nested_svm_exit_handled(svm);
4980
4981                 if (vmexit == NESTED_EXIT_DONE)
4982                         return 1;
4983         }
4984
4985         svm_complete_interrupts(svm);
4986
4987         if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
4988                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
4989                 kvm_run->fail_entry.hardware_entry_failure_reason
4990                         = svm->vmcb->control.exit_code;
4991                 pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
4992                 dump_vmcb(vcpu);
4993                 return 0;
4994         }
4995
4996         if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
4997             exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
4998             exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
4999             exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
5000                 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
5001                        "exit_code 0x%x\n",
5002                        __func__, svm->vmcb->control.exit_int_info,
5003                        exit_code);
5004
5005         if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
5006             || !svm_exit_handlers[exit_code]) {
5007                 WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
5008                 kvm_queue_exception(vcpu, UD_VECTOR);
5009                 return 1;
5010         }
5011
5012         return svm_exit_handlers[exit_code](svm);
5013 }
5014
5015 static void reload_tss(struct kvm_vcpu *vcpu)
5016 {
5017         int cpu = raw_smp_processor_id();
5018
5019         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
5020         sd->tss_desc->type = 9; /* available 32/64-bit TSS */
5021         load_TR_desc();
5022 }
5023
5024 static void pre_sev_run(struct vcpu_svm *svm, int cpu)
5025 {
5026         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
5027         int asid = sev_get_asid(svm->vcpu.kvm);
5028
5029         /* Assign the asid allocated with this SEV guest */
5030         svm->vmcb->control.asid = asid;
5031
5032         /*
5033          * Flush guest TLB:
5034          *
5035          * 1) when different VMCB for the same ASID is to be run on the same host CPU.
5036          * 2) or this VMCB was executed on different host CPU in previous VMRUNs.
5037          */
5038         if (sd->sev_vmcbs[asid] == svm->vmcb &&
5039             svm->last_cpu == cpu)
5040                 return;
5041
5042         svm->last_cpu = cpu;
5043         sd->sev_vmcbs[asid] = svm->vmcb;
5044         svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
5045         mark_dirty(svm->vmcb, VMCB_ASID);
5046 }
5047
5048 static void pre_svm_run(struct vcpu_svm *svm)
5049 {
5050         int cpu = raw_smp_processor_id();
5051
5052         struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
5053
5054         if (sev_guest(svm->vcpu.kvm))
5055                 return pre_sev_run(svm, cpu);
5056
5057         /* FIXME: handle wraparound of asid_generation */
5058         if (svm->asid_generation != sd->asid_generation)
5059                 new_asid(svm, sd);
5060 }
5061
5062 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
5063 {
5064         struct vcpu_svm *svm = to_svm(vcpu);
5065
5066         svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
5067         vcpu->arch.hflags |= HF_NMI_MASK;
5068         set_intercept(svm, INTERCEPT_IRET);
5069         ++vcpu->stat.nmi_injections;
5070 }
5071
5072 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
5073 {
5074         struct vmcb_control_area *control;
5075
5076         /* The following fields are ignored when AVIC is enabled */
5077         control = &svm->vmcb->control;
5078         control->int_vector = irq;
5079         control->int_ctl &= ~V_INTR_PRIO_MASK;
5080         control->int_ctl |= V_IRQ_MASK |
5081                 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
5082         mark_dirty(svm->vmcb, VMCB_INTR);
5083 }
5084
5085 static void svm_set_irq(struct kvm_vcpu *vcpu)
5086 {
5087         struct vcpu_svm *svm = to_svm(vcpu);
5088
5089         BUG_ON(!(gif_set(svm)));
5090
5091         trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
5092         ++vcpu->stat.irq_injections;
5093
5094         svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
5095                 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
5096 }
5097
5098 static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu)
5099 {
5100         return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK);
5101 }
5102
5103 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
5104 {
5105         struct vcpu_svm *svm = to_svm(vcpu);
5106
5107         if (svm_nested_virtualize_tpr(vcpu) ||
5108             kvm_vcpu_apicv_active(vcpu))
5109                 return;
5110
5111         clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
5112
5113         if (irr == -1)
5114                 return;
5115
5116         if (tpr >= irr)
5117                 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
5118 }
5119
/* No-op in this implementation. */
static void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
{
}
5124
5125 static bool svm_get_enable_apicv(struct kvm_vcpu *vcpu)
5126 {
5127         return avic && irqchip_split(vcpu->kvm);
5128 }
5129
/* No-op in this implementation; @max_irr is ignored. */
static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
}
5133
/* No-op in this implementation; @max_isr is ignored. */
static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
{
}
5137
5138 /* Note: Currently only used by Hyper-V. */
5139 static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
5140 {
5141         struct vcpu_svm *svm = to_svm(vcpu);
5142         struct vmcb *vmcb = svm->vmcb;
5143
5144         if (kvm_vcpu_apicv_active(vcpu))
5145                 vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
5146         else
5147                 vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
5148         mark_dirty(vmcb, VMCB_AVIC);
5149 }
5150
5151 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
5152 {
5153         return;
5154 }
5155
5156 static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
5157 {
5158         kvm_lapic_set_irr(vec, vcpu->arch.apic);
5159         smp_mb__after_atomic();
5160
5161         if (avic_vcpu_is_running(vcpu))
5162                 wrmsrl(SVM_AVIC_DOORBELL,
5163                        kvm_cpu_get_apicid(vcpu->cpu));
5164         else
5165                 kvm_vcpu_wake_up(vcpu);
5166 }
5167
5168 static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
5169 {
5170         unsigned long flags;
5171         struct amd_svm_iommu_ir *cur;
5172
5173         spin_lock_irqsave(&svm->ir_list_lock, flags);
5174         list_for_each_entry(cur, &svm->ir_list, node) {
5175                 if (cur->data != pi->ir_data)
5176                         continue;
5177                 list_del(&cur->node);
5178                 kfree(cur);
5179                 break;
5180         }
5181         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
5182 }
5183
5184 static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
5185 {
5186         int ret = 0;
5187         unsigned long flags;
5188         struct amd_svm_iommu_ir *ir;
5189
5190         /**
5191          * In some cases, the existing irte is updaed and re-set,
5192          * so we need to check here if it's already been * added
5193          * to the ir_list.
5194          */
5195         if (pi->ir_data && (pi->prev_ga_tag != 0)) {
5196                 struct kvm *kvm = svm->vcpu.kvm;
5197                 u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag);
5198                 struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id);
5199                 struct vcpu_svm *prev_svm;
5200
5201                 if (!prev_vcpu) {
5202                         ret = -EINVAL;
5203                         goto out;
5204                 }
5205
5206                 prev_svm = to_svm(prev_vcpu);
5207                 svm_ir_list_del(prev_svm, pi);
5208         }
5209
5210         /**
5211          * Allocating new amd_iommu_pi_data, which will get
5212          * add to the per-vcpu ir_list.
5213          */
5214         ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
5215         if (!ir) {
5216                 ret = -ENOMEM;
5217                 goto out;
5218         }
5219         ir->data = pi->ir_data;
5220
5221         spin_lock_irqsave(&svm->ir_list_lock, flags);
5222         list_add(&ir->node, &svm->ir_list);
5223         spin_unlock_irqrestore(&svm->ir_list_lock, flags);
5224 out:
5225         return ret;
5226 }
5227
5228 /**
5229  * Note:
5230  * The HW cannot support posting multicast/broadcast
5231  * interrupts to a vCPU. So, we still use legacy interrupt
5232  * remapping for these kind of interrupts.
5233  *
5234  * For lowest-priority interrupts, we only support
5235  * those with single CPU as the destination, e.g. user
5236  * configures the interrupts via /proc/irq or uses
5237  * irqbalance to make the interrupts single-CPU.
5238  */
5239 static int
5240 get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
5241                  struct vcpu_data *vcpu_info, struct vcpu_svm **svm)
5242 {
5243         struct kvm_lapic_irq irq;
5244         struct kvm_vcpu *vcpu = NULL;
5245
5246         kvm_set_msi_irq(kvm, e, &irq);
5247
5248         if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
5249                 pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
5250                          __func__, irq.vector);
5251                 return -1;
5252         }
5253
5254         pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
5255                  irq.vector);
5256         *svm = to_svm(vcpu);
5257         vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
5258         vcpu_info->vector = irq.vector;
5259
5260         return 0;
5261 }
5262
5263 /*
5264  * svm_update_pi_irte - set IRTE for Posted-Interrupts
5265  *
5266  * @kvm: kvm
5267  * @host_irq: host irq of the interrupt
5268  * @guest_irq: gsi of the interrupt
5269  * @set: set or unset PI
5270  * returns 0 on success, < 0 on failure
5271  */
5272 static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
5273                               uint32_t guest_irq, bool set)
5274 {
5275         struct kvm_kernel_irq_routing_entry *e;
5276         struct kvm_irq_routing_table *irq_rt;
5277         int idx, ret = -EINVAL;
5278
5279         if (!kvm_arch_has_assigned_device(kvm) ||
5280             !irq_remapping_cap(IRQ_POSTING_CAP))
5281                 return 0;
5282
5283         pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n",
5284                  __func__, host_irq, guest_irq, set);
5285
5286         idx = srcu_read_lock(&kvm->irq_srcu);
5287         irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
5288         WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
5289
5290         hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
5291                 struct vcpu_data vcpu_info;
5292                 struct vcpu_svm *svm = NULL;
5293
5294                 if (e->type != KVM_IRQ_ROUTING_MSI)
5295                         continue;
5296
5297                 /**
5298                  * Here, we setup with legacy mode in the following cases:
5299                  * 1. When cannot target interrupt to a specific vcpu.
5300                  * 2. Unsetting posted interrupt.
5301                  * 3. APIC virtialization is disabled for the vcpu.
5302                  */
5303                 if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
5304                     kvm_vcpu_apicv_active(&svm->vcpu)) {
5305                         struct amd_iommu_pi_data pi;
5306
5307                         /* Try to enable guest_mode in IRTE */
5308                         pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
5309                                             AVIC_HPA_MASK);
5310                         pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id,
5311                                                      svm->vcpu.vcpu_id);
5312                         pi.is_guest_mode = true;
5313                         pi.vcpu_data = &vcpu_info;
5314                         ret = irq_set_vcpu_affinity(host_irq, &pi);
5315
5316                         /**
5317                          * Here, we successfully setting up vcpu affinity in
5318                          * IOMMU guest mode. Now, we need to store the posted
5319                          * interrupt information in a per-vcpu ir_list so that
5320                          * we can reference to them directly when we update vcpu
5321                          * scheduling information in IOMMU irte.
5322                          */
5323                         if (!ret && pi.is_guest_mode)
5324                                 svm_ir_list_add(svm, &pi);
5325                 } else {
5326                         /* Use legacy mode in IRTE */
5327                         struct amd_iommu_pi_data pi;
5328
5329                         /**
5330                          * Here, pi is used to:
5331                          * - Tell IOMMU to use legacy mode for this interrupt.
5332                          * - Retrieve ga_tag of prior interrupt remapping data.
5333                          */
5334                         pi.is_guest_mode = false;
5335                         ret = irq_set_vcpu_affinity(host_irq, &pi);
5336
5337                         /**
5338                          * Check if the posted interrupt was previously
5339                          * setup with the guest_mode by checking if the ga_tag
5340                          * was cached. If so, we need to clean up the per-vcpu
5341                          * ir_list.
5342                          */
5343                         if (!ret && pi.prev_ga_tag) {
5344                                 int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag);
5345                                 struct kvm_vcpu *vcpu;
5346
5347                                 vcpu = kvm_get_vcpu_by_id(kvm, id);
5348                                 if (vcpu)
5349                                         svm_ir_list_del(to_svm(vcpu), &pi);
5350                         }
5351                 }
5352
5353                 if (!ret && svm) {
5354                         trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id,
5355                                                  e->gsi, vcpu_info.vector,
5356                                                  vcpu_info.pi_desc_addr, set);
5357                 }
5358
5359                 if (ret < 0) {
5360                         pr_err("%s: failed to update PI IRTE\n", __func__);
5361                         goto out;
5362                 }
5363         }
5364
5365         ret = 0;
5366 out:
5367         srcu_read_unlock(&kvm->irq_srcu, idx);
5368         return ret;
5369 }
5370
5371 static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
5372 {
5373         struct vcpu_svm *svm = to_svm(vcpu);
5374         struct vmcb *vmcb = svm->vmcb;
5375         int ret;
5376         ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
5377               !(svm->vcpu.arch.hflags & HF_NMI_MASK);
5378         ret = ret && gif_set(svm) && nested_svm_nmi(svm);
5379
5380         return ret;
5381 }
5382
5383 static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
5384 {
5385         struct vcpu_svm *svm = to_svm(vcpu);
5386
5387         return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
5388 }
5389
5390 static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
5391 {
5392         struct vcpu_svm *svm = to_svm(vcpu);
5393
5394         if (masked) {
5395                 svm->vcpu.arch.hflags |= HF_NMI_MASK;
5396                 set_intercept(svm, INTERCEPT_IRET);
5397         } else {
5398                 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
5399                 clr_intercept(svm, INTERCEPT_IRET);
5400         }
5401 }
5402
5403 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
5404 {
5405         struct vcpu_svm *svm = to_svm(vcpu);
5406         struct vmcb *vmcb = svm->vmcb;
5407         int ret;
5408
5409         if (!gif_set(svm) ||
5410              (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
5411                 return 0;
5412
5413         ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
5414
5415         if (is_guest_mode(vcpu))
5416                 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
5417
5418         return ret;
5419 }
5420
/*
 * Request an interrupt window by arming the vintr intercept and injecting
 * a virtual interrupt with vector 0.  Does nothing when APICv is active.
 */
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (kvm_vcpu_apicv_active(vcpu))
		return;

	/*
	 * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes
	 * 1, because that's a separate STGI/VMRUN intercept.  The next time we
	 * get that intercept, this function will be called again though and
	 * we'll get the vintr intercept. However, if the vGIF feature is
	 * enabled, the STGI interception will not occur. Enable the irq
	 * window under the assumption that the hardware will set the GIF.
	 */
	if (!vgif_enabled(svm) && !gif_set(svm))
		return;

	if (!nested_svm_intr(svm))
		return;

	svm_set_vintr(svm);
	svm_inject_irq(svm, 0x0);
}
5441
5442 static void enable_nmi_window(struct kvm_vcpu *vcpu)
5443 {
5444         struct vcpu_svm *svm = to_svm(vcpu);
5445
5446         if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
5447             == HF_NMI_MASK)
5448                 return; /* IRET will cause a vm exit */
5449
5450         if (!gif_set(svm)) {
5451                 if (vgif_enabled(svm))
5452                         set_intercept(svm, INTERCEPT_STGI);
5453                 return; /* STGI will cause a vm exit */
5454         }
5455
5456         if (svm->nested.exit_required)
5457                 return; /* we're not going to run the guest yet */
5458
5459         /*
5460          * Something prevents NMI from been injected. Single step over possible
5461          * problem (IRET or exception injection or interrupt shadow)
5462          */
5463         svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu);
5464         svm->nmi_singlestep = true;
5465         svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
5466 }
5467
/* Nothing to set up here: @addr is ignored and 0 is always returned. */
static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
{
	return 0;
}
5472
5473 static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5474 {
5475         return 0;
5476 }
5477
5478 static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
5479 {
5480         struct vcpu_svm *svm = to_svm(vcpu);
5481
5482         if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
5483                 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
5484         else
5485                 svm->asid_generation--;
5486 }
5487
5488 static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
5489 {
5490         struct vcpu_svm *svm = to_svm(vcpu);
5491
5492         invlpga(gva, svm->vmcb->control.asid);
5493 }
5494
/* No-op in this implementation. */
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}
5498
5499 static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
5500 {
5501         struct vcpu_svm *svm = to_svm(vcpu);
5502
5503         if (svm_nested_virtualize_tpr(vcpu))
5504                 return;
5505
5506         if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
5507                 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
5508                 kvm_set_cr8(vcpu, cr8);
5509         }
5510 }
5511
5512 static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
5513 {
5514         struct vcpu_svm *svm = to_svm(vcpu);
5515         u64 cr8;
5516
5517         if (svm_nested_virtualize_tpr(vcpu) ||
5518             kvm_vcpu_apicv_active(vcpu))
5519                 return;
5520
5521         cr8 = kvm_get_cr8(vcpu);
5522         svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
5523         svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
5524 }
5525
5526 static void svm_complete_interrupts(struct vcpu_svm *svm)
5527 {
5528         u8 vector;
5529         int type;
5530         u32 exitintinfo = svm->vmcb->control.exit_int_info;
5531         unsigned int3_injected = svm->int3_injected;
5532
5533         svm->int3_injected = 0;
5534
5535         /*
5536          * If we've made progress since setting HF_IRET_MASK, we've
5537          * executed an IRET and can allow NMI injection.
5538          */
5539         if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
5540             && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
5541                 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
5542                 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
5543         }
5544
5545         svm->vcpu.arch.nmi_injected = false;
5546         kvm_clear_exception_queue(&svm->vcpu);
5547         kvm_clear_interrupt_queue(&svm->vcpu);
5548
5549         if (!(exitintinfo & SVM_EXITINTINFO_VALID))
5550                 return;
5551
5552         kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
5553
5554         vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
5555         type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
5556
5557         switch (type) {
5558         case SVM_EXITINTINFO_TYPE_NMI:
5559                 svm->vcpu.arch.nmi_injected = true;
5560                 break;
5561         case SVM_EXITINTINFO_TYPE_EXEPT:
5562                 /*
5563                  * In case of software exceptions, do not reinject the vector,
5564                  * but re-execute the instruction instead. Rewind RIP first
5565                  * if we emulated INT3 before.
5566                  */
5567                 if (kvm_exception_is_soft(vector)) {
5568                         if (vector == BP_VECTOR && int3_injected &&
5569                             kvm_is_linear_rip(&svm->vcpu, svm->int3_rip))
5570                                 kvm_rip_write(&svm->vcpu,
5571                                               kvm_rip_read(&svm->vcpu) -
5572                                               int3_injected);
5573                         break;
5574                 }
5575                 if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
5576                         u32 err = svm->vmcb->control.exit_int_info_err;
5577                         kvm_requeue_exception_e(&svm->vcpu, vector, err);
5578
5579                 } else
5580                         kvm_requeue_exception(&svm->vcpu, vector);
5581                 break;
5582         case SVM_EXITINTINFO_TYPE_INTR:
5583                 kvm_queue_interrupt(&svm->vcpu, vector, false);
5584                 break;
5585         default:
5586                 break;
5587         }
5588 }
5589
5590 static void svm_cancel_injection(struct kvm_vcpu *vcpu)
5591 {
5592         struct vcpu_svm *svm = to_svm(vcpu);
5593         struct vmcb_control_area *control = &svm->vmcb->control;
5594
5595         control->exit_int_info = control->event_inj;
5596         control->exit_int_info_err = control->event_inj_err;
5597         control->event_inj = 0;
5598         svm_complete_interrupts(svm);
5599 }
5600
5601 static void svm_vcpu_run(struct kvm_vcpu *vcpu)
5602 {
5603         struct vcpu_svm *svm = to_svm(vcpu);
5604
5605         svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
5606         svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
5607         svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
5608
5609         /*
5610          * A vmexit emulation is required before the vcpu can be executed
5611          * again.
5612          */
5613         if (unlikely(svm->nested.exit_required))
5614                 return;
5615
5616         /*
5617          * Disable singlestep if we're injecting an interrupt/exception.
5618          * We don't want our modified rflags to be pushed on the stack where
5619          * we might not be able to easily reset them if we disabled NMI
5620          * singlestep later.
5621          */
5622         if (svm->nmi_singlestep && svm->vmcb->control.event_inj) {
5623                 /*
5624                  * Event injection happens before external interrupts cause a
5625                  * vmexit and interrupts are disabled here, so smp_send_reschedule
5626                  * is enough to force an immediate vmexit.
5627                  */
5628                 disable_nmi_singlestep(svm);
5629                 smp_send_reschedule(vcpu->cpu);
5630         }
5631
5632         pre_svm_run(svm);
5633
5634         sync_lapic_to_cr8(vcpu);
5635
5636         svm->vmcb->save.cr2 = vcpu->arch.cr2;
5637
5638         clgi();
5639         kvm_load_guest_xcr0(vcpu);
5640
5641         /*
5642          * If this vCPU has touched SPEC_CTRL, restore the guest's value if
5643          * it's non-zero. Since vmentry is serialising on affected CPUs, there
5644          * is no need to worry about the conditional branch over the wrmsr
5645          * being speculatively taken.
5646          */
5647         x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl);
5648
5649         local_irq_enable();
5650
5651         asm volatile (
5652                 "push %%" _ASM_BP "; \n\t"
5653                 "mov %c[rbx](%[svm]), %%" _ASM_BX " \n\t"
5654                 "mov %c[rcx](%[svm]), %%" _ASM_CX " \n\t"
5655                 "mov %c[rdx](%[svm]), %%" _ASM_DX " \n\t"
5656                 "mov %c[rsi](%[svm]), %%" _ASM_SI " \n\t"
5657                 "mov %c[rdi](%[svm]), %%" _ASM_DI " \n\t"
5658                 "mov %c[rbp](%[svm]), %%" _ASM_BP " \n\t"
5659 #ifdef CONFIG_X86_64
5660                 "mov %c[r8](%[svm]),  %%r8  \n\t"
5661                 "mov %c[r9](%[svm]),  %%r9  \n\t"
5662                 "mov %c[r10](%[svm]), %%r10 \n\t"
5663                 "mov %c[r11](%[svm]), %%r11 \n\t"
5664                 "mov %c[r12](%[svm]), %%r12 \n\t"
5665                 "mov %c[r13](%[svm]), %%r13 \n\t"
5666                 "mov %c[r14](%[svm]), %%r14 \n\t"
5667                 "mov %c[r15](%[svm]), %%r15 \n\t"
5668 #endif
5669
5670                 /* Enter guest mode */
5671                 "push %%" _ASM_AX " \n\t"
5672                 "mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
5673                 __ex("vmload %%" _ASM_AX) "\n\t"
5674                 __ex("vmrun %%" _ASM_AX) "\n\t"
5675                 __ex("vmsave %%" _ASM_AX) "\n\t"
5676                 "pop %%" _ASM_AX " \n\t"
5677
5678                 /* Save guest registers, load host registers */
5679                 "mov %%" _ASM_BX ", %c[rbx](%[svm]) \n\t"
5680                 "mov %%" _ASM_CX ", %c[rcx](%[svm]) \n\t"
5681                 "mov %%" _ASM_DX ", %c[rdx](%[svm]) \n\t"
5682                 "mov %%" _ASM_SI ", %c[rsi](%[svm]) \n\t"
5683                 "mov %%" _ASM_DI ", %c[rdi](%[svm]) \n\t"
5684                 "mov %%" _ASM_BP ", %c[rbp](%[svm]) \n\t"
5685 #ifdef CONFIG_X86_64
5686                 "mov %%r8,  %c[r8](%[svm]) \n\t"
5687                 "mov %%r9,  %c[r9](%[svm]) \n\t"
5688                 "mov %%r10, %c[r10](%[svm]) \n\t"
5689                 "mov %%r11, %c[r11](%[svm]) \n\t"
5690                 "mov %%r12, %c[r12](%[svm]) \n\t"
5691                 "mov %%r13, %c[r13](%[svm]) \n\t"
5692                 "mov %%r14, %c[r14](%[svm]) \n\t"
5693                 "mov %%r15, %c[r15](%[svm]) \n\t"
5694                 /*
5695                 * Clear host registers marked as clobbered to prevent
5696                 * speculative use.
5697                 */
5698                 "xor %%r8d, %%r8d \n\t"
5699                 "xor %%r9d, %%r9d \n\t"
5700                 "xor %%r10d, %%r10d \n\t"
5701                 "xor %%r11d, %%r11d \n\t"
5702                 "xor %%r12d, %%r12d \n\t"
5703                 "xor %%r13d, %%r13d \n\t"
5704                 "xor %%r14d, %%r14d \n\t"
5705                 "xor %%r15d, %%r15d \n\t"
5706 #endif
5707                 "xor %%ebx, %%ebx \n\t"
5708                 "xor %%ecx, %%ecx \n\t"
5709                 "xor %%edx, %%edx \n\t"
5710                 "xor %%esi, %%esi \n\t"
5711                 "xor %%edi, %%edi \n\t"
5712                 "pop %%" _ASM_BP
5713                 :
5714                 : [svm]"a"(svm),
5715                   [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
5716                   [rbx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBX])),
5717                   [rcx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RCX])),
5718                   [rdx]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDX])),
5719                   [rsi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RSI])),
5720                   [rdi]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RDI])),
5721                   [rbp]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_RBP]))
5722 #ifdef CONFIG_X86_64
5723                   , [r8]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R8])),
5724                   [r9]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R9])),
5725                   [r10]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R10])),
5726                   [r11]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R11])),
5727                   [r12]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R12])),
5728                   [r13]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R13])),
5729                   [r14]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R14])),
5730                   [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
5731 #endif
5732                 : "cc", "memory"
5733 #ifdef CONFIG_X86_64
5734                 , "rbx", "rcx", "rdx", "rsi", "rdi"
5735                 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
5736 #else
5737                 , "ebx", "ecx", "edx", "esi", "edi"
5738 #endif
5739                 );
5740
5741         /* Eliminate branch target predictions from guest mode */
5742         vmexit_fill_RSB();
5743
5744 #ifdef CONFIG_X86_64
5745         wrmsrl(MSR_GS_BASE, svm->host.gs_base);
5746 #else
5747         loadsegment(fs, svm->host.fs);
5748 #ifndef CONFIG_X86_32_LAZY_GS
5749         loadsegment(gs, svm->host.gs);
5750 #endif
5751 #endif
5752
5753         /*
5754          * We do not use IBRS in the kernel. If this vCPU has used the
5755          * SPEC_CTRL MSR it may have left it on; save the value and
5756          * turn it off. This is much more efficient than blindly adding
5757          * it to the atomic save/restore list. Especially as the former
5758          * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
5759          *
5760          * For non-nested case:
5761          * If the L01 MSR bitmap does not intercept the MSR, then we need to
5762          * save it.
5763          *
5764          * For nested case:
5765          * If the L02 MSR bitmap does not intercept the MSR, then we need to
5766          * save it.
5767          */
5768         if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
5769                 svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
5770
5771         reload_tss(vcpu);
5772
5773         local_irq_disable();
5774
5775         x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl);
5776
5777         vcpu->arch.cr2 = svm->vmcb->save.cr2;
5778         vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
5779         vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
5780         vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
5781
5782         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5783                 kvm_before_interrupt(&svm->vcpu);
5784
5785         kvm_put_guest_xcr0(vcpu);
5786         stgi();
5787
5788         /* Any pending NMI will happen here */
5789
5790         if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
5791                 kvm_after_interrupt(&svm->vcpu);
5792
5793         sync_cr8_to_lapic(vcpu);
5794
5795         svm->next_rip = 0;
5796
5797         svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
5798
5799         /* if exit due to PF check for async PF */
5800         if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
5801                 svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
5802
5803         if (npt_enabled) {
5804                 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
5805                 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
5806         }
5807
5808         /*
5809          * We need to handle MC intercepts here before the vcpu has a chance to
5810          * change the physical cpu
5811          */
5812         if (unlikely(svm->vmcb->control.exit_code ==
5813                      SVM_EXIT_EXCP_BASE + MC_VECTOR))
5814                 svm_handle_mce(svm);
5815
5816         mark_all_clean(svm->vmcb);
5817 }
5818 STACK_FRAME_NON_STANDARD(svm_vcpu_run);
5819
5820 static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5821 {
5822         struct vcpu_svm *svm = to_svm(vcpu);
5823
5824         svm->vmcb->save.cr3 = __sme_set(root);
5825         mark_dirty(svm->vmcb, VMCB_CR);
5826 }
5827
5828 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
5829 {
5830         struct vcpu_svm *svm = to_svm(vcpu);
5831
5832         svm->vmcb->control.nested_cr3 = __sme_set(root);
5833         mark_dirty(svm->vmcb, VMCB_NPT);
5834
5835         /* Also sync guest cr3 here in case we live migrate */
5836         svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
5837         mark_dirty(svm->vmcb, VMCB_CR);
5838 }
5839
5840 static int is_disabled(void)
5841 {
5842         u64 vm_cr;
5843
5844         rdmsrl(MSR_VM_CR, vm_cr);
5845         if (vm_cr & (1 << SVM_VM_CR_SVM_DISABLE))
5846                 return 1;
5847
5848         return 0;
5849 }
5850
/*
 * Write the three-byte VMMCALL instruction (0f 01 d9) to @hypercall.
 * @vcpu is not used.
 */
static void
svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/* Patch in the VMMCALL instruction. */
	static const unsigned char vmmcall[] = { 0x0f, 0x01, 0xd9 };
	unsigned int i;

	for (i = 0; i < sizeof(vmmcall); i++)
		hypercall[i] = vmmcall[i];
}
5861
/* Store 0 in the int that @rtn points to. */
static void svm_check_processor_compat(void *rtn)
{
	int *result = rtn;

	*result = 0;
}
5866
/* Always reports false. */
static bool svm_cpu_has_accelerated_tpr(void)
{
	return false;
}
5871
5872 static bool svm_has_emulated_msr(int index)
5873 {
5874         switch (index) {
5875         case MSR_IA32_MCG_EXT_CTL:
5876                 return false;
5877         default:
5878                 break;
5879         }
5880
5881         return true;
5882 }
5883
5884 static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
5885 {
5886         return 0;
5887 }
5888
5889 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
5890 {
5891         struct vcpu_svm *svm = to_svm(vcpu);
5892
5893         /* Update nrips enabled cache */
5894         svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS);
5895
5896         if (!kvm_vcpu_apicv_active(vcpu))
5897                 return;
5898
5899         guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
5900 }
5901
5902 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
5903 {
5904         switch (func) {
5905         case 0x1:
5906                 if (avic)
5907                         entry->ecx &= ~bit(X86_FEATURE_X2APIC);
5908                 break;
5909         case 0x80000001:
5910                 if (nested)
5911                         entry->ecx |= (1 << 2); /* Set SVM bit */
5912                 break;
5913         case 0x8000000A:
5914                 entry->eax = 1; /* SVM revision 1 */
5915                 entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
5916                                    ASID emulation to nested SVM */
5917                 entry->ecx = 0; /* Reserved */
5918                 entry->edx = 0; /* Per default do not support any
5919                                    additional features */
5920
5921                 /* Support next_rip if host supports it */
5922                 if (boot_cpu_has(X86_FEATURE_NRIPS))
5923                         entry->edx |= SVM_FEATURE_NRIP;
5924
5925                 /* Support NPT for the guest if enabled */
5926                 if (npt_enabled)
5927                         entry->edx |= SVM_FEATURE_NPT;
5928
5929                 break;
5930         case 0x8000001F:
5931                 /* Support memory encryption cpuid if host supports it */
5932                 if (boot_cpu_has(X86_FEATURE_SEV))
5933                         cpuid(0x8000001f, &entry->eax, &entry->ebx,
5934                                 &entry->ecx, &entry->edx);
5935
5936         }
5937 }
5938
5939 static int svm_get_lpage_level(void)
5940 {
5941         return PT_PDPE_LEVEL;
5942 }
5943
5944 static bool svm_rdtscp_supported(void)
5945 {
5946         return boot_cpu_has(X86_FEATURE_RDTSCP);
5947 }
5948
/* Capability query: INVPCID is unconditionally reported as unsupported. */
static bool svm_invpcid_supported(void)
{
	return false;
}
5953
/* Capability query: MPX is unconditionally reported as unsupported. */
static bool svm_mpx_supported(void)
{
	return false;
}
5958
/* Capability query: XSAVES is unconditionally reported as unsupported. */
static bool svm_xsaves_supported(void)
{
	return false;
}
5963
/* Capability query: UMIP emulation is unconditionally reported as absent. */
static bool svm_umip_emulated(void)
{
	return false;
}
5968
/* Capability query: PT is unconditionally reported as unsupported. */
static bool svm_pt_supported(void)
{
	return false;
}
5973
/* Capability query: a WBINVD exit is unconditionally reported as available. */
static bool svm_has_wbinvd_exit(void)
{
	return true;
}
5978
5979 #define PRE_EX(exit)  { .exit_code = (exit), \
5980                         .stage = X86_ICPT_PRE_EXCEPT, }
5981 #define POST_EX(exit) { .exit_code = (exit), \
5982                         .stage = X86_ICPT_POST_EXCEPT, }
5983 #define POST_MEM(exit) { .exit_code = (exit), \
5984                         .stage = X86_ICPT_POST_MEMACCESS, }
5985
5986 static const struct __x86_intercept {
5987         u32 exit_code;
5988         enum x86_intercept_stage stage;
5989 } x86_intercept_map[] = {
5990         [x86_intercept_cr_read]         = POST_EX(SVM_EXIT_READ_CR0),
5991         [x86_intercept_cr_write]        = POST_EX(SVM_EXIT_WRITE_CR0),
5992         [x86_intercept_clts]            = POST_EX(SVM_EXIT_WRITE_CR0),
5993         [x86_intercept_lmsw]            = POST_EX(SVM_EXIT_WRITE_CR0),
5994         [x86_intercept_smsw]            = POST_EX(SVM_EXIT_READ_CR0),
5995         [x86_intercept_dr_read]         = POST_EX(SVM_EXIT_READ_DR0),
5996         [x86_intercept_dr_write]        = POST_EX(SVM_EXIT_WRITE_DR0),
5997         [x86_intercept_sldt]            = POST_EX(SVM_EXIT_LDTR_READ),
5998         [x86_intercept_str]             = POST_EX(SVM_EXIT_TR_READ),
5999         [x86_intercept_lldt]            = POST_EX(SVM_EXIT_LDTR_WRITE),
6000         [x86_intercept_ltr]             = POST_EX(SVM_EXIT_TR_WRITE),
6001         [x86_intercept_sgdt]            = POST_EX(SVM_EXIT_GDTR_READ),
6002         [x86_intercept_sidt]            = POST_EX(SVM_EXIT_IDTR_READ),
6003         [x86_intercept_lgdt]            = POST_EX(SVM_EXIT_GDTR_WRITE),
6004         [x86_intercept_lidt]            = POST_EX(SVM_EXIT_IDTR_WRITE),
6005         [x86_intercept_vmrun]           = POST_EX(SVM_EXIT_VMRUN),
6006         [x86_intercept_vmmcall]         = POST_EX(SVM_EXIT_VMMCALL),
6007         [x86_intercept_vmload]          = POST_EX(SVM_EXIT_VMLOAD),
6008         [x86_intercept_vmsave]          = POST_EX(SVM_EXIT_VMSAVE),
6009         [x86_intercept_stgi]            = POST_EX(SVM_EXIT_STGI),
6010         [x86_intercept_clgi]            = POST_EX(SVM_EXIT_CLGI),
6011         [x86_intercept_skinit]          = POST_EX(SVM_EXIT_SKINIT),
6012         [x86_intercept_invlpga]         = POST_EX(SVM_EXIT_INVLPGA),
6013         [x86_intercept_rdtscp]          = POST_EX(SVM_EXIT_RDTSCP),
6014         [x86_intercept_monitor]         = POST_MEM(SVM_EXIT_MONITOR),
6015         [x86_intercept_mwait]           = POST_EX(SVM_EXIT_MWAIT),
6016         [x86_intercept_invlpg]          = POST_EX(SVM_EXIT_INVLPG),
6017         [x86_intercept_invd]            = POST_EX(SVM_EXIT_INVD),
6018         [x86_intercept_wbinvd]          = POST_EX(SVM_EXIT_WBINVD),
6019         [x86_intercept_wrmsr]           = POST_EX(SVM_EXIT_MSR),
6020         [x86_intercept_rdtsc]           = POST_EX(SVM_EXIT_RDTSC),
6021         [x86_intercept_rdmsr]           = POST_EX(SVM_EXIT_MSR),
6022         [x86_intercept_rdpmc]           = POST_EX(SVM_EXIT_RDPMC),
6023         [x86_intercept_cpuid]           = PRE_EX(SVM_EXIT_CPUID),
6024         [x86_intercept_rsm]             = PRE_EX(SVM_EXIT_RSM),
6025         [x86_intercept_pause]           = PRE_EX(SVM_EXIT_PAUSE),
6026         [x86_intercept_pushf]           = PRE_EX(SVM_EXIT_PUSHF),
6027         [x86_intercept_popf]            = PRE_EX(SVM_EXIT_POPF),
6028         [x86_intercept_intn]            = PRE_EX(SVM_EXIT_SWINT),
6029         [x86_intercept_iret]            = PRE_EX(SVM_EXIT_IRET),
6030         [x86_intercept_icebp]           = PRE_EX(SVM_EXIT_ICEBP),
6031         [x86_intercept_hlt]             = POST_EX(SVM_EXIT_HLT),
6032         [x86_intercept_in]              = POST_EX(SVM_EXIT_IOIO),
6033         [x86_intercept_ins]             = POST_EX(SVM_EXIT_IOIO),
6034         [x86_intercept_out]             = POST_EX(SVM_EXIT_IOIO),
6035         [x86_intercept_outs]            = POST_EX(SVM_EXIT_IOIO),
6036 };
6037
6038 #undef PRE_EX
6039 #undef POST_EX
6040 #undef POST_MEM
6041
6042 static int svm_check_intercept(struct kvm_vcpu *vcpu,
6043                                struct x86_instruction_info *info,
6044                                enum x86_intercept_stage stage)
6045 {
6046         struct vcpu_svm *svm = to_svm(vcpu);
6047         int vmexit, ret = X86EMUL_CONTINUE;
6048         struct __x86_intercept icpt_info;
6049         struct vmcb *vmcb = svm->vmcb;
6050
6051         if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
6052                 goto out;
6053
6054         icpt_info = x86_intercept_map[info->intercept];
6055
6056         if (stage != icpt_info.stage)
6057                 goto out;
6058
6059         switch (icpt_info.exit_code) {
6060         case SVM_EXIT_READ_CR0:
6061                 if (info->intercept == x86_intercept_cr_read)
6062                         icpt_info.exit_code += info->modrm_reg;
6063                 break;
6064         case SVM_EXIT_WRITE_CR0: {
6065                 unsigned long cr0, val;
6066                 u64 intercept;
6067
6068                 if (info->intercept == x86_intercept_cr_write)
6069                         icpt_info.exit_code += info->modrm_reg;
6070
6071                 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 ||
6072                     info->intercept == x86_intercept_clts)
6073                         break;
6074
6075                 intercept = svm->nested.intercept;
6076
6077                 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
6078                         break;
6079
6080                 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
6081                 val = info->src_val  & ~SVM_CR0_SELECTIVE_MASK;
6082
6083                 if (info->intercept == x86_intercept_lmsw) {
6084                         cr0 &= 0xfUL;
6085                         val &= 0xfUL;
6086                         /* lmsw can't clear PE - catch this here */
6087                         if (cr0 & X86_CR0_PE)
6088                                 val |= X86_CR0_PE;
6089                 }
6090
6091                 if (cr0 ^ val)
6092                         icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
6093
6094                 break;
6095         }
6096         case SVM_EXIT_READ_DR0:
6097         case SVM_EXIT_WRITE_DR0:
6098                 icpt_info.exit_code += info->modrm_reg;
6099                 break;
6100         case SVM_EXIT_MSR:
6101                 if (info->intercept == x86_intercept_wrmsr)
6102                         vmcb->control.exit_info_1 = 1;
6103                 else
6104                         vmcb->control.exit_info_1 = 0;
6105                 break;
6106         case SVM_EXIT_PAUSE:
6107                 /*
6108                  * We get this for NOP only, but pause
6109                  * is rep not, check this here
6110                  */
6111                 if (info->rep_prefix != REPE_PREFIX)
6112                         goto out;
6113                 break;
6114         case SVM_EXIT_IOIO: {
6115                 u64 exit_info;
6116                 u32 bytes;
6117
6118                 if (info->intercept == x86_intercept_in ||
6119                     info->intercept == x86_intercept_ins) {
6120                         exit_info = ((info->src_val & 0xffff) << 16) |
6121                                 SVM_IOIO_TYPE_MASK;
6122                         bytes = info->dst_bytes;
6123                 } else {
6124                         exit_info = (info->dst_val & 0xffff) << 16;
6125                         bytes = info->src_bytes;
6126                 }
6127
6128                 if (info->intercept == x86_intercept_outs ||
6129                     info->intercept == x86_intercept_ins)
6130                         exit_info |= SVM_IOIO_STR_MASK;
6131
6132                 if (info->rep_prefix)
6133                         exit_info |= SVM_IOIO_REP_MASK;
6134
6135                 bytes = min(bytes, 4u);
6136
6137                 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
6138
6139                 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
6140
6141                 vmcb->control.exit_info_1 = exit_info;
6142                 vmcb->control.exit_info_2 = info->next_rip;
6143
6144                 break;
6145         }
6146         default:
6147                 break;
6148         }
6149
6150         /* TODO: Advertise NRIPS to guest hypervisor unconditionally */
6151         if (static_cpu_has(X86_FEATURE_NRIPS))
6152                 vmcb->control.next_rip  = info->next_rip;
6153         vmcb->control.exit_code = icpt_info.exit_code;
6154         vmexit = nested_svm_exit_handled(svm);
6155
6156         ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
6157                                            : X86EMUL_CONTINUE;
6158
6159 out:
6160         return ret;
6161 }
6162
/*
 * Briefly open an interrupt window: enable local interrupts, execute one
 * instruction, then disable them again.
 */
static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
{
	local_irq_enable();
	/*
	 * At least one instruction has to run with interrupts enabled;
	 * otherwise the interrupt shadow would delay the timer interrupt.
	 */
	asm("nop");
	local_irq_disable();
}
6173
6174 static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
6175 {
6176         if (pause_filter_thresh)
6177                 shrink_ple_window(vcpu);
6178 }
6179
/*
 * Re-run the AVIC APIC-ID, DFR and LDR update handlers for @vcpu.  The DFR
 * and LDR handlers are skipped when the APIC-ID update returns non-zero.
 */
static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
{
	if (avic_handle_apic_id_update(vcpu) == 0) {
		avic_handle_dfr_update(vcpu);
		avic_handle_ldr_update(vcpu);
	}
}
6187
6188 static void svm_setup_mce(struct kvm_vcpu *vcpu)
6189 {
6190         /* [63:9] are reserved. */
6191         vcpu->arch.mcg_cap &= 0x1ff;
6192 }
6193
6194 static int svm_smi_allowed(struct kvm_vcpu *vcpu)
6195 {
6196         struct vcpu_svm *svm = to_svm(vcpu);
6197
6198         /* Per APM Vol.2 15.22.2 "Response to SMI" */
6199         if (!gif_set(svm))
6200                 return 0;
6201
6202         if (is_guest_mode(&svm->vcpu) &&
6203             svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
6204                 /* TODO: Might need to set exit_info_1 and exit_info_2 here */
6205                 svm->vmcb->control.exit_code = SVM_EXIT_SMI;
6206                 svm->nested.exit_required = true;
6207                 return 0;
6208         }
6209
6210         return 1;
6211 }
6212
6213 static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
6214 {
6215         struct vcpu_svm *svm = to_svm(vcpu);
6216         int ret;
6217
6218         if (is_guest_mode(vcpu)) {
6219                 /* FED8h - SVM Guest */
6220                 put_smstate(u64, smstate, 0x7ed8, 1);
6221                 /* FEE0h - SVM Guest VMCB Physical Address */
6222                 put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
6223
6224                 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
6225                 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
6226                 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
6227
6228                 ret = nested_svm_vmexit(svm);
6229                 if (ret)
6230                         return ret;
6231         }
6232         return 0;
6233 }
6234
6235 static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
6236 {
6237         struct vcpu_svm *svm = to_svm(vcpu);
6238         struct vmcb *nested_vmcb;
6239         struct page *page;
6240         u64 guest;
6241         u64 vmcb;
6242
6243         guest = GET_SMSTATE(u64, smstate, 0x7ed8);
6244         vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
6245
6246         if (guest) {
6247                 nested_vmcb = nested_svm_map(svm, vmcb, &page);
6248                 if (!nested_vmcb)
6249                         return 1;
6250                 enter_svm_guest_mode(svm, vmcb, nested_vmcb, page);
6251         }
6252         return 0;
6253 }
6254
6255 static int enable_smi_window(struct kvm_vcpu *vcpu)
6256 {
6257         struct vcpu_svm *svm = to_svm(vcpu);
6258
6259         if (!gif_set(svm)) {
6260                 if (vgif_enabled(svm))
6261                         set_intercept(svm, INTERCEPT_STGI);
6262                 /* STGI will cause a vm exit */
6263                 return 1;
6264         }
6265         return 0;
6266 }
6267
6268 static int sev_asid_new(void)
6269 {
6270         int pos;
6271
6272         /*
6273          * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid.
6274          */
6275         pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1);
6276         if (pos >= max_sev_asid)
6277                 return -EBUSY;
6278
6279         set_bit(pos, sev_asid_bitmap);
6280         return pos + 1;
6281 }
6282
6283 static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
6284 {
6285         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6286         int asid, ret;
6287
6288         ret = -EBUSY;
6289         if (unlikely(sev->active))
6290                 return ret;
6291
6292         asid = sev_asid_new();
6293         if (asid < 0)
6294                 return ret;
6295
6296         ret = sev_platform_init(&argp->error);
6297         if (ret)
6298                 goto e_free;
6299
6300         sev->active = true;
6301         sev->asid = asid;
6302         INIT_LIST_HEAD(&sev->regions_list);
6303
6304         return 0;
6305
6306 e_free:
6307         __sev_asid_free(asid);
6308         return ret;
6309 }
6310
6311 static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
6312 {
6313         struct sev_data_activate *data;
6314         int asid = sev_get_asid(kvm);
6315         int ret;
6316
6317         wbinvd_on_all_cpus();
6318
6319         ret = sev_guest_df_flush(error);
6320         if (ret)
6321                 return ret;
6322
6323         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6324         if (!data)
6325                 return -ENOMEM;
6326
6327         /* activate ASID on the given handle */
6328         data->handle = handle;
6329         data->asid   = asid;
6330         ret = sev_guest_activate(data, error);
6331         kfree(data);
6332
6333         return ret;
6334 }
6335
6336 static int __sev_issue_cmd(int fd, int id, void *data, int *error)
6337 {
6338         struct fd f;
6339         int ret;
6340
6341         f = fdget(fd);
6342         if (!f.file)
6343                 return -EBADF;
6344
6345         ret = sev_issue_cmd_external_user(f.file, id, data, error);
6346
6347         fdput(f);
6348         return ret;
6349 }
6350
6351 static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error)
6352 {
6353         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6354
6355         return __sev_issue_cmd(sev->fd, id, data, error);
6356 }
6357
6358 static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
6359 {
6360         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6361         struct sev_data_launch_start *start;
6362         struct kvm_sev_launch_start params;
6363         void *dh_blob, *session_blob;
6364         int *error = &argp->error;
6365         int ret;
6366
6367         if (!sev_guest(kvm))
6368                 return -ENOTTY;
6369
6370         if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
6371                 return -EFAULT;
6372
6373         start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
6374         if (!start)
6375                 return -ENOMEM;
6376
6377         dh_blob = NULL;
6378         if (params.dh_uaddr) {
6379                 dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len);
6380                 if (IS_ERR(dh_blob)) {
6381                         ret = PTR_ERR(dh_blob);
6382                         goto e_free;
6383                 }
6384
6385                 start->dh_cert_address = __sme_set(__pa(dh_blob));
6386                 start->dh_cert_len = params.dh_len;
6387         }
6388
6389         session_blob = NULL;
6390         if (params.session_uaddr) {
6391                 session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len);
6392                 if (IS_ERR(session_blob)) {
6393                         ret = PTR_ERR(session_blob);
6394                         goto e_free_dh;
6395                 }
6396
6397                 start->session_address = __sme_set(__pa(session_blob));
6398                 start->session_len = params.session_len;
6399         }
6400
6401         start->handle = params.handle;
6402         start->policy = params.policy;
6403
6404         /* create memory encryption context */
6405         ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error);
6406         if (ret)
6407                 goto e_free_session;
6408
6409         /* Bind ASID to this guest */
6410         ret = sev_bind_asid(kvm, start->handle, error);
6411         if (ret)
6412                 goto e_free_session;
6413
6414         /* return handle to userspace */
6415         params.handle = start->handle;
6416         if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params))) {
6417                 sev_unbind_asid(kvm, start->handle);
6418                 ret = -EFAULT;
6419                 goto e_free_session;
6420         }
6421
6422         sev->handle = start->handle;
6423         sev->fd = argp->sev_fd;
6424
6425 e_free_session:
6426         kfree(session_blob);
6427 e_free_dh:
6428         kfree(dh_blob);
6429 e_free:
6430         kfree(start);
6431         return ret;
6432 }
6433
6434 static unsigned long get_num_contig_pages(unsigned long idx,
6435                                 struct page **inpages, unsigned long npages)
6436 {
6437         unsigned long paddr, next_paddr;
6438         unsigned long i = idx + 1, pages = 1;
6439
6440         /* find the number of contiguous pages starting from idx */
6441         paddr = __sme_page_pa(inpages[idx]);
6442         while (i < npages) {
6443                 next_paddr = __sme_page_pa(inpages[i++]);
6444                 if ((paddr + PAGE_SIZE) == next_paddr) {
6445                         pages++;
6446                         paddr = next_paddr;
6447                         continue;
6448                 }
6449                 break;
6450         }
6451
6452         return pages;
6453 }
6454
6455 static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
6456 {
6457         unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
6458         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6459         struct kvm_sev_launch_update_data params;
6460         struct sev_data_launch_update_data *data;
6461         struct page **inpages;
6462         int ret;
6463
6464         if (!sev_guest(kvm))
6465                 return -ENOTTY;
6466
6467         if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
6468                 return -EFAULT;
6469
6470         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6471         if (!data)
6472                 return -ENOMEM;
6473
6474         vaddr = params.uaddr;
6475         size = params.len;
6476         vaddr_end = vaddr + size;
6477
6478         /* Lock the user memory. */
6479         inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1);
6480         if (!inpages) {
6481                 ret = -ENOMEM;
6482                 goto e_free;
6483         }
6484
6485         /*
6486          * The LAUNCH_UPDATE command will perform in-place encryption of the
6487          * memory content (i.e it will write the same memory region with C=1).
6488          * It's possible that the cache may contain the data with C=0, i.e.,
6489          * unencrypted so invalidate it first.
6490          */
6491         sev_clflush_pages(inpages, npages);
6492
6493         for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) {
6494                 int offset, len;
6495
6496                 /*
6497                  * If the user buffer is not page-aligned, calculate the offset
6498                  * within the page.
6499                  */
6500                 offset = vaddr & (PAGE_SIZE - 1);
6501
6502                 /* Calculate the number of pages that can be encrypted in one go. */
6503                 pages = get_num_contig_pages(i, inpages, npages);
6504
6505                 len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size);
6506
6507                 data->handle = sev->handle;
6508                 data->len = len;
6509                 data->address = __sme_page_pa(inpages[i]) + offset;
6510                 ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error);
6511                 if (ret)
6512                         goto e_unpin;
6513
6514                 size -= len;
6515                 next_vaddr = vaddr + len;
6516         }
6517
6518 e_unpin:
6519         /* content of memory is updated, mark pages dirty */
6520         for (i = 0; i < npages; i++) {
6521                 set_page_dirty_lock(inpages[i]);
6522                 mark_page_accessed(inpages[i]);
6523         }
6524         /* unlock the user pages */
6525         sev_unpin_memory(kvm, inpages, npages);
6526 e_free:
6527         kfree(data);
6528         return ret;
6529 }
6530
6531 static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp)
6532 {
6533         void __user *measure = (void __user *)(uintptr_t)argp->data;
6534         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6535         struct sev_data_launch_measure *data;
6536         struct kvm_sev_launch_measure params;
6537         void __user *p = NULL;
6538         void *blob = NULL;
6539         int ret;
6540
6541         if (!sev_guest(kvm))
6542                 return -ENOTTY;
6543
6544         if (copy_from_user(&params, measure, sizeof(params)))
6545                 return -EFAULT;
6546
6547         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6548         if (!data)
6549                 return -ENOMEM;
6550
6551         /* User wants to query the blob length */
6552         if (!params.len)
6553                 goto cmd;
6554
6555         p = (void __user *)(uintptr_t)params.uaddr;
6556         if (p) {
6557                 if (params.len > SEV_FW_BLOB_MAX_SIZE) {
6558                         ret = -EINVAL;
6559                         goto e_free;
6560                 }
6561
6562                 ret = -ENOMEM;
6563                 blob = kmalloc(params.len, GFP_KERNEL);
6564                 if (!blob)
6565                         goto e_free;
6566
6567                 data->address = __psp_pa(blob);
6568                 data->len = params.len;
6569         }
6570
6571 cmd:
6572         data->handle = sev->handle;
6573         ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error);
6574
6575         /*
6576          * If we query the session length, FW responded with expected data.
6577          */
6578         if (!params.len)
6579                 goto done;
6580
6581         if (ret)
6582                 goto e_free_blob;
6583
6584         if (blob) {
6585                 if (copy_to_user(p, blob, params.len))
6586                         ret = -EFAULT;
6587         }
6588
6589 done:
6590         params.len = data->len;
6591         if (copy_to_user(measure, &params, sizeof(params)))
6592                 ret = -EFAULT;
6593 e_free_blob:
6594         kfree(blob);
6595 e_free:
6596         kfree(data);
6597         return ret;
6598 }
6599
6600 static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp)
6601 {
6602         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6603         struct sev_data_launch_finish *data;
6604         int ret;
6605
6606         if (!sev_guest(kvm))
6607                 return -ENOTTY;
6608
6609         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6610         if (!data)
6611                 return -ENOMEM;
6612
6613         data->handle = sev->handle;
6614         ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error);
6615
6616         kfree(data);
6617         return ret;
6618 }
6619
6620 static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp)
6621 {
6622         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6623         struct kvm_sev_guest_status params;
6624         struct sev_data_guest_status *data;
6625         int ret;
6626
6627         if (!sev_guest(kvm))
6628                 return -ENOTTY;
6629
6630         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6631         if (!data)
6632                 return -ENOMEM;
6633
6634         data->handle = sev->handle;
6635         ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error);
6636         if (ret)
6637                 goto e_free;
6638
6639         params.policy = data->policy;
6640         params.state = data->state;
6641         params.handle = data->handle;
6642
6643         if (copy_to_user((void __user *)(uintptr_t)argp->data, &params, sizeof(params)))
6644                 ret = -EFAULT;
6645 e_free:
6646         kfree(data);
6647         return ret;
6648 }
6649
6650 static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src,
6651                                unsigned long dst, int size,
6652                                int *error, bool enc)
6653 {
6654         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6655         struct sev_data_dbg *data;
6656         int ret;
6657
6658         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6659         if (!data)
6660                 return -ENOMEM;
6661
6662         data->handle = sev->handle;
6663         data->dst_addr = dst;
6664         data->src_addr = src;
6665         data->len = size;
6666
6667         ret = sev_issue_cmd(kvm,
6668                             enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT,
6669                             data, error);
6670         kfree(data);
6671         return ret;
6672 }
6673
6674 static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr,
6675                              unsigned long dst_paddr, int sz, int *err)
6676 {
6677         int offset;
6678
6679         /*
6680          * Its safe to read more than we are asked, caller should ensure that
6681          * destination has enough space.
6682          */
6683         src_paddr = round_down(src_paddr, 16);
6684         offset = src_paddr & 15;
6685         sz = round_up(sz + offset, 16);
6686
6687         return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false);
6688 }
6689
6690 static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr,
6691                                   unsigned long __user dst_uaddr,
6692                                   unsigned long dst_paddr,
6693                                   int size, int *err)
6694 {
6695         struct page *tpage = NULL;
6696         int ret, offset;
6697
6698         /* if inputs are not 16-byte then use intermediate buffer */
6699         if (!IS_ALIGNED(dst_paddr, 16) ||
6700             !IS_ALIGNED(paddr,     16) ||
6701             !IS_ALIGNED(size,      16)) {
6702                 tpage = (void *)alloc_page(GFP_KERNEL);
6703                 if (!tpage)
6704                         return -ENOMEM;
6705
6706                 dst_paddr = __sme_page_pa(tpage);
6707         }
6708
6709         ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err);
6710         if (ret)
6711                 goto e_free;
6712
6713         if (tpage) {
6714                 offset = paddr & 15;
6715                 if (copy_to_user((void __user *)(uintptr_t)dst_uaddr,
6716                                  page_address(tpage) + offset, size))
6717                         ret = -EFAULT;
6718         }
6719
6720 e_free:
6721         if (tpage)
6722                 __free_page(tpage);
6723
6724         return ret;
6725 }
6726
6727 static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr,
6728                                   unsigned long __user vaddr,
6729                                   unsigned long dst_paddr,
6730                                   unsigned long __user dst_vaddr,
6731                                   int size, int *error)
6732 {
6733         struct page *src_tpage = NULL;
6734         struct page *dst_tpage = NULL;
6735         int ret, len = size;
6736
6737         /* If source buffer is not aligned then use an intermediate buffer */
6738         if (!IS_ALIGNED(vaddr, 16)) {
6739                 src_tpage = alloc_page(GFP_KERNEL);
6740                 if (!src_tpage)
6741                         return -ENOMEM;
6742
6743                 if (copy_from_user(page_address(src_tpage),
6744                                 (void __user *)(uintptr_t)vaddr, size)) {
6745                         __free_page(src_tpage);
6746                         return -EFAULT;
6747                 }
6748
6749                 paddr = __sme_page_pa(src_tpage);
6750         }
6751
6752         /*
6753          *  If destination buffer or length is not aligned then do read-modify-write:
6754          *   - decrypt destination in an intermediate buffer
6755          *   - copy the source buffer in an intermediate buffer
6756          *   - use the intermediate buffer as source buffer
6757          */
6758         if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) {
6759                 int dst_offset;
6760
6761                 dst_tpage = alloc_page(GFP_KERNEL);
6762                 if (!dst_tpage) {
6763                         ret = -ENOMEM;
6764                         goto e_free;
6765                 }
6766
6767                 ret = __sev_dbg_decrypt(kvm, dst_paddr,
6768                                         __sme_page_pa(dst_tpage), size, error);
6769                 if (ret)
6770                         goto e_free;
6771
6772                 /*
6773                  *  If source is kernel buffer then use memcpy() otherwise
6774                  *  copy_from_user().
6775                  */
6776                 dst_offset = dst_paddr & 15;
6777
6778                 if (src_tpage)
6779                         memcpy(page_address(dst_tpage) + dst_offset,
6780                                page_address(src_tpage), size);
6781                 else {
6782                         if (copy_from_user(page_address(dst_tpage) + dst_offset,
6783                                            (void __user *)(uintptr_t)vaddr, size)) {
6784                                 ret = -EFAULT;
6785                                 goto e_free;
6786                         }
6787                 }
6788
6789                 paddr = __sme_page_pa(dst_tpage);
6790                 dst_paddr = round_down(dst_paddr, 16);
6791                 len = round_up(size, 16);
6792         }
6793
6794         ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true);
6795
6796 e_free:
6797         if (src_tpage)
6798                 __free_page(src_tpage);
6799         if (dst_tpage)
6800                 __free_page(dst_tpage);
6801         return ret;
6802 }
6803
6804 static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec)
6805 {
6806         unsigned long vaddr, vaddr_end, next_vaddr;
6807         unsigned long dst_vaddr;
6808         struct page **src_p, **dst_p;
6809         struct kvm_sev_dbg debug;
6810         unsigned long n;
6811         unsigned int size;
6812         int ret;
6813
6814         if (!sev_guest(kvm))
6815                 return -ENOTTY;
6816
6817         if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
6818                 return -EFAULT;
6819
6820         if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
6821                 return -EINVAL;
6822         if (!debug.dst_uaddr)
6823                 return -EINVAL;
6824
6825         vaddr = debug.src_uaddr;
6826         size = debug.len;
6827         vaddr_end = vaddr + size;
6828         dst_vaddr = debug.dst_uaddr;
6829
6830         for (; vaddr < vaddr_end; vaddr = next_vaddr) {
6831                 int len, s_off, d_off;
6832
6833                 /* lock userspace source and destination page */
6834                 src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0);
6835                 if (!src_p)
6836                         return -EFAULT;
6837
6838                 dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1);
6839                 if (!dst_p) {
6840                         sev_unpin_memory(kvm, src_p, n);
6841                         return -EFAULT;
6842                 }
6843
6844                 /*
6845                  * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the
6846                  * memory content (i.e it will write the same memory region with C=1).
6847                  * It's possible that the cache may contain the data with C=0, i.e.,
6848                  * unencrypted so invalidate it first.
6849                  */
6850                 sev_clflush_pages(src_p, 1);
6851                 sev_clflush_pages(dst_p, 1);
6852
6853                 /*
6854                  * Since user buffer may not be page aligned, calculate the
6855                  * offset within the page.
6856                  */
6857                 s_off = vaddr & ~PAGE_MASK;
6858                 d_off = dst_vaddr & ~PAGE_MASK;
6859                 len = min_t(size_t, (PAGE_SIZE - s_off), size);
6860
6861                 if (dec)
6862                         ret = __sev_dbg_decrypt_user(kvm,
6863                                                      __sme_page_pa(src_p[0]) + s_off,
6864                                                      dst_vaddr,
6865                                                      __sme_page_pa(dst_p[0]) + d_off,
6866                                                      len, &argp->error);
6867                 else
6868                         ret = __sev_dbg_encrypt_user(kvm,
6869                                                      __sme_page_pa(src_p[0]) + s_off,
6870                                                      vaddr,
6871                                                      __sme_page_pa(dst_p[0]) + d_off,
6872                                                      dst_vaddr,
6873                                                      len, &argp->error);
6874
6875                 sev_unpin_memory(kvm, src_p, n);
6876                 sev_unpin_memory(kvm, dst_p, n);
6877
6878                 if (ret)
6879                         goto err;
6880
6881                 next_vaddr = vaddr + len;
6882                 dst_vaddr = dst_vaddr + len;
6883                 size -= len;
6884         }
6885 err:
6886         return ret;
6887 }
6888
6889 static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
6890 {
6891         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
6892         struct sev_data_launch_secret *data;
6893         struct kvm_sev_launch_secret params;
6894         struct page **pages;
6895         void *blob, *hdr;
6896         unsigned long n;
6897         int ret, offset;
6898
6899         if (!sev_guest(kvm))
6900                 return -ENOTTY;
6901
6902         if (copy_from_user(&params, (void __user *)(uintptr_t)argp->data, sizeof(params)))
6903                 return -EFAULT;
6904
6905         pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1);
6906         if (!pages)
6907                 return -ENOMEM;
6908
6909         /*
6910          * The secret must be copied into contiguous memory region, lets verify
6911          * that userspace memory pages are contiguous before we issue command.
6912          */
6913         if (get_num_contig_pages(0, pages, n) != n) {
6914                 ret = -EINVAL;
6915                 goto e_unpin_memory;
6916         }
6917
6918         ret = -ENOMEM;
6919         data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
6920         if (!data)
6921                 goto e_unpin_memory;
6922
6923         offset = params.guest_uaddr & (PAGE_SIZE - 1);
6924         data->guest_address = __sme_page_pa(pages[0]) + offset;
6925         data->guest_len = params.guest_len;
6926
6927         blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len);
6928         if (IS_ERR(blob)) {
6929                 ret = PTR_ERR(blob);
6930                 goto e_free;
6931         }
6932
6933         data->trans_address = __psp_pa(blob);
6934         data->trans_len = params.trans_len;
6935
6936         hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len);
6937         if (IS_ERR(hdr)) {
6938                 ret = PTR_ERR(hdr);
6939                 goto e_free_blob;
6940         }
6941         data->hdr_address = __psp_pa(hdr);
6942         data->hdr_len = params.hdr_len;
6943
6944         data->handle = sev->handle;
6945         ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error);
6946
6947         kfree(hdr);
6948
6949 e_free_blob:
6950         kfree(blob);
6951 e_free:
6952         kfree(data);
6953 e_unpin_memory:
6954         sev_unpin_memory(kvm, pages, n);
6955         return ret;
6956 }
6957
6958 static int svm_mem_enc_op(struct kvm *kvm, void __user *argp)
6959 {
6960         struct kvm_sev_cmd sev_cmd;
6961         int r;
6962
6963         if (!svm_sev_enabled())
6964                 return -ENOTTY;
6965
6966         if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd)))
6967                 return -EFAULT;
6968
6969         mutex_lock(&kvm->lock);
6970
6971         switch (sev_cmd.id) {
6972         case KVM_SEV_INIT:
6973                 r = sev_guest_init(kvm, &sev_cmd);
6974                 break;
6975         case KVM_SEV_LAUNCH_START:
6976                 r = sev_launch_start(kvm, &sev_cmd);
6977                 break;
6978         case KVM_SEV_LAUNCH_UPDATE_DATA:
6979                 r = sev_launch_update_data(kvm, &sev_cmd);
6980                 break;
6981         case KVM_SEV_LAUNCH_MEASURE:
6982                 r = sev_launch_measure(kvm, &sev_cmd);
6983                 break;
6984         case KVM_SEV_LAUNCH_FINISH:
6985                 r = sev_launch_finish(kvm, &sev_cmd);
6986                 break;
6987         case KVM_SEV_GUEST_STATUS:
6988                 r = sev_guest_status(kvm, &sev_cmd);
6989                 break;
6990         case KVM_SEV_DBG_DECRYPT:
6991                 r = sev_dbg_crypt(kvm, &sev_cmd, true);
6992                 break;
6993         case KVM_SEV_DBG_ENCRYPT:
6994                 r = sev_dbg_crypt(kvm, &sev_cmd, false);
6995                 break;
6996         case KVM_SEV_LAUNCH_SECRET:
6997                 r = sev_launch_secret(kvm, &sev_cmd);
6998                 break;
6999         default:
7000                 r = -EINVAL;
7001                 goto out;
7002         }
7003
7004         if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd)))
7005                 r = -EFAULT;
7006
7007 out:
7008         mutex_unlock(&kvm->lock);
7009         return r;
7010 }
7011
7012 static int svm_register_enc_region(struct kvm *kvm,
7013                                    struct kvm_enc_region *range)
7014 {
7015         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
7016         struct enc_region *region;
7017         int ret = 0;
7018
7019         if (!sev_guest(kvm))
7020                 return -ENOTTY;
7021
7022         if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
7023                 return -EINVAL;
7024
7025         region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
7026         if (!region)
7027                 return -ENOMEM;
7028
7029         region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
7030         if (!region->pages) {
7031                 ret = -ENOMEM;
7032                 goto e_free;
7033         }
7034
7035         /*
7036          * The guest may change the memory encryption attribute from C=0 -> C=1
7037          * or vice versa for this memory range. Lets make sure caches are
7038          * flushed to ensure that guest data gets written into memory with
7039          * correct C-bit.
7040          */
7041         sev_clflush_pages(region->pages, region->npages);
7042
7043         region->uaddr = range->addr;
7044         region->size = range->size;
7045
7046         mutex_lock(&kvm->lock);
7047         list_add_tail(&region->list, &sev->regions_list);
7048         mutex_unlock(&kvm->lock);
7049
7050         return ret;
7051
7052 e_free:
7053         kfree(region);
7054         return ret;
7055 }
7056
7057 static struct enc_region *
7058 find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
7059 {
7060         struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
7061         struct list_head *head = &sev->regions_list;
7062         struct enc_region *i;
7063
7064         list_for_each_entry(i, head, list) {
7065                 if (i->uaddr == range->addr &&
7066                     i->size == range->size)
7067                         return i;
7068         }
7069
7070         return NULL;
7071 }
7072
7073
7074 static int svm_unregister_enc_region(struct kvm *kvm,
7075                                      struct kvm_enc_region *range)
7076 {
7077         struct enc_region *region;
7078         int ret;
7079
7080         mutex_lock(&kvm->lock);
7081
7082         if (!sev_guest(kvm)) {
7083                 ret = -ENOTTY;
7084                 goto failed;
7085         }
7086
7087         region = find_enc_region(kvm, range);
7088         if (!region) {
7089                 ret = -EINVAL;
7090                 goto failed;
7091         }
7092
7093         __unregister_enc_region_locked(kvm, region);
7094
7095         mutex_unlock(&kvm->lock);
7096         return 0;
7097
7098 failed:
7099         mutex_unlock(&kvm->lock);
7100         return ret;
7101 }
7102
/* Enlightened VMCS is not supported here: always report version 0. */
static uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
{
        const uint16_t evmcs_unsupported = 0;

        return evmcs_unsupported;
}
7108
/*
 * Enlightened VMCS is an Intel-only feature; the request is always refused
 * with -ENODEV and @vmcs_version is left untouched.
 */
static int nested_enable_evmcs(struct kvm_vcpu *vcpu, uint16_t *vmcs_version)
{
        return -ENODEV;
}
7115
7116 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
7117 {
7118         bool is_user, smap;
7119
7120         is_user = svm_get_cpl(vcpu) == 3;
7121         smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
7122
7123         /*
7124          * Detect and workaround Errata 1096 Fam_17h_00_0Fh
7125          *
7126          * In non SEV guest, hypervisor will be able to read the guest
7127          * memory to decode the instruction pointer when insn_len is zero
7128          * so we return true to indicate that decoding is possible.
7129          *
7130          * But in the SEV guest, the guest memory is encrypted with the
7131          * guest specific key and hypervisor will not be able to decode the
7132          * instruction pointer so we will not able to workaround it. Lets
7133          * print the error and request to kill the guest.
7134          */
7135         if (is_user && smap) {
7136                 if (!sev_guest(vcpu->kvm))
7137                         return true;
7138
7139                 pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
7140                 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
7141         }
7142
7143         return false;
7144 }
7145
7146 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
7147         .cpu_has_kvm_support = has_svm,
7148         .disabled_by_bios = is_disabled,
7149         .hardware_setup = svm_hardware_setup,
7150         .hardware_unsetup = svm_hardware_unsetup,
7151         .check_processor_compatibility = svm_check_processor_compat,
7152         .hardware_enable = svm_hardware_enable,
7153         .hardware_disable = svm_hardware_disable,
7154         .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
7155         .has_emulated_msr = svm_has_emulated_msr,
7156
7157         .vcpu_create = svm_create_vcpu,
7158         .vcpu_free = svm_free_vcpu,
7159         .vcpu_reset = svm_vcpu_reset,
7160
7161         .vm_alloc = svm_vm_alloc,
7162         .vm_free = svm_vm_free,
7163         .vm_init = avic_vm_init,
7164         .vm_destroy = svm_vm_destroy,
7165
7166         .prepare_guest_switch = svm_prepare_guest_switch,
7167         .vcpu_load = svm_vcpu_load,
7168         .vcpu_put = svm_vcpu_put,
7169         .vcpu_blocking = svm_vcpu_blocking,
7170         .vcpu_unblocking = svm_vcpu_unblocking,
7171
7172         .update_bp_intercept = update_bp_intercept,
7173         .get_msr_feature = svm_get_msr_feature,
7174         .get_msr = svm_get_msr,
7175         .set_msr = svm_set_msr,
7176         .get_segment_base = svm_get_segment_base,
7177         .get_segment = svm_get_segment,
7178         .set_segment = svm_set_segment,
7179         .get_cpl = svm_get_cpl,
7180         .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
7181         .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
7182         .decache_cr3 = svm_decache_cr3,
7183         .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
7184         .set_cr0 = svm_set_cr0,
7185         .set_cr3 = svm_set_cr3,
7186         .set_cr4 = svm_set_cr4,
7187         .set_efer = svm_set_efer,
7188         .get_idt = svm_get_idt,
7189         .set_idt = svm_set_idt,
7190         .get_gdt = svm_get_gdt,
7191         .set_gdt = svm_set_gdt,
7192         .get_dr6 = svm_get_dr6,
7193         .set_dr6 = svm_set_dr6,
7194         .set_dr7 = svm_set_dr7,
7195         .sync_dirty_debug_regs = svm_sync_dirty_debug_regs,
7196         .cache_reg = svm_cache_reg,
7197         .get_rflags = svm_get_rflags,
7198         .set_rflags = svm_set_rflags,
7199
7200         .tlb_flush = svm_flush_tlb,
7201         .tlb_flush_gva = svm_flush_tlb_gva,
7202
7203         .run = svm_vcpu_run,
7204         .handle_exit = handle_exit,
7205         .skip_emulated_instruction = skip_emulated_instruction,
7206         .set_interrupt_shadow = svm_set_interrupt_shadow,
7207         .get_interrupt_shadow = svm_get_interrupt_shadow,
7208         .patch_hypercall = svm_patch_hypercall,
7209         .set_irq = svm_set_irq,
7210         .set_nmi = svm_inject_nmi,
7211         .queue_exception = svm_queue_exception,
7212         .cancel_injection = svm_cancel_injection,
7213         .interrupt_allowed = svm_interrupt_allowed,
7214         .nmi_allowed = svm_nmi_allowed,
7215         .get_nmi_mask = svm_get_nmi_mask,
7216         .set_nmi_mask = svm_set_nmi_mask,
7217         .enable_nmi_window = enable_nmi_window,
7218         .enable_irq_window = enable_irq_window,
7219         .update_cr8_intercept = update_cr8_intercept,
7220         .set_virtual_apic_mode = svm_set_virtual_apic_mode,
7221         .get_enable_apicv = svm_get_enable_apicv,
7222         .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
7223         .load_eoi_exitmap = svm_load_eoi_exitmap,
7224         .hwapic_irr_update = svm_hwapic_irr_update,
7225         .hwapic_isr_update = svm_hwapic_isr_update,
7226         .sync_pir_to_irr = kvm_lapic_find_highest_irr,
7227         .apicv_post_state_restore = avic_post_state_restore,
7228
7229         .set_tss_addr = svm_set_tss_addr,
7230         .set_identity_map_addr = svm_set_identity_map_addr,
7231         .get_tdp_level = get_npt_level,
7232         .get_mt_mask = svm_get_mt_mask,
7233
7234         .get_exit_info = svm_get_exit_info,
7235
7236         .get_lpage_level = svm_get_lpage_level,
7237
7238         .cpuid_update = svm_cpuid_update,
7239
7240         .rdtscp_supported = svm_rdtscp_supported,
7241         .invpcid_supported = svm_invpcid_supported,
7242         .mpx_supported = svm_mpx_supported,
7243         .xsaves_supported = svm_xsaves_supported,
7244         .umip_emulated = svm_umip_emulated,
7245         .pt_supported = svm_pt_supported,
7246
7247         .set_supported_cpuid = svm_set_supported_cpuid,
7248
7249         .has_wbinvd_exit = svm_has_wbinvd_exit,
7250
7251         .read_l1_tsc_offset = svm_read_l1_tsc_offset,
7252         .write_l1_tsc_offset = svm_write_l1_tsc_offset,
7253
7254         .set_tdp_cr3 = set_tdp_cr3,
7255
7256         .check_intercept = svm_check_intercept,
7257         .handle_external_intr = svm_handle_external_intr,
7258
7259         .request_immediate_exit = __kvm_request_immediate_exit,
7260
7261         .sched_in = svm_sched_in,
7262
7263         .pmu_ops = &amd_pmu_ops,
7264         .deliver_posted_interrupt = svm_deliver_avic_intr,
7265         .update_pi_irte = svm_update_pi_irte,
7266         .setup_mce = svm_setup_mce,
7267
7268         .smi_allowed = svm_smi_allowed,
7269         .pre_enter_smm = svm_pre_enter_smm,
7270         .pre_leave_smm = svm_pre_leave_smm,
7271         .enable_smi_window = enable_smi_window,
7272
7273         .mem_enc_op = svm_mem_enc_op,
7274         .mem_enc_reg_region = svm_register_enc_region,
7275         .mem_enc_unreg_region = svm_unregister_enc_region,
7276
7277         .nested_enable_evmcs = nested_enable_evmcs,
7278         .nested_get_evmcs_version = nested_get_evmcs_version,
7279
7280         .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
7281 };
7282
7283 static int __init svm_init(void)
7284 {
7285         return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
7286                         __alignof__(struct vcpu_svm), THIS_MODULE);
7287 }
7288
7289 static void __exit svm_exit(void)
7290 {
7291         kvm_exit();
7292 }
7293
7294 module_init(svm_init)
7295 module_exit(svm_exit)