]> asedeno.scripts.mit.edu Git - linux.git/commitdiff
Merge tag 'kvm-s390-next-4.12-1' of git://git.kernel.org/pub/scm/linux/kernel/git...
authorRadim Krčmář <rkrcmar@redhat.com>
Tue, 11 Apr 2017 18:54:40 +0000 (20:54 +0200)
committerRadim Krčmář <rkrcmar@redhat.com>
Tue, 11 Apr 2017 18:54:40 +0000 (20:54 +0200)
From: Christian Borntraeger <borntraeger@de.ibm.com>

KVM: s390: features for 4.12

1. guarded storage support for guests
   This contains an s390 base Linux feature branch that is necessary
   to implement the KVM part
2. Provide an interface to implement adapter interruption suppression
   which is necessary for proper zPCI support
3. Use more defines instead of numbers
4. Provide logging for lazy enablement of runtime instrumentation

62 files changed:
Documentation/virtual/kvm/api.txt
Documentation/virtual/kvm/hypercalls.txt
arch/arm/include/asm/kvm_host.h
arch/arm/include/uapi/asm/kvm.h
arch/arm/kvm/arm.c
arch/arm64/include/asm/kvm_host.h
arch/arm64/include/uapi/asm/kvm.h
arch/mips/Kconfig
arch/mips/include/asm/cpu-features.h
arch/mips/include/asm/cpu-info.h
arch/mips/include/asm/cpu.h
arch/mips/include/asm/kvm_host.h
arch/mips/include/asm/maar.h
arch/mips/include/asm/mipsregs.h
arch/mips/include/asm/tlb.h
arch/mips/include/uapi/asm/inst.h
arch/mips/include/uapi/asm/kvm.h
arch/mips/kernel/cpu-probe.c
arch/mips/kernel/time.c
arch/mips/kvm/Kconfig
arch/mips/kvm/Makefile
arch/mips/kvm/emulate.c
arch/mips/kvm/entry.c
arch/mips/kvm/hypcall.c [new file with mode: 0644]
arch/mips/kvm/interrupt.h
arch/mips/kvm/mips.c
arch/mips/kvm/mmu.c
arch/mips/kvm/tlb.c
arch/mips/kvm/trace.h
arch/mips/kvm/trap_emul.c
arch/mips/kvm/vz.c [new file with mode: 0644]
arch/mips/mm/cache.c
arch/mips/mm/init.c
arch/powerpc/include/asm/kvm_host.h
arch/powerpc/include/uapi/asm/kvm.h
arch/powerpc/kvm/powerpc.c
arch/x86/include/asm/kvm_host.h
arch/x86/include/asm/kvm_page_track.h
arch/x86/include/asm/vmx.h
arch/x86/include/uapi/asm/kvm.h
arch/x86/include/uapi/asm/vmx.h
arch/x86/kvm/Kconfig
arch/x86/kvm/Makefile
arch/x86/kvm/assigned-dev.c [deleted file]
arch/x86/kvm/assigned-dev.h [deleted file]
arch/x86/kvm/i8259.c
arch/x86/kvm/ioapic.c
arch/x86/kvm/iommu.c [deleted file]
arch/x86/kvm/mmu.c
arch/x86/kvm/mmu.h
arch/x86/kvm/page_track.c
arch/x86/kvm/paging_tmpl.h
arch/x86/kvm/svm.c
arch/x86/kvm/vmx.c
arch/x86/kvm/x86.c
drivers/ptp/ptp_kvm.c
include/linux/kvm_host.h
include/uapi/linux/kvm.h
tools/kvm/kvm_stat/kvm_stat
tools/kvm/kvm_stat/kvm_stat.txt
virt/kvm/eventfd.c
virt/kvm/kvm_main.c

index 598278cd0dc5d39d165fb430938fe26781a1ab43..e60be91d8036629110cd1f63751159912b81238f 100644 (file)
@@ -115,12 +115,17 @@ will access the virtual machine's physical address space; offset zero
 corresponds to guest physical address zero.  Use of mmap() on a VM fd
 is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is
 available.
-You most certainly want to use 0 as machine type.
+You probably want to use 0 as machine type.
 
 In order to create user controlled virtual machines on S390, check
 KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as
 privileged user (CAP_SYS_ADMIN).
 
+To use hardware assisted virtualization on MIPS (VZ ASE) rather than
+the default trap & emulate implementation (which changes the virtual
+memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the
+flag KVM_VM_MIPS_VZ.
+
 
 4.3 KVM_GET_MSR_INDEX_LIST
 
@@ -1321,130 +1326,6 @@ The flags bitmap is defined as:
    /* the host supports the ePAPR idle hcall
    #define KVM_PPC_PVINFO_FLAGS_EV_IDLE   (1<<0)
 
-4.48 KVM_ASSIGN_PCI_DEVICE (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Assigns a host PCI device to the VM.
-
-struct kvm_assigned_pci_dev {
-       __u32 assigned_dev_id;
-       __u32 busnr;
-       __u32 devfn;
-       __u32 flags;
-       __u32 segnr;
-       union {
-               __u32 reserved[11];
-       };
-};
-
-The PCI device is specified by the triple segnr, busnr, and devfn.
-Identification in succeeding service requests is done via assigned_dev_id. The
-following flags are specified:
-
-/* Depends on KVM_CAP_IOMMU */
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU    (1 << 0)
-/* The following two depend on KVM_CAP_PCI_2_3 */
-#define KVM_DEV_ASSIGN_PCI_2_3         (1 << 1)
-#define KVM_DEV_ASSIGN_MASK_INTX       (1 << 2)
-
-If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts
-via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other
-assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the
-guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details.
-
-The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure
-isolation of the device.  Usages not specifying this flag are deprecated.
-
-Only PCI header type 0 devices with PCI BAR resources are supported by
-device assignment.  The user requesting this ioctl must have read/write
-access to the PCI sysfs resource files associated with the device.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-
-4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Ends PCI device assignment, releasing all associated resources.
-
-See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is
-used in kvm_assigned_pci_dev to identify the device.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-4.50 KVM_ASSIGN_DEV_IRQ (deprecated)
-
-Capability: KVM_CAP_ASSIGN_DEV_IRQ
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_irq (in)
-Returns: 0 on success, -1 on error
-
-Assigns an IRQ to a passed-through device.
-
-struct kvm_assigned_irq {
-       __u32 assigned_dev_id;
-       __u32 host_irq; /* ignored (legacy field) */
-       __u32 guest_irq;
-       __u32 flags;
-       union {
-               __u32 reserved[12];
-       };
-};
-
-The following flags are defined:
-
-#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
-#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
-#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
-
-#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
-#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
-#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
-
-It is not valid to specify multiple types per host or guest IRQ. However, the
-IRQ type of host and guest can differ or can even be null.
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
-
-4.51 KVM_DEASSIGN_DEV_IRQ (deprecated)
-
-Capability: KVM_CAP_ASSIGN_DEV_IRQ
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_irq (in)
-Returns: 0 on success, -1 on error
-
-Ends an IRQ assignment to a passed-through device.
-
-See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified
-by assigned_dev_id, flags must correspond to the IRQ type specified on
-KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed.
-
-
 4.52 KVM_SET_GSI_ROUTING
 
 Capability: KVM_CAP_IRQ_ROUTING
@@ -1531,52 +1412,6 @@ struct kvm_irq_routing_hv_sint {
        __u32 sint;
 };
 
-4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_msix_nr (in)
-Returns: 0 on success, -1 on error
-
-Set the number of MSI-X interrupts for an assigned device. The number is
-reset again by terminating the MSI-X assignment of the device via
-KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier
-point will fail.
-
-struct kvm_assigned_msix_nr {
-       __u32 assigned_dev_id;
-       __u16 entry_nr;
-       __u16 padding;
-};
-
-#define KVM_MAX_MSIX_PER_DEV           256
-
-
-4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated)
-
-Capability: none
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_msix_entry (in)
-Returns: 0 on success, -1 on error
-
-Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting
-the GSI vector to zero means disabling the interrupt.
-
-struct kvm_assigned_msix_entry {
-       __u32 assigned_dev_id;
-       __u32 gsi;
-       __u16 entry; /* The index of entry in the MSI-X table */
-       __u16 padding[3];
-};
-
-Errors:
-  ENOTTY: kernel does not support this ioctl
-
-  Other error conditions may be defined by individual device types or
-  have their standard meanings.
-
 
 4.55 KVM_SET_TSC_KHZ
 
@@ -1728,40 +1563,6 @@ should skip processing the bitmap and just invalidate everything.  It must
 be set to the number of set bits in the bitmap.
 
 
-4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated)
-
-Capability: KVM_CAP_PCI_2_3
-Architectures: x86
-Type: vm ioctl
-Parameters: struct kvm_assigned_pci_dev (in)
-Returns: 0 on success, -1 on error
-
-Allows userspace to mask PCI INTx interrupts from the assigned device.  The
-kernel will not deliver INTx interrupts to the guest between setting and
-clearing of KVM_ASSIGN_SET_INTX_MASK via this interface.  This enables use of
-and emulation of PCI 2.3 INTx disable command register behavior.
-
-This may be used for both PCI 2.3 devices supporting INTx disable natively and
-older devices lacking this support. Userspace is responsible for emulating the
-read value of the INTx disable bit in the guest visible PCI command register.
-When modifying the INTx disable state, userspace should precede updating the
-physical device command register by calling this ioctl to inform the kernel of
-the new intended INTx mask state.
-
-Note that the kernel uses the device INTx disable bit to internally manage the
-device interrupt state for PCI 2.3 devices.  Reads of this register may
-therefore not match the expected value.  Writes should always use the guest
-intended INTx disable value rather than attempting to read-copy-update the
-current physical device state.  Races between user and kernel updates to the
-INTx disable bit are handled lazily in the kernel.  It's possible the device
-may generate unintended interrupts, but they will not be injected into the
-guest.
-
-See KVM_ASSIGN_DEV_IRQ for the data structure.  The target device is specified
-by assigned_dev_id.  In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is
-evaluated.
-
-
 4.62 KVM_CREATE_SPAPR_TCE
 
 Capability: KVM_CAP_SPAPR_TCE
@@ -2068,11 +1869,23 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_ENTRYLO0     | 64
   MIPS  | KVM_REG_MIPS_CP0_ENTRYLO1     | 64
   MIPS  | KVM_REG_MIPS_CP0_CONTEXT      | 64
+  MIPS  | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32
   MIPS  | KVM_REG_MIPS_CP0_USERLOCAL    | 64
+  MIPS  | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64
   MIPS  | KVM_REG_MIPS_CP0_PAGEMASK     | 32
+  MIPS  | KVM_REG_MIPS_CP0_PAGEGRAIN    | 32
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL0      | 64
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL1      | 64
+  MIPS  | KVM_REG_MIPS_CP0_SEGCTL2      | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWBASE       | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWFIELD      | 64
+  MIPS  | KVM_REG_MIPS_CP0_PWSIZE       | 64
   MIPS  | KVM_REG_MIPS_CP0_WIRED        | 32
+  MIPS  | KVM_REG_MIPS_CP0_PWCTL        | 32
   MIPS  | KVM_REG_MIPS_CP0_HWRENA       | 32
   MIPS  | KVM_REG_MIPS_CP0_BADVADDR     | 64
+  MIPS  | KVM_REG_MIPS_CP0_BADINSTR     | 32
+  MIPS  | KVM_REG_MIPS_CP0_BADINSTRP    | 32
   MIPS  | KVM_REG_MIPS_CP0_COUNT        | 32
   MIPS  | KVM_REG_MIPS_CP0_ENTRYHI      | 64
   MIPS  | KVM_REG_MIPS_CP0_COMPARE      | 32
@@ -2089,6 +1902,7 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_CONFIG4      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG5      | 32
   MIPS  | KVM_REG_MIPS_CP0_CONFIG7      | 32
+  MIPS  | KVM_REG_MIPS_CP0_XCONTEXT     | 64
   MIPS  | KVM_REG_MIPS_CP0_ERROREPC     | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH1    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH2    | 64
@@ -2096,6 +1910,7 @@ registers, find a list below:
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH4    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH5    | 64
   MIPS  | KVM_REG_MIPS_CP0_KSCRATCH6    | 64
+  MIPS  | KVM_REG_MIPS_CP0_MAAR(0..63)  | 64
   MIPS  | KVM_REG_MIPS_COUNT_CTL        | 64
   MIPS  | KVM_REG_MIPS_COUNT_RESUME     | 64
   MIPS  | KVM_REG_MIPS_COUNT_HZ         | 64
@@ -2162,6 +1977,10 @@ hardware, host kernel, guest, and whether XPA is present in the guest, i.e.
 with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and
 the PFNX field starting at bit 30.
 
+MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit
+patterns:
+  0x7030 0000 0001 01 <reg:8>
+
 MIPS KVM control registers (see above) have the following id bit patterns:
   0x7030 0000 0002 <reg:16>
 
@@ -3377,6 +3196,69 @@ struct kvm_ppc_resize_hpt {
        __u32 pad;
 };
 
+4.104 KVM_X86_GET_MCE_CAP_SUPPORTED
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: system ioctl
+Parameters: u64 mce_cap (out)
+Returns: 0 on success, -1 on error
+
+Returns supported MCE capabilities. The u64 mce_cap parameter
+has the same format as the MSR_IA32_MCG_CAP register. Supported
+capabilities will have the corresponding bits set.
+
+4.105 KVM_X86_SETUP_MCE
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: u64 mcg_cap (in)
+Returns: 0 on success,
+         -EFAULT if u64 mcg_cap cannot be read,
+         -EINVAL if the requested number of banks is invalid,
+         -EINVAL if requested MCE capability is not supported.
+
+Initializes MCE support for use. The u64 mcg_cap parameter
+has the same format as the MSR_IA32_MCG_CAP register and
+specifies which capabilities should be enabled. The maximum
+supported number of error-reporting banks can be retrieved when
+checking for KVM_CAP_MCE. The supported capabilities can be
+retrieved with KVM_X86_GET_MCE_CAP_SUPPORTED.
+
+4.106 KVM_X86_SET_MCE
+
+Capability: KVM_CAP_MCE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_x86_mce (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_x86_mce cannot be read,
+         -EINVAL if the bank number is invalid,
+         -EINVAL if VAL bit is not set in status field.
+
+Inject a machine check error (MCE) into the guest. The input
+parameter is:
+
+struct kvm_x86_mce {
+       __u64 status;
+       __u64 addr;
+       __u64 misc;
+       __u64 mcg_status;
+       __u8 bank;
+       __u8 pad1[7];
+       __u64 pad2[3];
+};
+
+If the MCE being reported is an uncorrected error, KVM will
+inject it as an MCE exception into the guest. If the guest
+MCG_STATUS register reports that an MCE is in progress, KVM
+causes an KVM_EXIT_SHUTDOWN vmexit.
+
+Otherwise, if the MCE is a corrected error, KVM will just
+store it in the corresponding bank (provided this bank is
+not holding a previously reported uncorrected error).
+
 5. The kvm_run structure
 ------------------------
 
@@ -4164,3 +4046,68 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is
 available, means that that the kernel can support guests using the
 hashed page table MMU defined in Power ISA V3.00 (as implemented in
 the POWER9 processor), including in-memory segment tables.
+
+8.5 KVM_CAP_MIPS_VZ
+
+Architectures: mips
+
+This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that
+it is available, means that full hardware assisted virtualization capabilities
+of the hardware are available for use through KVM. An appropriate
+KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which
+utilises it.
+
+If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is
+available, it means that the VM is using full hardware assisted virtualization
+capabilities of the hardware. This is useful to check after creating a VM with
+KVM_VM_MIPS_DEFAULT.
+
+The value returned by KVM_CHECK_EXTENSION should be compared against known
+values (see below). All other values are reserved. This is to allow for the
+possibility of other hardware assisted virtualization implementations which
+may be incompatible with the MIPS VZ ASE.
+
+ 0: The trap & emulate implementation is in use to run guest code in user
+    mode. Guest virtual memory segments are rearranged to fit the guest in the
+    user mode address space.
+
+ 1: The MIPS VZ ASE is in use, providing full hardware assisted
+    virtualization, including standard guest virtual memory segments.
+
+8.6 KVM_CAP_MIPS_TE
+
+Architectures: mips
+
+This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that
+it is available, means that the trap & emulate implementation is available to
+run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware
+assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed
+to KVM_CREATE_VM to create a VM which utilises it.
+
+If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is
+available, it means that the VM is using trap & emulate.
+
+8.7 KVM_CAP_MIPS_64BIT
+
+Architectures: mips
+
+This capability indicates the supported architecture type of the guest, i.e. the
+supported register and address width.
+
+The values returned when this capability is checked by KVM_CHECK_EXTENSION on a
+kvm VM handle correspond roughly to the CP0_Config.AT register field, and should
+be checked specifically against known values (see below). All other values are
+reserved.
+
+ 0: MIPS32 or microMIPS32.
+    Both registers and addresses are 32-bits wide.
+    It will only be possible to run 32-bit guest code.
+
+ 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments.
+    Registers are 64-bits wide, but addresses are 32-bits wide.
+    64-bit guest code may run but cannot access MIPS64 memory segments.
+    It will also be possible to run 32-bit guest code.
+
+ 2: MIPS64 or microMIPS64 with access to all address segments.
+    Both registers and addresses are 64-bits wide.
+    It will be possible to run 64-bit or 32-bit guest code.
index feaaa634f154bb44d400758f82adcc3f1a6c85da..a890529c63ed6a3be4e2c38eb739377b37c6bc4c 100644 (file)
@@ -28,6 +28,11 @@ S390:
   property inside the device tree's /hypervisor node.
   For more information refer to Documentation/virtual/kvm/ppc-pv.txt
 
+MIPS:
+  KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall
+  number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and
+  the return value is placed in $2 (v0).
+
 KVM Hypercalls Documentation
 ===========================
 The template for each hypercall is:
index 31ee468ce667dee8a219f775f1106714879088c2..de67ce64750173c019703bef1d38ee622bd1a91d 100644 (file)
@@ -30,7 +30,6 @@
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
 #define KVM_USER_MEM_SLOTS 32
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HAVE_ONE_REG
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
index 6ebd3e6a1fd12d3202067020b48446fd9bdcff98..254a38cace2a2d7a0a23918e66663757bf255956 100644 (file)
@@ -27,6 +27,8 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 #define KVM_REG_SIZE(id)                                               \
        (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
 
index 96dba7cd8be7b4b6f29d9896e2d4515c477ca963..e3c8105ada65a87c70e2bf362e3a4008d526c2dd 100644 (file)
@@ -209,9 +209,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
-       case KVM_CAP_COALESCED_MMIO:
-               r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-               break;
        case KVM_CAP_ARM_SET_DEVICE_ADDR:
                r = 1;
                break;
index e7705e7bb07b133de4da9b2809a152f94ceb0b4b..522e4f60976ee1e6939cbb48c302552708c863e1 100644 (file)
@@ -31,7 +31,6 @@
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
 #define KVM_USER_MEM_SLOTS 512
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #include <kvm/arm_vgic.h>
index c2860358ae3e0c3271d9ca4b944351986276e397..aa5ab69c1312dc4143042c27c4375159c53e1383 100644 (file)
@@ -39,6 +39,8 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 #define KVM_REG_SIZE(id)                                               \
        (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT))
 
index a008a9f03072deb900409ad2ef93a6bd65cdb48e..0a4adbc326e6c356a429866da02311034a68ca7a 100644 (file)
@@ -1687,6 +1687,7 @@ config CPU_CAVIUM_OCTEON
        select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN
        select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN
        select MIPS_L1_CACHE_SHIFT_7
+       select HAVE_KVM
        help
          The Cavium Octeon processor is a highly integrated chip containing
          many ethernet hardware widgets for networking tasks. The processor
index e961c8a7ea6626a1b044b366a99c5201e6d5beb3..494d38274142697b62c4ff3959c859bd1fc91d64 100644 (file)
 # define cpu_has_msa           0
 #endif
 
+#ifndef cpu_has_ufr
+# define cpu_has_ufr           (cpu_data[0].options & MIPS_CPU_UFR)
+#endif
+
 #ifndef cpu_has_fre
 # define cpu_has_fre           (cpu_data[0].options & MIPS_CPU_FRE)
 #endif
 #ifndef cpu_guest_has_htw
 #define cpu_guest_has_htw      (cpu_data[0].guest.options & MIPS_CPU_HTW)
 #endif
+#ifndef cpu_guest_has_mvh
+#define cpu_guest_has_mvh      (cpu_data[0].guest.options & MIPS_CPU_MVH)
+#endif
 #ifndef cpu_guest_has_msa
 #define cpu_guest_has_msa      (cpu_data[0].guest.ases & MIPS_ASE_MSA)
 #endif
 #ifndef cpu_guest_has_maar
 #define cpu_guest_has_maar     (cpu_data[0].guest.options & MIPS_CPU_MAAR)
 #endif
+#ifndef cpu_guest_has_userlocal
+#define cpu_guest_has_userlocal        (cpu_data[0].guest.options & MIPS_CPU_ULRI)
+#endif
 
 /*
  * Guest dynamic capabilities
index edbe2734a1bf07ff6bef041881282d0ab968fb7c..be3b4c25f3359ad4e37bf013ca73d7ce5f2b9a1f 100644 (file)
@@ -33,6 +33,7 @@ struct guest_info {
        unsigned long           ases_dyn;
        unsigned long long      options;
        unsigned long long      options_dyn;
+       int                     tlbsize;
        u8                      conf;
        u8                      kscratch_mask;
 };
@@ -109,6 +110,7 @@ struct cpuinfo_mips {
        struct guest_info       guest;
        unsigned int            gtoffset_mask;
        unsigned int            guestid_mask;
+       unsigned int            guestid_cache;
 } __attribute__((aligned(SMP_CACHE_BYTES)));
 
 extern struct cpuinfo_mips cpu_data[];
index 9a8372484edc0f3dd48daf5e06a532473ff911db..98f59307e6a354ca5c61ada241aa010312d8c98c 100644 (file)
@@ -415,6 +415,7 @@ enum cpu_type_enum {
 #define MIPS_CPU_GUESTCTL2     MBIT_ULL(50)    /* CPU has VZ GuestCtl2 register */
 #define MIPS_CPU_GUESTID       MBIT_ULL(51)    /* CPU uses VZ ASE GuestID feature */
 #define MIPS_CPU_DRG           MBIT_ULL(52)    /* CPU has VZ Direct Root to Guest (DRG) */
+#define MIPS_CPU_UFR           MBIT_ULL(53)    /* CPU supports User mode FR switching */
 
 /*
  * CPU ASE encodings
index 05e785fc061daa4b8b25ce054a8d5924d811554c..2998479fd4e83f0ac4c6ccd7d89938c7cc9a6f5f 100644 (file)
@@ -10,6 +10,7 @@
 #ifndef __MIPS_KVM_HOST_H__
 #define __MIPS_KVM_HOST_H__
 
+#include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <linux/hrtimer.h>
 #include <linux/interrupt.h>
 #define KVM_REG_MIPS_CP0_ENTRYLO0      MIPS_CP0_64(2, 0)
 #define KVM_REG_MIPS_CP0_ENTRYLO1      MIPS_CP0_64(3, 0)
 #define KVM_REG_MIPS_CP0_CONTEXT       MIPS_CP0_64(4, 0)
+#define KVM_REG_MIPS_CP0_CONTEXTCONFIG MIPS_CP0_32(4, 1)
 #define KVM_REG_MIPS_CP0_USERLOCAL     MIPS_CP0_64(4, 2)
+#define KVM_REG_MIPS_CP0_XCONTEXTCONFIG        MIPS_CP0_64(4, 3)
 #define KVM_REG_MIPS_CP0_PAGEMASK      MIPS_CP0_32(5, 0)
 #define KVM_REG_MIPS_CP0_PAGEGRAIN     MIPS_CP0_32(5, 1)
+#define KVM_REG_MIPS_CP0_SEGCTL0       MIPS_CP0_64(5, 2)
+#define KVM_REG_MIPS_CP0_SEGCTL1       MIPS_CP0_64(5, 3)
+#define KVM_REG_MIPS_CP0_SEGCTL2       MIPS_CP0_64(5, 4)
+#define KVM_REG_MIPS_CP0_PWBASE                MIPS_CP0_64(5, 5)
+#define KVM_REG_MIPS_CP0_PWFIELD       MIPS_CP0_64(5, 6)
+#define KVM_REG_MIPS_CP0_PWSIZE                MIPS_CP0_64(5, 7)
 #define KVM_REG_MIPS_CP0_WIRED         MIPS_CP0_32(6, 0)
+#define KVM_REG_MIPS_CP0_PWCTL         MIPS_CP0_32(6, 6)
 #define KVM_REG_MIPS_CP0_HWRENA                MIPS_CP0_32(7, 0)
 #define KVM_REG_MIPS_CP0_BADVADDR      MIPS_CP0_64(8, 0)
+#define KVM_REG_MIPS_CP0_BADINSTR      MIPS_CP0_32(8, 1)
+#define KVM_REG_MIPS_CP0_BADINSTRP     MIPS_CP0_32(8, 2)
 #define KVM_REG_MIPS_CP0_COUNT         MIPS_CP0_32(9, 0)
 #define KVM_REG_MIPS_CP0_ENTRYHI       MIPS_CP0_64(10, 0)
 #define KVM_REG_MIPS_CP0_COMPARE       MIPS_CP0_32(11, 0)
@@ -55,6 +67,7 @@
 #define KVM_REG_MIPS_CP0_CONFIG4       MIPS_CP0_32(16, 4)
 #define KVM_REG_MIPS_CP0_CONFIG5       MIPS_CP0_32(16, 5)
 #define KVM_REG_MIPS_CP0_CONFIG7       MIPS_CP0_32(16, 7)
+#define KVM_REG_MIPS_CP0_MAARI         MIPS_CP0_64(17, 2)
 #define KVM_REG_MIPS_CP0_XCONTEXT      MIPS_CP0_64(20, 0)
 #define KVM_REG_MIPS_CP0_ERROREPC      MIPS_CP0_64(30, 0)
 #define KVM_REG_MIPS_CP0_KSCRATCH1     MIPS_CP0_64(31, 2)
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS  0
 
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HALT_POLL_NS_DEFAULT 500000
 
+#ifdef CONFIG_KVM_MIPS_VZ
+extern unsigned long GUESTID_MASK;
+extern unsigned long GUESTID_FIRST_VERSION;
+extern unsigned long GUESTID_VERSION_MASK;
+#endif
 
 
 /*
@@ -145,6 +162,16 @@ struct kvm_vcpu_stat {
        u64 fpe_exits;
        u64 msa_disabled_exits;
        u64 flush_dcache_exits;
+#ifdef CONFIG_KVM_MIPS_VZ
+       u64 vz_gpsi_exits;
+       u64 vz_gsfc_exits;
+       u64 vz_hc_exits;
+       u64 vz_grr_exits;
+       u64 vz_gva_exits;
+       u64 vz_ghfc_exits;
+       u64 vz_gpa_exits;
+       u64 vz_resvd_exits;
+#endif
        u64 halt_successful_poll;
        u64 halt_attempted_poll;
        u64 halt_poll_invalid;
@@ -157,6 +184,8 @@ struct kvm_arch_memory_slot {
 struct kvm_arch {
        /* Guest physical mm */
        struct mm_struct gpa_mm;
+       /* Mask of CPUs needing GPA ASID flush */
+       cpumask_t asid_flush_mask;
 };
 
 #define N_MIPS_COPROC_REGS     32
@@ -214,6 +243,11 @@ struct mips_coproc {
 #define MIPS_CP0_CONFIG4_SEL   4
 #define MIPS_CP0_CONFIG5_SEL   5
 
+#define MIPS_CP0_GUESTCTL2     10
+#define MIPS_CP0_GUESTCTL2_SEL 5
+#define MIPS_CP0_GTOFFSET      12
+#define MIPS_CP0_GTOFFSET_SEL  7
+
 /* Resume Flags */
 #define RESUME_FLAG_DR         (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST       (1<<1)  /* Resume host? */
@@ -229,6 +263,7 @@ enum emulation_result {
        EMULATE_WAIT,           /* WAIT instruction */
        EMULATE_PRIV_FAIL,
        EMULATE_EXCEPT,         /* A guest exception has been generated */
+       EMULATE_HYPERCALL,      /* HYPCALL instruction */
 };
 
 #define mips3_paddr_to_tlbpfn(x) \
@@ -276,13 +311,18 @@ struct kvm_mmu_memory_cache {
 struct kvm_vcpu_arch {
        void *guest_ebase;
        int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+       /* Host registers preserved across guest mode execution */
        unsigned long host_stack;
        unsigned long host_gp;
+       unsigned long host_pgd;
+       unsigned long host_entryhi;
 
        /* Host CP0 registers used when handling exits from guest */
        unsigned long host_cp0_badvaddr;
        unsigned long host_cp0_epc;
        u32 host_cp0_cause;
+       u32 host_cp0_guestctl0;
        u32 host_cp0_badinstr;
        u32 host_cp0_badinstrp;
 
@@ -340,7 +380,23 @@ struct kvm_vcpu_arch {
        /* Cache some mmu pages needed inside spinlock regions */
        struct kvm_mmu_memory_cache mmu_page_cache;
 
+#ifdef CONFIG_KVM_MIPS_VZ
+       /* vcpu's vzguestid is different on each host cpu in an smp system */
+       u32 vzguestid[NR_CPUS];
+
+       /* wired guest TLB entries */
+       struct kvm_mips_tlb *wired_tlb;
+       unsigned int wired_tlb_limit;
+       unsigned int wired_tlb_used;
+
+       /* emulated guest MAAR registers */
+       unsigned long maar[6];
+#endif
+
+       /* Last CPU the VCPU state was loaded on */
        int last_sched_cpu;
+       /* Last CPU the VCPU actually executed guest code on */
+       int last_exec_cpu;
 
        /* WAIT executed */
        int wait;
@@ -349,78 +405,6 @@ struct kvm_vcpu_arch {
        u8 msa_enabled;
 };
 
-
-#define kvm_read_c0_guest_index(cop0)          (cop0->reg[MIPS_CP0_TLB_INDEX][0])
-#define kvm_write_c0_guest_index(cop0, val)    (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
-#define kvm_read_c0_guest_entrylo0(cop0)       (cop0->reg[MIPS_CP0_TLB_LO0][0])
-#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val))
-#define kvm_read_c0_guest_entrylo1(cop0)       (cop0->reg[MIPS_CP0_TLB_LO1][0])
-#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val))
-#define kvm_read_c0_guest_context(cop0)                (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
-#define kvm_write_c0_guest_context(cop0, val)  (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
-#define kvm_read_c0_guest_userlocal(cop0)      (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
-#define kvm_write_c0_guest_userlocal(cop0, val)        (cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val))
-#define kvm_read_c0_guest_pagemask(cop0)       (cop0->reg[MIPS_CP0_TLB_PG_MASK][0])
-#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val))
-#define kvm_read_c0_guest_wired(cop0)          (cop0->reg[MIPS_CP0_TLB_WIRED][0])
-#define kvm_write_c0_guest_wired(cop0, val)    (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val))
-#define kvm_read_c0_guest_hwrena(cop0)         (cop0->reg[MIPS_CP0_HWRENA][0])
-#define kvm_write_c0_guest_hwrena(cop0, val)   (cop0->reg[MIPS_CP0_HWRENA][0] = (val))
-#define kvm_read_c0_guest_badvaddr(cop0)       (cop0->reg[MIPS_CP0_BAD_VADDR][0])
-#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val))
-#define kvm_read_c0_guest_count(cop0)          (cop0->reg[MIPS_CP0_COUNT][0])
-#define kvm_write_c0_guest_count(cop0, val)    (cop0->reg[MIPS_CP0_COUNT][0] = (val))
-#define kvm_read_c0_guest_entryhi(cop0)                (cop0->reg[MIPS_CP0_TLB_HI][0])
-#define kvm_write_c0_guest_entryhi(cop0, val)  (cop0->reg[MIPS_CP0_TLB_HI][0] = (val))
-#define kvm_read_c0_guest_compare(cop0)                (cop0->reg[MIPS_CP0_COMPARE][0])
-#define kvm_write_c0_guest_compare(cop0, val)  (cop0->reg[MIPS_CP0_COMPARE][0] = (val))
-#define kvm_read_c0_guest_status(cop0)         (cop0->reg[MIPS_CP0_STATUS][0])
-#define kvm_write_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] = (val))
-#define kvm_read_c0_guest_intctl(cop0)         (cop0->reg[MIPS_CP0_STATUS][1])
-#define kvm_write_c0_guest_intctl(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][1] = (val))
-#define kvm_read_c0_guest_cause(cop0)          (cop0->reg[MIPS_CP0_CAUSE][0])
-#define kvm_write_c0_guest_cause(cop0, val)    (cop0->reg[MIPS_CP0_CAUSE][0] = (val))
-#define kvm_read_c0_guest_epc(cop0)            (cop0->reg[MIPS_CP0_EXC_PC][0])
-#define kvm_write_c0_guest_epc(cop0, val)      (cop0->reg[MIPS_CP0_EXC_PC][0] = (val))
-#define kvm_read_c0_guest_prid(cop0)           (cop0->reg[MIPS_CP0_PRID][0])
-#define kvm_write_c0_guest_prid(cop0, val)     (cop0->reg[MIPS_CP0_PRID][0] = (val))
-#define kvm_read_c0_guest_ebase(cop0)          (cop0->reg[MIPS_CP0_PRID][1])
-#define kvm_write_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] = (val))
-#define kvm_read_c0_guest_config(cop0)         (cop0->reg[MIPS_CP0_CONFIG][0])
-#define kvm_read_c0_guest_config1(cop0)                (cop0->reg[MIPS_CP0_CONFIG][1])
-#define kvm_read_c0_guest_config2(cop0)                (cop0->reg[MIPS_CP0_CONFIG][2])
-#define kvm_read_c0_guest_config3(cop0)                (cop0->reg[MIPS_CP0_CONFIG][3])
-#define kvm_read_c0_guest_config4(cop0)                (cop0->reg[MIPS_CP0_CONFIG][4])
-#define kvm_read_c0_guest_config5(cop0)                (cop0->reg[MIPS_CP0_CONFIG][5])
-#define kvm_read_c0_guest_config7(cop0)                (cop0->reg[MIPS_CP0_CONFIG][7])
-#define kvm_write_c0_guest_config(cop0, val)   (cop0->reg[MIPS_CP0_CONFIG][0] = (val))
-#define kvm_write_c0_guest_config1(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][1] = (val))
-#define kvm_write_c0_guest_config2(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][2] = (val))
-#define kvm_write_c0_guest_config3(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][3] = (val))
-#define kvm_write_c0_guest_config4(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][4] = (val))
-#define kvm_write_c0_guest_config5(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][5] = (val))
-#define kvm_write_c0_guest_config7(cop0, val)  (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
-#define kvm_read_c0_guest_errorepc(cop0)       (cop0->reg[MIPS_CP0_ERROR_PC][0])
-#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
-#define kvm_read_c0_guest_kscratch1(cop0)      (cop0->reg[MIPS_CP0_DESAVE][2])
-#define kvm_read_c0_guest_kscratch2(cop0)      (cop0->reg[MIPS_CP0_DESAVE][3])
-#define kvm_read_c0_guest_kscratch3(cop0)      (cop0->reg[MIPS_CP0_DESAVE][4])
-#define kvm_read_c0_guest_kscratch4(cop0)      (cop0->reg[MIPS_CP0_DESAVE][5])
-#define kvm_read_c0_guest_kscratch5(cop0)      (cop0->reg[MIPS_CP0_DESAVE][6])
-#define kvm_read_c0_guest_kscratch6(cop0)      (cop0->reg[MIPS_CP0_DESAVE][7])
-#define kvm_write_c0_guest_kscratch1(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][2] = (val))
-#define kvm_write_c0_guest_kscratch2(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][3] = (val))
-#define kvm_write_c0_guest_kscratch3(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][4] = (val))
-#define kvm_write_c0_guest_kscratch4(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][5] = (val))
-#define kvm_write_c0_guest_kscratch5(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][6] = (val))
-#define kvm_write_c0_guest_kscratch6(cop0, val)        (cop0->reg[MIPS_CP0_DESAVE][7] = (val))
-
-/*
- * Some of the guest registers may be modified asynchronously (e.g. from a
- * hrtimer callback in hard irq context) and therefore need stronger atomicity
- * guarantees than other registers.
- */
-
 static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
                                                unsigned long val)
 {
@@ -471,26 +455,286 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
        } while (unlikely(!temp));
 }
 
-#define kvm_set_c0_guest_status(cop0, val)     (cop0->reg[MIPS_CP0_STATUS][0] |= (val))
-#define kvm_clear_c0_guest_status(cop0, val)   (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val))
+/* Guest register types, used in accessor build below */
+#define __KVMT32       u32
+#define __KVMTl        unsigned long
 
-/* Cause can be modified asynchronously from hardirq hrtimer callback */
-#define kvm_set_c0_guest_cause(cop0, val)                              \
-       _kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
-#define kvm_clear_c0_guest_cause(cop0, val)                            \
-       _kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val)
-#define kvm_change_c0_guest_cause(cop0, change, val)                   \
-       _kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0],  \
-                                       change, val)
-
-#define kvm_set_c0_guest_ebase(cop0, val)      (cop0->reg[MIPS_CP0_PRID][1] |= (val))
-#define kvm_clear_c0_guest_ebase(cop0, val)    (cop0->reg[MIPS_CP0_PRID][1] &= ~(val))
-#define kvm_change_c0_guest_ebase(cop0, change, val)                   \
+/*
+ * __BUILD_KVM_$ops_SAVED(): kvm_$op_sw_gc0_$reg()
+ * These operate on the saved guest C0 state in RAM.
+ */
+
+/* Generate saved context simple accessors */
+#define __BUILD_KVM_RW_SAVED(name, type, _reg, sel)                    \
+static inline __KVMT##type kvm_read_sw_gc0_##name(struct mips_coproc *cop0) \
+{                                                                      \
+       return cop0->reg[(_reg)][(sel)];                                \
+}                                                                      \
+static inline void kvm_write_sw_gc0_##name(struct mips_coproc *cop0,   \
+                                          __KVMT##type val)            \
+{                                                                      \
+       cop0->reg[(_reg)][(sel)] = val;                                 \
+}
+
+/* Generate saved context bitwise modifiers */
+#define __BUILD_KVM_SET_SAVED(name, type, _reg, sel)                   \
+static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0,     \
+                                        __KVMT##type val)              \
+{                                                                      \
+       cop0->reg[(_reg)][(sel)] |= val;                                \
+}                                                                      \
+static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0,   \
+                                          __KVMT##type val)            \
+{                                                                      \
+       cop0->reg[(_reg)][(sel)] &= ~val;                               \
+}                                                                      \
+static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0,  \
+                                           __KVMT##type mask,          \
+                                           __KVMT##type val)           \
+{                                                                      \
+       unsigned long _mask = mask;                                     \
+       cop0->reg[(_reg)][(sel)] &= ~_mask;                             \
+       cop0->reg[(_reg)][(sel)] |= val & _mask;                        \
+}
+
+/* Generate saved context atomic bitwise modifiers */
+#define __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel)                        \
+static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0,     \
+                                        __KVMT##type val)              \
+{                                                                      \
+       _kvm_atomic_set_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val);   \
+}                                                                      \
+static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0,   \
+                                          __KVMT##type val)            \
+{                                                                      \
+       _kvm_atomic_clear_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val); \
+}                                                                      \
+static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0,  \
+                                           __KVMT##type mask,          \
+                                           __KVMT##type val)           \
+{                                                                      \
+       _kvm_atomic_change_c0_guest_reg(&cop0->reg[(_reg)][(sel)], mask, \
+                                       val);                           \
+}
+
+/*
+ * __BUILD_KVM_$ops_VZ(): kvm_$op_vz_gc0_$reg()
+ * These operate on the VZ guest C0 context in hardware.
+ */
+
+/* Generate VZ guest context simple accessors */
+#define __BUILD_KVM_RW_VZ(name, type, _reg, sel)                       \
+static inline __KVMT##type kvm_read_vz_gc0_##name(struct mips_coproc *cop0) \
+{                                                                      \
+       return read_gc0_##name();                                       \
+}                                                                      \
+static inline void kvm_write_vz_gc0_##name(struct mips_coproc *cop0,   \
+                                          __KVMT##type val)            \
+{                                                                      \
+       write_gc0_##name(val);                                          \
+}
+
+/* Generate VZ guest context bitwise modifiers */
+#define __BUILD_KVM_SET_VZ(name, type, _reg, sel)                      \
+static inline void kvm_set_vz_gc0_##name(struct mips_coproc *cop0,     \
+                                        __KVMT##type val)              \
+{                                                                      \
+       set_gc0_##name(val);                                            \
+}                                                                      \
+static inline void kvm_clear_vz_gc0_##name(struct mips_coproc *cop0,   \
+                                          __KVMT##type val)            \
+{                                                                      \
+       clear_gc0_##name(val);                                          \
+}                                                                      \
+static inline void kvm_change_vz_gc0_##name(struct mips_coproc *cop0,  \
+                                           __KVMT##type mask,          \
+                                           __KVMT##type val)           \
+{                                                                      \
+       change_gc0_##name(mask, val);                                   \
+}
+
+/* Generate VZ guest context save/restore to/from saved context */
+#define __BUILD_KVM_SAVE_VZ(name, _reg, sel)                   \
+static inline void kvm_restore_gc0_##name(struct mips_coproc *cop0)    \
+{                                                                      \
+       write_gc0_##name(cop0->reg[(_reg)][(sel)]);                     \
+}                                                                      \
+static inline void kvm_save_gc0_##name(struct mips_coproc *cop0)       \
+{                                                                      \
+       cop0->reg[(_reg)][(sel)] = read_gc0_##name();                   \
+}
+
+/*
+ * __BUILD_KVM_$ops_WRAP(): kvm_$op_$name1() -> kvm_$op_$name2()
+ * These wrap a set of operations to provide them with a different name.
+ */
+
+/* Generate simple accessor wrapper */
+#define __BUILD_KVM_RW_WRAP(name1, name2, type)                                \
+static inline __KVMT##type kvm_read_##name1(struct mips_coproc *cop0)  \
+{                                                                      \
+       return kvm_read_##name2(cop0);                                  \
+}                                                                      \
+static inline void kvm_write_##name1(struct mips_coproc *cop0,         \
+                                    __KVMT##type val)                  \
+{                                                                      \
+       kvm_write_##name2(cop0, val);                                   \
+}
+
+/* Generate bitwise modifier wrapper */
+#define __BUILD_KVM_SET_WRAP(name1, name2, type)                       \
+static inline void kvm_set_##name1(struct mips_coproc *cop0,           \
+                                  __KVMT##type val)                    \
 {                                                                      \
-       kvm_clear_c0_guest_ebase(cop0, change);                         \
-       kvm_set_c0_guest_ebase(cop0, ((val) & (change)));               \
+       kvm_set_##name2(cop0, val);                                     \
+}                                                                      \
+static inline void kvm_clear_##name1(struct mips_coproc *cop0,         \
+                                    __KVMT##type val)                  \
+{                                                                      \
+       kvm_clear_##name2(cop0, val);                                   \
+}                                                                      \
+static inline void kvm_change_##name1(struct mips_coproc *cop0,                \
+                                     __KVMT##type mask,                \
+                                     __KVMT##type val)                 \
+{                                                                      \
+       kvm_change_##name2(cop0, mask, val);                            \
 }
 
+/*
+ * __BUILD_KVM_$ops_SW(): kvm_$op_c0_guest_$reg() -> kvm_$op_sw_gc0_$reg()
+ * These generate accessors operating on the saved context in RAM, and wrap them
+ * with the common guest C0 accessors (for use by common emulation code).
+ */
+
+#define __BUILD_KVM_RW_SW(name, type, _reg, sel)                       \
+       __BUILD_KVM_RW_SAVED(name, type, _reg, sel)                     \
+       __BUILD_KVM_RW_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#define __BUILD_KVM_SET_SW(name, type, _reg, sel)                      \
+       __BUILD_KVM_SET_SAVED(name, type, _reg, sel)                    \
+       __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#define __BUILD_KVM_ATOMIC_SW(name, type, _reg, sel)                   \
+       __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel)                 \
+       __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type)
+
+#ifndef CONFIG_KVM_MIPS_VZ
+
+/*
+ * T&E (trap & emulate software based virtualisation)
+ * We generate the common accessors operating exclusively on the saved context
+ * in RAM.
+ */
+
+#define __BUILD_KVM_RW_HW      __BUILD_KVM_RW_SW
+#define __BUILD_KVM_SET_HW     __BUILD_KVM_SET_SW
+#define __BUILD_KVM_ATOMIC_HW  __BUILD_KVM_ATOMIC_SW
+
+#else
+
+/*
+ * VZ (hardware assisted virtualisation)
+ * These macros use the active guest state in VZ mode (hardware registers),
+ */
+
+/*
+ * __BUILD_KVM_$ops_HW(): kvm_$op_c0_guest_$reg() -> kvm_$op_vz_gc0_$reg()
+ * These generate accessors operating on the VZ guest context in hardware, and
+ * wrap them with the common guest C0 accessors (for use by common emulation
+ * code).
+ *
+ * Accessors operating on the saved context in RAM are also generated to allow
+ * convenient explicit saving and restoring of the state.
+ */
+
+#define __BUILD_KVM_RW_HW(name, type, _reg, sel)                       \
+       __BUILD_KVM_RW_SAVED(name, type, _reg, sel)                     \
+       __BUILD_KVM_RW_VZ(name, type, _reg, sel)                        \
+       __BUILD_KVM_RW_WRAP(c0_guest_##name, vz_gc0_##name, type)       \
+       __BUILD_KVM_SAVE_VZ(name, _reg, sel)
+
+#define __BUILD_KVM_SET_HW(name, type, _reg, sel)                      \
+       __BUILD_KVM_SET_SAVED(name, type, _reg, sel)                    \
+       __BUILD_KVM_SET_VZ(name, type, _reg, sel)                       \
+       __BUILD_KVM_SET_WRAP(c0_guest_##name, vz_gc0_##name, type)
+
+/*
+ * We can't do atomic modifications of COP0 state if hardware can modify it.
+ * Races must be handled explicitly.
+ */
+#define __BUILD_KVM_ATOMIC_HW  __BUILD_KVM_SET_HW
+
+#endif
+
+/*
+ * Define accessors for CP0 registers that are accessible to the guest. These
+ * are primarily used by common emulation code, which may need to access the
+ * registers differently depending on the implementation.
+ *
+ *    fns_hw/sw    name     type    reg num         select
+ */
+__BUILD_KVM_RW_HW(index,          32, MIPS_CP0_TLB_INDEX,    0)
+__BUILD_KVM_RW_HW(entrylo0,       l,  MIPS_CP0_TLB_LO0,      0)
+__BUILD_KVM_RW_HW(entrylo1,       l,  MIPS_CP0_TLB_LO1,      0)
+__BUILD_KVM_RW_HW(context,        l,  MIPS_CP0_TLB_CONTEXT,  0)
+__BUILD_KVM_RW_HW(contextconfig,  32, MIPS_CP0_TLB_CONTEXT,  1)
+__BUILD_KVM_RW_HW(userlocal,      l,  MIPS_CP0_TLB_CONTEXT,  2)
+__BUILD_KVM_RW_HW(xcontextconfig, l,  MIPS_CP0_TLB_CONTEXT,  3)
+__BUILD_KVM_RW_HW(pagemask,       l,  MIPS_CP0_TLB_PG_MASK,  0)
+__BUILD_KVM_RW_HW(pagegrain,      32, MIPS_CP0_TLB_PG_MASK,  1)
+__BUILD_KVM_RW_HW(segctl0,        l,  MIPS_CP0_TLB_PG_MASK,  2)
+__BUILD_KVM_RW_HW(segctl1,        l,  MIPS_CP0_TLB_PG_MASK,  3)
+__BUILD_KVM_RW_HW(segctl2,        l,  MIPS_CP0_TLB_PG_MASK,  4)
+__BUILD_KVM_RW_HW(pwbase,         l,  MIPS_CP0_TLB_PG_MASK,  5)
+__BUILD_KVM_RW_HW(pwfield,        l,  MIPS_CP0_TLB_PG_MASK,  6)
+__BUILD_KVM_RW_HW(pwsize,         l,  MIPS_CP0_TLB_PG_MASK,  7)
+__BUILD_KVM_RW_HW(wired,          32, MIPS_CP0_TLB_WIRED,    0)
+__BUILD_KVM_RW_HW(pwctl,          32, MIPS_CP0_TLB_WIRED,    6)
+__BUILD_KVM_RW_HW(hwrena,         32, MIPS_CP0_HWRENA,       0)
+__BUILD_KVM_RW_HW(badvaddr,       l,  MIPS_CP0_BAD_VADDR,    0)
+__BUILD_KVM_RW_HW(badinstr,       32, MIPS_CP0_BAD_VADDR,    1)
+__BUILD_KVM_RW_HW(badinstrp,      32, MIPS_CP0_BAD_VADDR,    2)
+__BUILD_KVM_RW_SW(count,          32, MIPS_CP0_COUNT,        0)
+__BUILD_KVM_RW_HW(entryhi,        l,  MIPS_CP0_TLB_HI,       0)
+__BUILD_KVM_RW_HW(compare,        32, MIPS_CP0_COMPARE,      0)
+__BUILD_KVM_RW_HW(status,         32, MIPS_CP0_STATUS,       0)
+__BUILD_KVM_RW_HW(intctl,         32, MIPS_CP0_STATUS,       1)
+__BUILD_KVM_RW_HW(cause,          32, MIPS_CP0_CAUSE,        0)
+__BUILD_KVM_RW_HW(epc,            l,  MIPS_CP0_EXC_PC,       0)
+__BUILD_KVM_RW_SW(prid,           32, MIPS_CP0_PRID,         0)
+__BUILD_KVM_RW_HW(ebase,          l,  MIPS_CP0_PRID,         1)
+__BUILD_KVM_RW_HW(config,         32, MIPS_CP0_CONFIG,       0)
+__BUILD_KVM_RW_HW(config1,        32, MIPS_CP0_CONFIG,       1)
+__BUILD_KVM_RW_HW(config2,        32, MIPS_CP0_CONFIG,       2)
+__BUILD_KVM_RW_HW(config3,        32, MIPS_CP0_CONFIG,       3)
+__BUILD_KVM_RW_HW(config4,        32, MIPS_CP0_CONFIG,       4)
+__BUILD_KVM_RW_HW(config5,        32, MIPS_CP0_CONFIG,       5)
+__BUILD_KVM_RW_HW(config6,        32, MIPS_CP0_CONFIG,       6)
+__BUILD_KVM_RW_HW(config7,        32, MIPS_CP0_CONFIG,       7)
+__BUILD_KVM_RW_SW(maari,          l,  MIPS_CP0_LLADDR,       2)
+__BUILD_KVM_RW_HW(xcontext,       l,  MIPS_CP0_TLB_XCONTEXT, 0)
+__BUILD_KVM_RW_HW(errorepc,       l,  MIPS_CP0_ERROR_PC,     0)
+__BUILD_KVM_RW_HW(kscratch1,      l,  MIPS_CP0_DESAVE,       2)
+__BUILD_KVM_RW_HW(kscratch2,      l,  MIPS_CP0_DESAVE,       3)
+__BUILD_KVM_RW_HW(kscratch3,      l,  MIPS_CP0_DESAVE,       4)
+__BUILD_KVM_RW_HW(kscratch4,      l,  MIPS_CP0_DESAVE,       5)
+__BUILD_KVM_RW_HW(kscratch5,      l,  MIPS_CP0_DESAVE,       6)
+__BUILD_KVM_RW_HW(kscratch6,      l,  MIPS_CP0_DESAVE,       7)
+
+/* Bitwise operations (on HW state) */
+__BUILD_KVM_SET_HW(status,        32, MIPS_CP0_STATUS,       0)
+/* Cause can be modified asynchronously from hardirq hrtimer callback */
+__BUILD_KVM_ATOMIC_HW(cause,      32, MIPS_CP0_CAUSE,        0)
+__BUILD_KVM_SET_HW(ebase,         l,  MIPS_CP0_PRID,         1)
+
+/* Bitwise operations (on saved state) */
+__BUILD_KVM_SET_SAVED(config,     32, MIPS_CP0_CONFIG,       0)
+__BUILD_KVM_SET_SAVED(config1,    32, MIPS_CP0_CONFIG,       1)
+__BUILD_KVM_SET_SAVED(config2,    32, MIPS_CP0_CONFIG,       2)
+__BUILD_KVM_SET_SAVED(config3,    32, MIPS_CP0_CONFIG,       3)
+__BUILD_KVM_SET_SAVED(config4,    32, MIPS_CP0_CONFIG,       4)
+__BUILD_KVM_SET_SAVED(config5,    32, MIPS_CP0_CONFIG,       5)
+
 /* Helpers */
 
 static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
@@ -531,6 +775,10 @@ struct kvm_mips_callbacks {
        int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
        int (*handle_fpe)(struct kvm_vcpu *vcpu);
        int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
+       int (*handle_guest_exit)(struct kvm_vcpu *vcpu);
+       int (*hardware_enable)(void);
+       void (*hardware_disable)(void);
+       int (*check_extension)(struct kvm *kvm, long ext);
        int (*vcpu_init)(struct kvm_vcpu *vcpu);
        void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
        int (*vcpu_setup)(struct kvm_vcpu *vcpu);
@@ -599,6 +847,10 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
 
 u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+                                     struct kvm_vcpu *vcpu, bool write_fault);
+#endif
 extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
                                           struct kvm_vcpu *vcpu,
                                           bool write_fault);
@@ -625,6 +877,18 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi,
 extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu,
                                     unsigned long entryhi);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
+int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva,
+                           unsigned long *gpa);
+void kvm_vz_local_flush_roottlb_all_guests(void);
+void kvm_vz_local_flush_guesttlb_all(void);
+void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index,
+                         unsigned int count);
+void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index,
+                         unsigned int count);
+#endif
+
 void kvm_mips_suspend_mm(int cpu);
 void kvm_mips_resume_mm(int cpu);
 
@@ -795,7 +1059,7 @@ extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
 u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
 void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
 void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
-void kvm_mips_init_count(struct kvm_vcpu *vcpu);
+void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz);
 int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
 int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
 int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz);
@@ -803,6 +1067,20 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
 void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
 
+/* fairly internal functions requiring some care to use */
+int kvm_mips_count_disabled(struct kvm_vcpu *vcpu);
+ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count);
+int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before,
+                            u32 count, int min_drift);
+
+#ifdef CONFIG_KVM_MIPS_VZ
+void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu);
+void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu);
+#else
+static inline void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) {}
+static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {}
+#endif
+
 enum emulation_result kvm_mips_check_privilege(u32 cause,
                                               u32 *opc,
                                               struct kvm_run *run,
@@ -827,11 +1105,20 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
                                            struct kvm_run *run,
                                            struct kvm_vcpu *vcpu);
 
+/* COP0 */
+enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu);
+
 unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
 unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
 
+/* Hypercalls (hypcall.c) */
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+                                           union mips_instruction inst);
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu);
+
 /* Dynamic binary translation */
 extern int kvm_mips_trans_cache_index(union mips_instruction inst,
                                      u32 *opc, struct kvm_vcpu *vcpu);
@@ -846,7 +1133,6 @@ extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
 extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu);
 extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm);
 
-static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
index 21d9607c80d7deaa1ef2174c0dde62bda9985bde..e10f78befbd9b2b8185e39e0e5dca63ae3e5a78f 100644 (file)
@@ -36,7 +36,7 @@ unsigned platform_maar_init(unsigned num_pairs);
  * @upper:     The highest address that the MAAR pair will affect. Must be
  *             aligned to one byte before a 2^16 byte boundary.
  * @attrs:     The accessibility attributes to program, eg. MIPS_MAAR_S. The
- *             MIPS_MAAR_V attribute will automatically be set.
+ *             MIPS_MAAR_VL attribute will automatically be set.
  *
  * Program the pair of MAAR registers specified by idx to apply the attributes
  * specified by attrs to the range of addresses from lower to higher.
@@ -49,10 +49,10 @@ static inline void write_maar_pair(unsigned idx, phys_addr_t lower,
        BUG_ON(((upper & 0xffff) != 0xffff)
                || ((upper & ~0xffffull) & ~(MIPS_MAAR_ADDR << 4)));
 
-       /* Automatically set MIPS_MAAR_V */
-       attrs |= MIPS_MAAR_V;
+       /* Automatically set MIPS_MAAR_VL */
+       attrs |= MIPS_MAAR_VL;
 
-       /* Write the upper address & attributes (only MIPS_MAAR_V matters) */
+       /* Write the upper address & attributes (only MIPS_MAAR_VL matters) */
        write_c0_maari(idx << 1);
        back_to_back_c0_hazard();
        write_c0_maar(((upper >> 4) & MIPS_MAAR_ADDR) | attrs);
@@ -81,7 +81,7 @@ extern void maar_init(void);
  * @upper:     The highest address that the MAAR pair will affect. Must be
  *             aligned to one byte before a 2^16 byte boundary.
  * @attrs:     The accessibility attributes to program, eg. MIPS_MAAR_S. The
- *             MIPS_MAAR_V attribute will automatically be set.
+ *             MIPS_MAAR_VL attribute will automatically be set.
  *
  * Describes the configuration of a pair of Memory Accessibility Attribute
  * Registers - applying attributes from attrs to the range of physical
index f8d1d2f1d80d55a3969b9eeb1ff05d7d484b18a9..6875b69f59f763b9c492a054e2c027754866869c 100644 (file)
  */
 #ifdef __ASSEMBLY__
 #define _ULCAST_
+#define _U64CAST_
 #else
 #define _ULCAST_ (unsigned long)
+#define _U64CAST_ (u64)
 #endif
 
 /*
 /*
  * Wired register bits
  */
-#define MIPSR6_WIRED_LIMIT     (_ULCAST_(0xffff) << 16)
-#define MIPSR6_WIRED_WIRED     (_ULCAST_(0xffff) << 0)
+#define MIPSR6_WIRED_LIMIT_SHIFT 16
+#define MIPSR6_WIRED_LIMIT     (_ULCAST_(0xffff) << MIPSR6_WIRED_LIMIT_SHIFT)
+#define MIPSR6_WIRED_WIRED_SHIFT 0
+#define MIPSR6_WIRED_WIRED     (_ULCAST_(0xffff) << MIPSR6_WIRED_WIRED_SHIFT)
 
 /*
  * Values used for computation of new tlb entries
 #define MIPS_CONF5_LLB         (_ULCAST_(1) << 4)
 #define MIPS_CONF5_MVH         (_ULCAST_(1) << 5)
 #define MIPS_CONF5_VP          (_ULCAST_(1) << 7)
+#define MIPS_CONF5_SBRI                (_ULCAST_(1) << 6)
 #define MIPS_CONF5_FRE         (_ULCAST_(1) << 8)
 #define MIPS_CONF5_UFE         (_ULCAST_(1) << 9)
 #define MIPS_CONF5_MSAEN       (_ULCAST_(1) << 27)
 #define XLR_PERFCTRL_ALLTHREADS        (_ULCAST_(1) << 13)
 
 /* MAAR bit definitions */
+#define MIPS_MAAR_VH           (_U64CAST_(1) << 63)
 #define MIPS_MAAR_ADDR         ((BIT_ULL(BITS_PER_LONG - 12) - 1) << 12)
 #define MIPS_MAAR_ADDR_SHIFT   12
 #define MIPS_MAAR_S            (_ULCAST_(1) << 1)
-#define MIPS_MAAR_V            (_ULCAST_(1) << 0)
+#define MIPS_MAAR_VL           (_ULCAST_(1) << 0)
+
+/* MAARI bit definitions */
+#define MIPS_MAARI_INDEX       (_ULCAST_(0x3f) << 0)
 
 /* EBase bit definitions */
 #define MIPS_EBASE_CPUNUM_SHIFT        0
 #define MIPS_CMGCRB_BASE       11
 #define MIPS_CMGCRF_BASE       (~_ULCAST_((1 << MIPS_CMGCRB_BASE) - 1))
 
+/* LLAddr bit definitions */
+#define MIPS_LLADDR_LLB_SHIFT  0
+#define MIPS_LLADDR_LLB                (_ULCAST_(1) << MIPS_LLADDR_LLB_SHIFT)
+
 /*
  * Bits in the MIPS32 Memory Segmentation registers.
  */
 /* Flush FTLB */
 #define LOONGSON_DIAG_FTLB     (_ULCAST_(1) << 13)
 
+/* CvmCtl register field definitions */
+#define CVMCTL_IPPCI_SHIFT     7
+#define CVMCTL_IPPCI           (_U64CAST_(0x7) << CVMCTL_IPPCI_SHIFT)
+#define CVMCTL_IPTI_SHIFT      4
+#define CVMCTL_IPTI            (_U64CAST_(0x7) << CVMCTL_IPTI_SHIFT)
+
+/* CvmMemCtl2 register field definitions */
+#define CVMMEMCTL2_INHIBITTS   (_U64CAST_(1) << 17)
+
+/* CvmVMConfig register field definitions */
+#define CVMVMCONF_DGHT         (_U64CAST_(1) << 60)
+#define CVMVMCONF_MMUSIZEM1_S  12
+#define CVMVMCONF_MMUSIZEM1    (_U64CAST_(0xff) << CVMVMCONF_MMUSIZEM1_S)
+#define CVMVMCONF_RMMUSIZEM1_S 0
+#define CVMVMCONF_RMMUSIZEM1   (_U64CAST_(0xff) << CVMVMCONF_RMMUSIZEM1_S)
+
 /*
  * Coprocessor 1 (FPU) register names
  */
@@ -1720,6 +1749,13 @@ do {                                                                     \
 
 #define read_c0_cvmmemctl()    __read_64bit_c0_register($11, 7)
 #define write_c0_cvmmemctl(val) __write_64bit_c0_register($11, 7, val)
+
+#define read_c0_cvmmemctl2()   __read_64bit_c0_register($16, 6)
+#define write_c0_cvmmemctl2(val) __write_64bit_c0_register($16, 6, val)
+
+#define read_c0_cvmvmconfig()  __read_64bit_c0_register($16, 7)
+#define write_c0_cvmvmconfig(val) __write_64bit_c0_register($16, 7, val)
+
 /*
  * The cacheerr registers are not standardized.         On OCTEON, they are
  * 64 bits wide.
@@ -1989,6 +2025,8 @@ do {                                                                      \
 #define read_gc0_epc()                 __read_ulong_gc0_register(14, 0)
 #define write_gc0_epc(val)             __write_ulong_gc0_register(14, 0, val)
 
+#define read_gc0_prid()                        __read_32bit_gc0_register(15, 0)
+
 #define read_gc0_ebase()               __read_32bit_gc0_register(15, 1)
 #define write_gc0_ebase(val)           __write_32bit_gc0_register(15, 1, val)
 
@@ -2012,6 +2050,9 @@ do {                                                                      \
 #define write_gc0_config6(val)         __write_32bit_gc0_register(16, 6, val)
 #define write_gc0_config7(val)         __write_32bit_gc0_register(16, 7, val)
 
+#define read_gc0_lladdr()              __read_ulong_gc0_register(17, 0)
+#define write_gc0_lladdr(val)          __write_ulong_gc0_register(17, 0, val)
+
 #define read_gc0_watchlo0()            __read_ulong_gc0_register(18, 0)
 #define read_gc0_watchlo1()            __read_ulong_gc0_register(18, 1)
 #define read_gc0_watchlo2()            __read_ulong_gc0_register(18, 2)
@@ -2090,6 +2131,19 @@ do {                                                                     \
 #define write_gc0_kscratch5(val)       __write_ulong_gc0_register(31, 6, val)
 #define write_gc0_kscratch6(val)       __write_ulong_gc0_register(31, 7, val)
 
+/* Cavium OCTEON (cnMIPS) */
+#define read_gc0_cvmcount()            __read_ulong_gc0_register(9, 6)
+#define write_gc0_cvmcount(val)                __write_ulong_gc0_register(9, 6, val)
+
+#define read_gc0_cvmctl()              __read_64bit_gc0_register(9, 7)
+#define write_gc0_cvmctl(val)          __write_64bit_gc0_register(9, 7, val)
+
+#define read_gc0_cvmmemctl()           __read_64bit_gc0_register(11, 7)
+#define write_gc0_cvmmemctl(val)       __write_64bit_gc0_register(11, 7, val)
+
+#define read_gc0_cvmmemctl2()          __read_64bit_gc0_register(16, 6)
+#define write_gc0_cvmmemctl2(val)      __write_64bit_gc0_register(16, 6, val)
+
 /*
  * Macros to access the floating point coprocessor control registers
  */
@@ -2696,9 +2750,11 @@ __BUILD_SET_C0(brcm_mode)
  */
 #define __BUILD_SET_GC0(name)  __BUILD_SET_COMMON(gc0_##name)
 
+__BUILD_SET_GC0(wired)
 __BUILD_SET_GC0(status)
 __BUILD_SET_GC0(cause)
 __BUILD_SET_GC0(ebase)
+__BUILD_SET_GC0(config1)
 
 /*
  * Return low 10 bits of ebase.
index dd179fd8acdac4a6e9f47257f79263f4a6432c46..939734de435908cf5a37d04dc904900943e72a93 100644 (file)
  */
 #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
 
-#define UNIQUE_ENTRYHI(idx)                                            \
-               ((CKSEG0 + ((idx) << (PAGE_SHIFT + 1))) |               \
+#define _UNIQUE_ENTRYHI(base, idx)                                     \
+               (((base) + ((idx) << (PAGE_SHIFT + 1))) |               \
                 (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0))
+#define UNIQUE_ENTRYHI(idx)            _UNIQUE_ENTRYHI(CKSEG0, idx)
+#define UNIQUE_GUEST_ENTRYHI(idx)      _UNIQUE_ENTRYHI(CKSEG1, idx)
 
 static inline unsigned int num_wired_entries(void)
 {
index 77429d1622b343aed9f6a0d1c81f5ed4eacadffd..b5e46ae872d3b55214f16e1249a13a86387c7aca 100644 (file)
@@ -179,7 +179,7 @@ enum cop0_coi_func {
        tlbr_op       = 0x01, tlbwi_op      = 0x02,
        tlbwr_op      = 0x06, tlbp_op       = 0x08,
        rfe_op        = 0x10, eret_op       = 0x18,
-       wait_op       = 0x20,
+       wait_op       = 0x20, hypcall_op    = 0x28
 };
 
 /*
index a8a0199bf7601aeaf01538c8de07b0bae76ca151..0318c6b442ab4df3f3af39c24ea4cb66c13902a3 100644 (file)
@@ -21,6 +21,8 @@
 
 #define __KVM_HAVE_READONLY_MEM
 
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 /*
  * for KVM_GET_REGS and KVM_SET_REGS
  *
@@ -54,9 +56,14 @@ struct kvm_fpu {
  * Register set = 0: GP registers from kvm_regs (see definitions below).
  *
  * Register set = 1: CP0 registers.
- *  bits[15..8]  - Must be zero.
- *  bits[7..3]   - Register 'rd'  index.
- *  bits[2..0]   - Register 'sel' index.
+ *  bits[15..8]  - COP0 register set.
+ *
+ *  COP0 register set = 0: Main CP0 registers.
+ *   bits[7..3]   - Register 'rd'  index.
+ *   bits[2..0]   - Register 'sel' index.
+ *
+ *  COP0 register set = 1: MAARs.
+ *   bits[7..0]   - MAAR index.
  *
  * Register set = 2: KVM specific registers (see definitions below).
  *
@@ -114,6 +121,15 @@ struct kvm_fpu {
 #define KVM_REG_MIPS_PC                (KVM_REG_MIPS_GP | KVM_REG_SIZE_U64 | 34)
 
 
+/*
+ * KVM_REG_MIPS_CP0 - Coprocessor 0 registers.
+ */
+
+#define KVM_REG_MIPS_MAAR      (KVM_REG_MIPS_CP0 | (1 << 8))
+#define KVM_REG_MIPS_CP0_MAAR(n)       (KVM_REG_MIPS_MAAR | \
+                                        KVM_REG_SIZE_U64 | (n))
+
+
 /*
  * KVM_REG_MIPS_KVM - KVM specific control registers.
  */
index 07718bb5fc9d8612d98cc78d46a7ca10851f8245..c72a4cda389ce8cfb257a0299b8a5e22ac9132bf 100644 (file)
@@ -289,6 +289,8 @@ static void cpu_set_fpu_opts(struct cpuinfo_mips *c)
                            MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) {
                if (c->fpu_id & MIPS_FPIR_3D)
                        c->ases |= MIPS_ASE_MIPS3D;
+               if (c->fpu_id & MIPS_FPIR_UFRP)
+                       c->options |= MIPS_CPU_UFR;
                if (c->fpu_id & MIPS_FPIR_FREP)
                        c->options |= MIPS_CPU_FRE;
        }
@@ -1003,7 +1005,8 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c)
        unsigned int config3, config3_dyn;
 
        probe_gc0_config_dyn(config3, config3, config3_dyn,
-                            MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_CTXTC);
+                            MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_ULRI |
+                            MIPS_CONF3_CTXTC);
 
        if (config3 & MIPS_CONF3_CTXTC)
                c->guest.options |= MIPS_CPU_CTXTC;
@@ -1013,6 +1016,9 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c)
        if (config3 & MIPS_CONF3_PW)
                c->guest.options |= MIPS_CPU_HTW;
 
+       if (config3 & MIPS_CONF3_ULRI)
+               c->guest.options |= MIPS_CPU_ULRI;
+
        if (config3 & MIPS_CONF3_SC)
                c->guest.options |= MIPS_CPU_SEGMENTS;
 
@@ -1051,7 +1057,7 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c)
        unsigned int config5, config5_dyn;
 
        probe_gc0_config_dyn(config5, config5, config5_dyn,
-                        MIPS_CONF_M | MIPS_CONF5_MRP);
+                        MIPS_CONF_M | MIPS_CONF5_MVH | MIPS_CONF5_MRP);
 
        if (config5 & MIPS_CONF5_MRP)
                c->guest.options |= MIPS_CPU_MAAR;
@@ -1061,6 +1067,9 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c)
        if (config5 & MIPS_CONF5_LLB)
                c->guest.options |= MIPS_CPU_RW_LLB;
 
+       if (config5 & MIPS_CONF5_MVH)
+               c->guest.options |= MIPS_CPU_MVH;
+
        if (config5 & MIPS_CONF_M)
                c->guest.conf |= BIT(6);
        return config5 & MIPS_CONF_M;
index a7f81261c781f0d935fbf70feffdeca3d898703a..c036157fb891ff7e9a76c461b5ed0873c56743b9 100644 (file)
@@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq);
  */
 
 unsigned int mips_hpt_frequency;
+EXPORT_SYMBOL_GPL(mips_hpt_frequency);
 
 /*
  * This function exists in order to cause an error due to a duplicate
index 65067327db122bac7a98d318c21a1c2ca99f6aa1..50a722dfb236d7edbaf99ab51dd65986e250dc08 100644 (file)
@@ -26,11 +26,34 @@ config KVM
        select SRCU
        ---help---
          Support for hosting Guest kernels.
-         Currently supported on MIPS32 processors.
+
+choice
+       prompt "Virtualization mode"
+       depends on KVM
+       default KVM_MIPS_TE
+
+config KVM_MIPS_TE
+       bool "Trap & Emulate"
+       ---help---
+         Use trap and emulate to virtualize 32-bit guests in user mode. This
+         does not require any special hardware Virtualization support beyond
+         standard MIPS32/64 r2 or later, but it does require the guest kernel
+         to be configured with CONFIG_KVM_GUEST=y so that it resides in the
+         user address segment.
+
+config KVM_MIPS_VZ
+       bool "MIPS Virtualization (VZ) ASE"
+       ---help---
+         Use the MIPS Virtualization (VZ) ASE to virtualize guests. This
+         supports running unmodified guest kernels (with CONFIG_KVM_GUEST=n),
+         but requires hardware support.
+
+endchoice
 
 config KVM_MIPS_DYN_TRANS
        bool "KVM/MIPS: Dynamic binary translation to reduce traps"
-       depends on KVM
+       depends on KVM_MIPS_TE
+       default y
        ---help---
          When running in Trap & Emulate mode patch privileged
          instructions to reduce the number of traps.
index 847429de780d3b948fe9cb296e40223188af715e..45d90f5d5177a0783842e0b6137cc8d1c38db2d0 100644 (file)
@@ -9,8 +9,15 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
 
 kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
            interrupt.o stats.o commpage.o \
-           dyntrans.o trap_emul.o fpu.o
+           fpu.o
+kvm-objs += hypcall.o
 kvm-objs += mmu.o
 
+ifdef CONFIG_KVM_MIPS_VZ
+kvm-objs               += vz.o
+else
+kvm-objs               += dyntrans.o
+kvm-objs               += trap_emul.o
+endif
 obj-$(CONFIG_KVM)      += kvm.o
 obj-y                  += callback.o tlb.o
index d40cfaad45295c8631f306a96fbb0d713e2a864a..34e78a3ee9d79ece862574274f4ce11fcf3e6d7a 100644 (file)
@@ -308,7 +308,7 @@ int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
  *             CP0_Cause.DC bit or the count_ctl.DC bit.
  *             0 otherwise (in which case CP0_Count timer is running).
  */
-static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
+int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
 
@@ -467,7 +467,7 @@ u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
  *
  * Returns:    The ktime at the point of freeze.
  */
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
+ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
 {
        ktime_t now;
 
@@ -516,6 +516,82 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
        hrtimer_start(&vcpu->arch.comparecount_timer, expire, HRTIMER_MODE_ABS);
 }
 
+/**
+ * kvm_mips_restore_hrtimer() - Restore hrtimer after a gap, updating expiry.
+ * @vcpu:      Virtual CPU.
+ * @before:    Time before Count was saved, lower bound of drift calculation.
+ * @count:     CP0_Count at point of restore.
+ * @min_drift: Minimum amount of drift permitted before correction.
+ *             Must be <= 0.
+ *
+ * Restores the timer from a particular @count, accounting for drift. This can
+ * be used in conjunction with kvm_mips_freeze_timer() when a hardware timer is
+ * to be used for a period of time, but the exact ktime corresponding to the
+ * final Count that must be restored is not known.
+ *
+ * It is guaranteed that a timer interrupt immediately after restore will be
+ * handled, but not if CP0_Compare is exactly at @count. That case should
+ * already be handled when the hardware timer state is saved.
+ *
+ * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is not
+ * stopped).
+ *
+ * Returns:    Amount of correction to count_bias due to drift.
+ */
+int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before,
+                            u32 count, int min_drift)
+{
+       ktime_t now, count_time;
+       u32 now_count, before_count;
+       u64 delta;
+       int drift, ret = 0;
+
+       /* Calculate expected count at before */
+       before_count = vcpu->arch.count_bias +
+                       kvm_mips_ktime_to_count(vcpu, before);
+
+       /*
+        * Detect significantly negative drift, where count is lower than
+        * expected. Some negative drift is expected when hardware counter is
+        * set after kvm_mips_freeze_timer(), and it is harmless to allow the
+        * time to jump forwards a little, within reason. If the drift is too
+        * significant, adjust the bias to avoid a big Guest.CP0_Count jump.
+        */
+       drift = count - before_count;
+       if (drift < min_drift) {
+               count_time = before;
+               vcpu->arch.count_bias += drift;
+               ret = drift;
+               goto resume;
+       }
+
+       /* Calculate expected count right now */
+       now = ktime_get();
+       now_count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now);
+
+       /*
+        * Detect positive drift, where count is higher than expected, and
+        * adjust the bias to avoid guest time going backwards.
+        */
+       drift = count - now_count;
+       if (drift > 0) {
+               count_time = now;
+               vcpu->arch.count_bias += drift;
+               ret = drift;
+               goto resume;
+       }
+
+       /* Subtract nanosecond delta to find ktime when count was read */
+       delta = (u64)(u32)(now_count - count);
+       delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
+       count_time = ktime_sub_ns(now, delta);
+
+resume:
+       /* Resume using the calculated ktime */
+       kvm_mips_resume_hrtimer(vcpu, count_time, count);
+       return ret;
+}
+
 /**
  * kvm_mips_write_count() - Modify the count and update timer.
  * @vcpu:      Virtual CPU.
@@ -543,16 +619,15 @@ void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
 /**
  * kvm_mips_init_count() - Initialise timer.
  * @vcpu:      Virtual CPU.
+ * @count_hz:  Frequency of timer.
  *
- * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set
- * it going if it's enabled.
+ * Initialise the timer to the specified frequency, zero it, and set it going if
+ * it's enabled.
  */
-void kvm_mips_init_count(struct kvm_vcpu *vcpu)
+void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz)
 {
-       /* 100 MHz */
-       vcpu->arch.count_hz = 100*1000*1000;
-       vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32,
-                                         vcpu->arch.count_hz);
+       vcpu->arch.count_hz = count_hz;
+       vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz);
        vcpu->arch.count_dyn_bias = 0;
 
        /* Starting at 0 */
@@ -622,7 +697,9 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
        struct mips_coproc *cop0 = vcpu->arch.cop0;
        int dc;
        u32 old_compare = kvm_read_c0_guest_compare(cop0);
-       ktime_t now;
+       s32 delta = compare - old_compare;
+       u32 cause;
+       ktime_t now = ktime_set(0, 0); /* silence bogus GCC warning */
        u32 count;
 
        /* if unchanged, must just be an ack */
@@ -634,6 +711,21 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
                return;
        }
 
+       /*
+        * If guest CP0_Compare moves forward, CP0_GTOffset should be adjusted
+        * too to prevent guest CP0_Count hitting guest CP0_Compare.
+        *
+        * The new GTOffset corresponds to the new value of CP0_Compare, and is
+        * set prior to it being written into the guest context. We disable
+        * preemption until the new value is written to prevent restore of a
+        * GTOffset corresponding to the old CP0_Compare value.
+        */
+       if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta > 0) {
+               preempt_disable();
+               write_c0_gtoffset(compare - read_c0_count());
+               back_to_back_c0_hazard();
+       }
+
        /* freeze_hrtimer() takes care of timer interrupts <= count */
        dc = kvm_mips_count_disabled(vcpu);
        if (!dc)
@@ -641,12 +733,36 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
 
        if (ack)
                kvm_mips_callbacks->dequeue_timer_int(vcpu);
+       else if (IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+               /*
+                * With VZ, writing CP0_Compare acks (clears) CP0_Cause.TI, so
+                * preserve guest CP0_Cause.TI if we don't want to ack it.
+                */
+               cause = kvm_read_c0_guest_cause(cop0);
 
        kvm_write_c0_guest_compare(cop0, compare);
 
+       if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+               if (delta > 0)
+                       preempt_enable();
+
+               back_to_back_c0_hazard();
+
+               if (!ack && cause & CAUSEF_TI)
+                       kvm_write_c0_guest_cause(cop0, cause);
+       }
+
        /* resume_hrtimer() takes care of timer interrupts > count */
        if (!dc)
                kvm_mips_resume_hrtimer(vcpu, now, count);
+
+       /*
+        * If guest CP0_Compare is moving backward, we delay CP0_GTOffset change
+        * until after the new CP0_Compare is written, otherwise new guest
+        * CP0_Count could hit new guest CP0_Compare.
+        */
+       if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta <= 0)
+               write_c0_gtoffset(compare - read_c0_count());
 }
 
 /**
@@ -857,6 +973,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
        ++vcpu->stat.wait_exits;
        trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
        if (!vcpu->arch.pending_exceptions) {
+               kvm_vz_lose_htimer(vcpu);
                vcpu->arch.wait = 1;
                kvm_vcpu_block(vcpu);
 
@@ -873,17 +990,62 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
        return EMULATE_DONE;
 }
 
-/*
- * XXXKYMA: Linux doesn't seem to use TLBR, return EMULATE_FAIL for now so that
- * we can catch this, if things ever change
- */
+static void kvm_mips_change_entryhi(struct kvm_vcpu *vcpu,
+                                   unsigned long entryhi)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+       int cpu, i;
+       u32 nasid = entryhi & KVM_ENTRYHI_ASID;
+
+       if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) {
+               trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) &
+                                     KVM_ENTRYHI_ASID, nasid);
+
+               /*
+                * Flush entries from the GVA page tables.
+                * Guest user page table will get flushed lazily on re-entry to
+                * guest user if the guest ASID actually changes.
+                */
+               kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_KERN);
+
+               /*
+                * Regenerate/invalidate kernel MMU context.
+                * The user MMU context will be regenerated lazily on re-entry
+                * to guest user if the guest ASID actually changes.
+                */
+               preempt_disable();
+               cpu = smp_processor_id();
+               get_new_mmu_context(kern_mm, cpu);
+               for_each_possible_cpu(i)
+                       if (i != cpu)
+                               cpu_context(i, kern_mm) = 0;
+               preempt_enable();
+       }
+       kvm_write_c0_guest_entryhi(cop0, entryhi);
+}
+
 enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
+       struct kvm_mips_tlb *tlb;
        unsigned long pc = vcpu->arch.pc;
+       int index;
 
-       kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
-       return EMULATE_FAIL;
+       index = kvm_read_c0_guest_index(cop0);
+       if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
+               /* UNDEFINED */
+               kvm_debug("[%#lx] TLBR Index %#x out of range\n", pc, index);
+               index &= KVM_MIPS_GUEST_TLB_SIZE - 1;
+       }
+
+       tlb = &vcpu->arch.guest_tlb[index];
+       kvm_write_c0_guest_pagemask(cop0, tlb->tlb_mask);
+       kvm_write_c0_guest_entrylo0(cop0, tlb->tlb_lo[0]);
+       kvm_write_c0_guest_entrylo1(cop0, tlb->tlb_lo[1]);
+       kvm_mips_change_entryhi(vcpu, tlb->tlb_hi);
+
+       return EMULATE_DONE;
 }
 
 /**
@@ -1105,11 +1267,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                                           struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
-       struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
        enum emulation_result er = EMULATE_DONE;
        u32 rt, rd, sel;
        unsigned long curr_pc;
-       int cpu, i;
 
        /*
         * Update PC and hold onto current PC in case there is
@@ -1143,6 +1303,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                case wait_op:
                        er = kvm_mips_emul_wait(vcpu);
                        break;
+               case hypcall_op:
+                       er = kvm_mips_emul_hypcall(vcpu, inst);
+                       break;
                }
        } else {
                rt = inst.c0r_format.rt;
@@ -1208,44 +1371,8 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
                                kvm_change_c0_guest_ebase(cop0, 0x1ffff000,
                                                          vcpu->arch.gprs[rt]);
                        } else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
-                               u32 nasid =
-                                       vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
-                               if (((kvm_read_c0_guest_entryhi(cop0) &
-                                     KVM_ENTRYHI_ASID) != nasid)) {
-                                       trace_kvm_asid_change(vcpu,
-                                               kvm_read_c0_guest_entryhi(cop0)
-                                                       & KVM_ENTRYHI_ASID,
-                                               nasid);
-
-                                       /*
-                                        * Flush entries from the GVA page
-                                        * tables.
-                                        * Guest user page table will get
-                                        * flushed lazily on re-entry to guest
-                                        * user if the guest ASID actually
-                                        * changes.
-                                        */
-                                       kvm_mips_flush_gva_pt(kern_mm->pgd,
-                                                             KMF_KERN);
-
-                                       /*
-                                        * Regenerate/invalidate kernel MMU
-                                        * context.
-                                        * The user MMU context will be
-                                        * regenerated lazily on re-entry to
-                                        * guest user if the guest ASID actually
-                                        * changes.
-                                        */
-                                       preempt_disable();
-                                       cpu = smp_processor_id();
-                                       get_new_mmu_context(kern_mm, cpu);
-                                       for_each_possible_cpu(i)
-                                               if (i != cpu)
-                                                       cpu_context(i, kern_mm) = 0;
-                                       preempt_enable();
-                               }
-                               kvm_write_c0_guest_entryhi(cop0,
-                                                          vcpu->arch.gprs[rt]);
+                               kvm_mips_change_entryhi(vcpu,
+                                                       vcpu->arch.gprs[rt]);
                        }
                        /* Are we writing to COUNT */
                        else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
@@ -1474,9 +1601,8 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
                                             struct kvm_run *run,
                                             struct kvm_vcpu *vcpu)
 {
-       enum emulation_result er = EMULATE_DO_MMIO;
+       enum emulation_result er;
        u32 rt;
-       u32 bytes;
        void *data = run->mmio.data;
        unsigned long curr_pc;
 
@@ -1491,103 +1617,74 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
 
        rt = inst.i_format.rt;
 
+       run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
+                                               vcpu->arch.host_cp0_badvaddr);
+       if (run->mmio.phys_addr == KVM_INVALID_ADDR)
+               goto out_fail;
+
        switch (inst.i_format.opcode) {
-       case sb_op:
-               bytes = 1;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
-               run->mmio.len = bytes;
-               run->mmio.is_write = 1;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_is_write = 1;
-               *(u8 *) data = vcpu->arch.gprs[rt];
-               kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
-                         vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
-                         *(u8 *) data);
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
+       case sd_op:
+               run->mmio.len = 8;
+               *(u64 *)data = vcpu->arch.gprs[rt];
 
+               kvm_debug("[%#lx] OP_SD: eaddr: %#lx, gpr: %#lx, data: %#llx\n",
+                         vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
+                         vcpu->arch.gprs[rt], *(u64 *)data);
                break;
+#endif
 
        case sw_op:
-               bytes = 4;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
-
-               run->mmio.len = bytes;
-               run->mmio.is_write = 1;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_is_write = 1;
-               *(u32 *) data = vcpu->arch.gprs[rt];
+               run->mmio.len = 4;
+               *(u32 *)data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(u32 *) data);
+                         vcpu->arch.gprs[rt], *(u32 *)data);
                break;
 
        case sh_op:
-               bytes = 2;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
-
-               run->mmio.len = bytes;
-               run->mmio.is_write = 1;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_is_write = 1;
-               *(u16 *) data = vcpu->arch.gprs[rt];
+               run->mmio.len = 2;
+               *(u16 *)data = vcpu->arch.gprs[rt];
 
                kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
                          vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
-                         vcpu->arch.gprs[rt], *(u32 *) data);
+                         vcpu->arch.gprs[rt], *(u16 *)data);
+               break;
+
+       case sb_op:
+               run->mmio.len = 1;
+               *(u8 *)data = vcpu->arch.gprs[rt];
+
+               kvm_debug("[%#lx] OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
+                         vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
+                         vcpu->arch.gprs[rt], *(u8 *)data);
                break;
 
        default:
                kvm_err("Store not yet supported (inst=0x%08x)\n",
                        inst.word);
-               er = EMULATE_FAIL;
-               break;
+               goto out_fail;
        }
 
-       /* Rollback PC if emulation was unsuccessful */
-       if (er == EMULATE_FAIL)
-               vcpu->arch.pc = curr_pc;
+       run->mmio.is_write = 1;
+       vcpu->mmio_needed = 1;
+       vcpu->mmio_is_write = 1;
+       return EMULATE_DO_MMIO;
 
-       return er;
+out_fail:
+       /* Rollback PC if emulation was unsuccessful */
+       vcpu->arch.pc = curr_pc;
+       return EMULATE_FAIL;
 }
 
 enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
                                            u32 cause, struct kvm_run *run,
                                            struct kvm_vcpu *vcpu)
 {
-       enum emulation_result er = EMULATE_DO_MMIO;
+       enum emulation_result er;
        unsigned long curr_pc;
        u32 op, rt;
-       u32 bytes;
 
        rt = inst.i_format.rt;
        op = inst.i_format.opcode;
@@ -1606,96 +1703,53 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
 
        vcpu->arch.io_gpr = rt;
 
+       run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa(
+                                               vcpu->arch.host_cp0_badvaddr);
+       if (run->mmio.phys_addr == KVM_INVALID_ADDR)
+               return EMULATE_FAIL;
+
+       vcpu->mmio_needed = 2;  /* signed */
        switch (op) {
-       case lw_op:
-               bytes = 4;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-                       er = EMULATE_FAIL;
-                       break;
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
+#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ)
+       case ld_op:
+               run->mmio.len = 8;
+               break;
 
-               run->mmio.len = bytes;
-               run->mmio.is_write = 0;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_is_write = 0;
+       case lwu_op:
+               vcpu->mmio_needed = 1;  /* unsigned */
+               /* fall through */
+#endif
+       case lw_op:
+               run->mmio.len = 4;
                break;
 
-       case lh_op:
        case lhu_op:
-               bytes = 2;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-                       er = EMULATE_FAIL;
-                       break;
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
-
-               run->mmio.len = bytes;
-               run->mmio.is_write = 0;
-               vcpu->mmio_needed = 1;
-               vcpu->mmio_is_write = 0;
-
-               if (op == lh_op)
-                       vcpu->mmio_needed = 2;
-               else
-                       vcpu->mmio_needed = 1;
-
+               vcpu->mmio_needed = 1;  /* unsigned */
+               /* fall through */
+       case lh_op:
+               run->mmio.len = 2;
                break;
 
        case lbu_op:
+               vcpu->mmio_needed = 1;  /* unsigned */
+               /* fall through */
        case lb_op:
-               bytes = 1;
-               if (bytes > sizeof(run->mmio.data)) {
-                       kvm_err("%s: bad MMIO length: %d\n", __func__,
-                              run->mmio.len);
-                       er = EMULATE_FAIL;
-                       break;
-               }
-               run->mmio.phys_addr =
-                   kvm_mips_callbacks->gva_to_gpa(vcpu->arch.
-                                                  host_cp0_badvaddr);
-               if (run->mmio.phys_addr == KVM_INVALID_ADDR) {
-                       er = EMULATE_FAIL;
-                       break;
-               }
-
-               run->mmio.len = bytes;
-               run->mmio.is_write = 0;
-               vcpu->mmio_is_write = 0;
-
-               if (op == lb_op)
-                       vcpu->mmio_needed = 2;
-               else
-                       vcpu->mmio_needed = 1;
-
+               run->mmio.len = 1;
                break;
 
        default:
                kvm_err("Load not yet supported (inst=0x%08x)\n",
                        inst.word);
-               er = EMULATE_FAIL;
-               break;
+               vcpu->mmio_needed = 0;
+               return EMULATE_FAIL;
        }
 
-       return er;
+       run->mmio.is_write = 0;
+       vcpu->mmio_is_write = 0;
+       return EMULATE_DO_MMIO;
 }
 
+#ifndef CONFIG_KVM_MIPS_VZ
 static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
                                                     unsigned long curr_pc,
                                                     unsigned long addr,
@@ -1786,11 +1840,35 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
                          vcpu->arch.pc, vcpu->arch.gprs[31], cache, op, base,
                          arch->gprs[base], offset);
 
-               if (cache == Cache_D)
+               if (cache == Cache_D) {
+#ifdef CONFIG_CPU_R4K_CACHE_TLB
                        r4k_blast_dcache();
-               else if (cache == Cache_I)
+#else
+                       switch (boot_cpu_type()) {
+                       case CPU_CAVIUM_OCTEON3:
+                               /* locally flush icache */
+                               local_flush_icache_range(0, 0);
+                               break;
+                       default:
+                               __flush_cache_all();
+                               break;
+                       }
+#endif
+               } else if (cache == Cache_I) {
+#ifdef CONFIG_CPU_R4K_CACHE_TLB
                        r4k_blast_icache();
-               else {
+#else
+                       switch (boot_cpu_type()) {
+                       case CPU_CAVIUM_OCTEON3:
+                               /* locally flush icache */
+                               local_flush_icache_range(0, 0);
+                               break;
+                       default:
+                               flush_icache_all();
+                               break;
+                       }
+#endif
+               } else {
                        kvm_err("%s: unsupported CACHE INDEX operation\n",
                                __func__);
                        return EMULATE_FAIL;
@@ -1870,18 +1948,6 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
        case cop0_op:
                er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
                break;
-       case sb_op:
-       case sh_op:
-       case sw_op:
-               er = kvm_mips_emulate_store(inst, cause, run, vcpu);
-               break;
-       case lb_op:
-       case lbu_op:
-       case lhu_op:
-       case lh_op:
-       case lw_op:
-               er = kvm_mips_emulate_load(inst, cause, run, vcpu);
-               break;
 
 #ifndef CONFIG_CPU_MIPSR6
        case cache_op:
@@ -1915,6 +1981,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
 
        return er;
 }
+#endif /* CONFIG_KVM_MIPS_VZ */
 
 /**
  * kvm_mips_guest_exception_base() - Find guest exception vector base address.
@@ -2524,8 +2591,15 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
        vcpu->arch.pc = vcpu->arch.io_pc;
 
        switch (run->mmio.len) {
+       case 8:
+               *gpr = *(s64 *)run->mmio.data;
+               break;
+
        case 4:
-               *gpr = *(s32 *) run->mmio.data;
+               if (vcpu->mmio_needed == 2)
+                       *gpr = *(s32 *)run->mmio.data;
+               else
+                       *gpr = *(u32 *)run->mmio.data;
                break;
 
        case 2:
index c5b254c4d0da89102a5c26964d9d567afe80c093..16e1c93b484f43460f373b6887d6ac2b05107226 100644 (file)
 #define RA             31
 
 /* Some CP0 registers */
+#define C0_PWBASE      5, 5
 #define C0_HWRENA      7, 0
 #define C0_BADVADDR    8, 0
 #define C0_BADINSTR    8, 1
 #define C0_BADINSTRP   8, 2
 #define C0_ENTRYHI     10, 0
+#define C0_GUESTCTL1   10, 4
 #define C0_STATUS      12, 0
+#define C0_GUESTCTL0   12, 6
 #define C0_CAUSE       13, 0
 #define C0_EPC         14, 0
 #define C0_EBASE       15, 1
@@ -292,8 +295,8 @@ static void *kvm_mips_build_enter_guest(void *addr)
        unsigned int i;
        struct uasm_label labels[2];
        struct uasm_reloc relocs[2];
-       struct uasm_label *l = labels;
-       struct uasm_reloc *r = relocs;
+       struct uasm_label __maybe_unused *l = labels;
+       struct uasm_reloc __maybe_unused *r = relocs;
 
        memset(labels, 0, sizeof(labels));
        memset(relocs, 0, sizeof(relocs));
@@ -302,7 +305,67 @@ static void *kvm_mips_build_enter_guest(void *addr)
        UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
        UASM_i_MTC0(&p, T0, C0_EPC);
 
-       /* Set the ASID for the Guest Kernel */
+#ifdef CONFIG_KVM_MIPS_VZ
+       /* Save normal linux process pgd (VZ guarantees pgd_reg is set) */
+       UASM_i_MFC0(&p, K0, c0_kscratch(), pgd_reg);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_pgd), K1);
+
+       /*
+        * Set up KVM GPA pgd.
+        * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+        * - call tlbmiss_handler_setup_pgd(mm->pgd)
+        * - write mm->pgd into CP0_PWBase
+        *
+        * We keep S0 pointing at struct kvm so we can load the ASID below.
+        */
+       UASM_i_LW(&p, S0, (int)offsetof(struct kvm_vcpu, kvm) -
+                         (int)offsetof(struct kvm_vcpu, arch), K1);
+       UASM_i_LW(&p, A0, offsetof(struct kvm, arch.gpa_mm.pgd), S0);
+       UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+       uasm_i_jalr(&p, RA, T9);
+       /* delay slot */
+       if (cpu_has_htw)
+               UASM_i_MTC0(&p, A0, C0_PWBASE);
+       else
+               uasm_i_nop(&p);
+
+       /* Set GM bit to setup eret to VZ guest context */
+       uasm_i_addiu(&p, V1, ZERO, 1);
+       uasm_i_mfc0(&p, K0, C0_GUESTCTL0);
+       uasm_i_ins(&p, K0, V1, MIPS_GCTL0_GM_SHIFT, 1);
+       uasm_i_mtc0(&p, K0, C0_GUESTCTL0);
+
+       if (cpu_has_guestid) {
+               /*
+                * Set root mode GuestID, so that root TLB refill handler can
+                * use the correct GuestID in the root TLB.
+                */
+
+               /* Get current GuestID */
+               uasm_i_mfc0(&p, T0, C0_GUESTCTL1);
+               /* Set GuestCtl1.RID = GuestCtl1.ID */
+               uasm_i_ext(&p, T1, T0, MIPS_GCTL1_ID_SHIFT,
+                          MIPS_GCTL1_ID_WIDTH);
+               uasm_i_ins(&p, T0, T1, MIPS_GCTL1_RID_SHIFT,
+                          MIPS_GCTL1_RID_WIDTH);
+               uasm_i_mtc0(&p, T0, C0_GUESTCTL1);
+
+               /* GuestID handles dealiasing so we don't need to touch ASID */
+               goto skip_asid_restore;
+       }
+
+       /* Root ASID Dealias (RAD) */
+
+       /* Save host ASID */
+       UASM_i_MFC0(&p, K0, C0_ENTRYHI);
+       UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi),
+                 K1);
+
+       /* Set the root ASID for the Guest */
+       UASM_i_ADDIU(&p, T1, S0,
+                    offsetof(struct kvm, arch.gpa_mm.context.asid));
+#else
+       /* Set the ASID for the Guest Kernel or User */
        UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
        UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
                  T0);
@@ -315,6 +378,7 @@ static void *kvm_mips_build_enter_guest(void *addr)
        UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
                                          guest_user_mm.context.asid));
        uasm_l_kernel_asid(&l, p);
+#endif
 
        /* t1: contains the base of the ASID array, need to get the cpu id  */
        /* smp_processor_id */
@@ -339,6 +403,7 @@ static void *kvm_mips_build_enter_guest(void *addr)
        uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
 #endif
 
+#ifndef CONFIG_KVM_MIPS_VZ
        /*
         * Set up KVM T&E GVA pgd.
         * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
@@ -351,7 +416,11 @@ static void *kvm_mips_build_enter_guest(void *addr)
        UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
        uasm_i_jalr(&p, RA, T9);
         uasm_i_mtc0(&p, K0, C0_ENTRYHI);
-
+#else
+       /* Set up KVM VZ root ASID (!guestid) */
+       uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+skip_asid_restore:
+#endif
        uasm_i_ehb(&p);
 
        /* Disable RDHWR access */
@@ -559,13 +628,10 @@ void *kvm_mips_build_exit(void *addr)
        /* Now that context has been saved, we can use other registers */
 
        /* Restore vcpu */
-       UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
-       uasm_i_move(&p, S1, A1);
+       UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
 
        /* Restore run (vcpu->run) */
-       UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
-       /* Save pointer to run in s0, will be saved by the compiler */
-       uasm_i_move(&p, S0, A0);
+       UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1);
 
        /*
         * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
@@ -641,6 +707,52 @@ void *kvm_mips_build_exit(void *addr)
                uasm_l_msa_1(&l, p);
        }
 
+#ifdef CONFIG_KVM_MIPS_VZ
+       /* Restore host ASID */
+       if (!cpu_has_guestid) {
+               UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi),
+                         K1);
+               UASM_i_MTC0(&p, K0, C0_ENTRYHI);
+       }
+
+       /*
+        * Set up normal Linux process pgd.
+        * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+        * - call tlbmiss_handler_setup_pgd(mm->pgd)
+        * - write mm->pgd into CP0_PWBase
+        */
+       UASM_i_LW(&p, A0,
+                 offsetof(struct kvm_vcpu_arch, host_pgd), K1);
+       UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+       uasm_i_jalr(&p, RA, T9);
+       /* delay slot */
+       if (cpu_has_htw)
+               UASM_i_MTC0(&p, A0, C0_PWBASE);
+       else
+               uasm_i_nop(&p);
+
+       /* Clear GM bit so we don't enter guest mode when EXL is cleared */
+       uasm_i_mfc0(&p, K0, C0_GUESTCTL0);
+       uasm_i_ins(&p, K0, ZERO, MIPS_GCTL0_GM_SHIFT, 1);
+       uasm_i_mtc0(&p, K0, C0_GUESTCTL0);
+
+       /* Save GuestCtl0 so we can access GExcCode after CPU migration */
+       uasm_i_sw(&p, K0,
+                 offsetof(struct kvm_vcpu_arch, host_cp0_guestctl0), K1);
+
+       if (cpu_has_guestid) {
+               /*
+                * Clear root mode GuestID, so that root TLB operations use the
+                * root GuestID in the root TLB.
+                */
+               uasm_i_mfc0(&p, T0, C0_GUESTCTL1);
+               /* Set GuestCtl1.RID = MIPS_GCTL1_ROOT_GUESTID (i.e. 0) */
+               uasm_i_ins(&p, T0, ZERO, MIPS_GCTL1_RID_SHIFT,
+                          MIPS_GCTL1_RID_WIDTH);
+               uasm_i_mtc0(&p, T0, C0_GUESTCTL1);
+       }
+#endif
+
        /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
        uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
        uasm_i_and(&p, V0, V0, AT);
@@ -680,6 +792,8 @@ void *kvm_mips_build_exit(void *addr)
         * Now jump to the kvm_mips_handle_exit() to see if we can deal
         * with this in the kernel
         */
+       uasm_i_move(&p, A0, S0);
+       uasm_i_move(&p, A1, S1);
        UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
        uasm_i_jalr(&p, RA, T9);
         UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
diff --git a/arch/mips/kvm/hypcall.c b/arch/mips/kvm/hypcall.c
new file mode 100644 (file)
index 0000000..8306343
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Hypercall handling.
+ *
+ * Copyright (C) 2015  Imagination Technologies Ltd.
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_para.h>
+
+#define MAX_HYPCALL_ARGS       4
+
+enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu,
+                                           union mips_instruction inst)
+{
+       unsigned int code = (inst.co_format.code >> 5) & 0x3ff;
+
+       kvm_debug("[%#lx] HYPCALL %#03x\n", vcpu->arch.pc, code);
+
+       switch (code) {
+       case 0:
+               return EMULATE_HYPERCALL;
+       default:
+               return EMULATE_FAIL;
+       };
+}
+
+static int kvm_mips_hypercall(struct kvm_vcpu *vcpu, unsigned long num,
+                             const unsigned long *args, unsigned long *hret)
+{
+       /* Report unimplemented hypercall to guest */
+       *hret = -KVM_ENOSYS;
+       return RESUME_GUEST;
+}
+
+int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu)
+{
+       unsigned long num, args[MAX_HYPCALL_ARGS];
+
+       /* read hypcall number and arguments */
+       num = vcpu->arch.gprs[2];       /* v0 */
+       args[0] = vcpu->arch.gprs[4];   /* a0 */
+       args[1] = vcpu->arch.gprs[5];   /* a1 */
+       args[2] = vcpu->arch.gprs[6];   /* a2 */
+       args[3] = vcpu->arch.gprs[7];   /* a3 */
+
+       return kvm_mips_hypercall(vcpu, num,
+                                 args, &vcpu->arch.gprs[2] /* v0 */);
+}
index fb118a2c8379f8b97fceb68730f98f6bb6d6ba61..3bf0a49725e81ae4fb59081255595eed85f4fa43 100644 (file)
 
 #define C_TI        (_ULCAST_(1) << 30)
 
+#ifdef CONFIG_KVM_MIPS_VZ
+#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (1)
+#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (1)
+#else
 #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
 #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE   (0)
+#endif
 
 void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
 void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
index 15a1b1716c2eeed477ef2412cad09ece77abdd90..d4b2ad18eef2023d23701b0dd5a320d2370fee5d 100644 (file)
@@ -59,6 +59,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        { "fpe",          VCPU_STAT(fpe_exits),          KVM_STAT_VCPU },
        { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU },
        { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU },
+#ifdef CONFIG_KVM_MIPS_VZ
+       { "vz_gpsi",      VCPU_STAT(vz_gpsi_exits),      KVM_STAT_VCPU },
+       { "vz_gsfc",      VCPU_STAT(vz_gsfc_exits),      KVM_STAT_VCPU },
+       { "vz_hc",        VCPU_STAT(vz_hc_exits),        KVM_STAT_VCPU },
+       { "vz_grr",       VCPU_STAT(vz_grr_exits),       KVM_STAT_VCPU },
+       { "vz_gva",       VCPU_STAT(vz_gva_exits),       KVM_STAT_VCPU },
+       { "vz_ghfc",      VCPU_STAT(vz_ghfc_exits),      KVM_STAT_VCPU },
+       { "vz_gpa",       VCPU_STAT(vz_gpa_exits),       KVM_STAT_VCPU },
+       { "vz_resvd",     VCPU_STAT(vz_resvd_exits),     KVM_STAT_VCPU },
+#endif
        { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU },
        { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU },
        { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU },
@@ -66,6 +76,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
        {NULL}
 };
 
+bool kvm_trace_guest_mode_change;
+
+int kvm_guest_mode_change_trace_reg(void)
+{
+       kvm_trace_guest_mode_change = 1;
+       return 0;
+}
+
+void kvm_guest_mode_change_trace_unreg(void)
+{
+       kvm_trace_guest_mode_change = 0;
+}
+
 /*
  * XXXKYMA: We are simulating a processor that has the WII bit set in
  * Config7, so we are "runnable" if interrupts are pending
@@ -82,7 +105,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_hardware_enable(void)
 {
-       return 0;
+       return kvm_mips_callbacks->hardware_enable();
+}
+
+void kvm_arch_hardware_disable(void)
+{
+       kvm_mips_callbacks->hardware_disable();
 }
 
 int kvm_arch_hardware_setup(void)
@@ -97,6 +125,18 @@ void kvm_arch_check_processor_compat(void *rtn)
 
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
+       switch (type) {
+#ifdef CONFIG_KVM_MIPS_VZ
+       case KVM_VM_MIPS_VZ:
+#else
+       case KVM_VM_MIPS_TE:
+#endif
+               break;
+       default:
+               /* Unsupported KVM type */
+               return -EINVAL;
+       };
+
        /* Allocate page table to map GPA -> RPA */
        kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
        if (!kvm->arch.gpa_mm.pgd)
@@ -301,8 +341,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
        /* Build guest exception vectors dynamically in unmapped memory */
        handler = gebase + 0x2000;
 
-       /* TLB refill */
+       /* TLB refill (or XTLB refill on 64-bit VZ where KX=1) */
        refill_start = gebase;
+       if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && IS_ENABLED(CONFIG_64BIT))
+               refill_start += 0x080;
        refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler);
 
        /* General Exception Entry point */
@@ -353,9 +395,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 
        /* Init */
        vcpu->arch.last_sched_cpu = -1;
-
-       /* Start off the timer */
-       kvm_mips_init_count(vcpu);
+       vcpu->arch.last_exec_cpu = -1;
 
        return vcpu;
 
@@ -1030,9 +1070,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_IMMEDIATE_EXIT:
                r = 1;
                break;
-       case KVM_CAP_COALESCED_MMIO:
-               r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-               break;
        case KVM_CAP_NR_VCPUS:
                r = num_online_cpus();
                break;
@@ -1059,7 +1096,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF);
                break;
        default:
-               r = 0;
+               r = kvm_mips_callbacks->check_extension(kvm, ext);
                break;
        }
        return r;
@@ -1067,7 +1104,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-       return kvm_mips_pending_timer(vcpu);
+       return kvm_mips_pending_timer(vcpu) ||
+               kvm_read_c0_guest_cause(vcpu->arch.cop0) & C_TI;
 }
 
 int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
@@ -1092,7 +1130,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu)
        kvm_debug("\tlo: 0x%08lx\n", vcpu->arch.lo);
 
        cop0 = vcpu->arch.cop0;
-       kvm_debug("\tStatus: 0x%08lx, Cause: 0x%08lx\n",
+       kvm_debug("\tStatus: 0x%08x, Cause: 0x%08x\n",
                  kvm_read_c0_guest_status(cop0),
                  kvm_read_c0_guest_cause(cop0));
 
@@ -1208,7 +1246,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
        vcpu->mode = OUTSIDE_GUEST_MODE;
 
        /* re-enable HTW before enabling interrupts */
-       htw_start();
+       if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+               htw_start();
 
        /* Set a default exit reason */
        run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -1226,17 +1265,20 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                        cause, opc, run, vcpu);
        trace_kvm_exit(vcpu, exccode);
 
-       /*
-        * Do a privilege check, if in UM most of these exit conditions end up
-        * causing an exception to be delivered to the Guest Kernel
-        */
-       er = kvm_mips_check_privilege(cause, opc, run, vcpu);
-       if (er == EMULATE_PRIV_FAIL) {
-               goto skip_emul;
-       } else if (er == EMULATE_FAIL) {
-               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               ret = RESUME_HOST;
-               goto skip_emul;
+       if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+               /*
+                * Do a privilege check, if in UM most of these exit conditions
+                * end up causing an exception to be delivered to the Guest
+                * Kernel
+                */
+               er = kvm_mips_check_privilege(cause, opc, run, vcpu);
+               if (er == EMULATE_PRIV_FAIL) {
+                       goto skip_emul;
+               } else if (er == EMULATE_FAIL) {
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       ret = RESUME_HOST;
+                       goto skip_emul;
+               }
        }
 
        switch (exccode) {
@@ -1267,7 +1309,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                break;
 
        case EXCCODE_TLBS:
-               kvm_debug("TLB ST fault:  cause %#x, status %#lx, PC: %p, BadVaddr: %#lx\n",
+               kvm_debug("TLB ST fault:  cause %#x, status %#x, PC: %p, BadVaddr: %#lx\n",
                          cause, kvm_read_c0_guest_status(vcpu->arch.cop0), opc,
                          badvaddr);
 
@@ -1328,12 +1370,17 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
                ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
                break;
 
+       case EXCCODE_GE:
+               /* defer exit accounting to handler */
+               ret = kvm_mips_callbacks->handle_guest_exit(vcpu);
+               break;
+
        default:
                if (cause & CAUSEF_BD)
                        opc += 1;
                inst = 0;
                kvm_get_badinstr(opc, vcpu, &inst);
-               kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#lx\n",
+               kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x  BadVaddr: %#lx Status: %#x\n",
                        exccode, opc, inst, badvaddr,
                        kvm_read_c0_guest_status(vcpu->arch.cop0));
                kvm_arch_vcpu_dump_regs(vcpu);
@@ -1346,6 +1393,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
 skip_emul:
        local_irq_disable();
 
+       if (ret == RESUME_GUEST)
+               kvm_vz_acquire_htimer(vcpu);
+
        if (er == EMULATE_DONE && !(ret & RESUME_HOST))
                kvm_mips_deliver_interrupts(vcpu, cause);
 
@@ -1391,7 +1441,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
        }
 
        /* Disable HTW before returning to guest or host */
-       htw_stop();
+       if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ))
+               htw_stop();
 
        return ret;
 }
@@ -1527,16 +1578,18 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu)
 void kvm_lose_fpu(struct kvm_vcpu *vcpu)
 {
        /*
-        * FPU & MSA get disabled in root context (hardware) when it is disabled
-        * in guest context (software), but the register state in the hardware
-        * may still be in use. This is why we explicitly re-enable the hardware
-        * before saving.
+        * With T&E, FPU & MSA get disabled in root context (hardware) when it
+        * is disabled in guest context (software), but the register state in
+        * the hardware may still be in use.
+        * This is why we explicitly re-enable the hardware before saving.
         */
 
        preempt_disable();
        if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
-               set_c0_config5(MIPS_CONF5_MSAEN);
-               enable_fpu_hazard();
+               if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+                       set_c0_config5(MIPS_CONF5_MSAEN);
+                       enable_fpu_hazard();
+               }
 
                __kvm_save_msa(&vcpu->arch);
                trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
@@ -1549,8 +1602,10 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
                }
                vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
        } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
-               set_c0_status(ST0_CU1);
-               enable_fpu_hazard();
+               if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) {
+                       set_c0_status(ST0_CU1);
+                       enable_fpu_hazard();
+               }
 
                __kvm_save_fpu(&vcpu->arch);
                vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
index cb0faade311e125c582a43e7a1a671dd64069c1a..ee64db03279336db79ac5c98e7634074d47608ac 100644 (file)
@@ -992,6 +992,22 @@ static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
        return kvm_mips_gpa_pte_to_gva_unmapped(pte);
 }
 
+#ifdef CONFIG_KVM_MIPS_VZ
+int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr,
+                                     struct kvm_vcpu *vcpu,
+                                     bool write_fault)
+{
+       int ret;
+
+       ret = kvm_mips_map_page(vcpu, badvaddr, write_fault, NULL, NULL);
+       if (ret)
+               return ret;
+
+       /* Invalidate this entry in the TLB */
+       return kvm_vz_host_tlb_inv(vcpu, badvaddr);
+}
+#endif
+
 /* XXXKYMA: Must be called with interrupts disabled */
 int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
                                    struct kvm_vcpu *vcpu,
@@ -1225,6 +1241,10 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
 {
        int err;
 
+       if (WARN(IS_ENABLED(CONFIG_KVM_MIPS_VZ),
+                "Expect BadInstr/BadInstrP registers to be used with VZ\n"))
+               return -EINVAL;
+
 retry:
        kvm_trap_emul_gva_lockless_begin(vcpu);
        err = get_user(*out, opc);
index 2819eb793345abb41fd1a183edb67084f11acead..7c6336dd2638ce9c12c4ff8be566ff6acb856137 100644 (file)
 #define KVM_GUEST_PC_TLB    0
 #define KVM_GUEST_SP_TLB    1
 
+#ifdef CONFIG_KVM_MIPS_VZ
+unsigned long GUESTID_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_MASK);
+unsigned long GUESTID_FIRST_VERSION;
+EXPORT_SYMBOL_GPL(GUESTID_FIRST_VERSION);
+unsigned long GUESTID_VERSION_MASK;
+EXPORT_SYMBOL_GPL(GUESTID_VERSION_MASK);
+
+static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu)
+{
+       struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm;
+
+       if (cpu_has_guestid)
+               return 0;
+       else
+               return cpu_asid(smp_processor_id(), gpa_mm);
+}
+#endif
+
 static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
 {
        struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
@@ -166,6 +185,13 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
 
        local_irq_restore(flags);
 
+       /*
+        * We don't want to get reserved instruction exceptions for missing tlb
+        * entries.
+        */
+       if (cpu_has_vtag_icache)
+               flush_icache_all();
+
        if (user && idx_user >= 0)
                kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n",
                          __func__, (va & VPN2_MASK) |
@@ -179,6 +205,421 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
 }
 EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
 
+#ifdef CONFIG_KVM_MIPS_VZ
+
+/* GuestID management */
+
+/**
+ * clear_root_gid() - Set GuestCtl1.RID for normal root operation.
+ */
+static inline void clear_root_gid(void)
+{
+       if (cpu_has_guestid) {
+               clear_c0_guestctl1(MIPS_GCTL1_RID);
+               mtc0_tlbw_hazard();
+       }
+}
+
+/**
+ * set_root_gid_to_guest_gid() - Set GuestCtl1.RID to match GuestCtl1.ID.
+ *
+ * Sets the root GuestID to match the current guest GuestID, for TLB operation
+ * on the GPA->RPA mappings in the root TLB.
+ *
+ * The caller must be sure to disable HTW while the root GID is set, and
+ * possibly longer if TLB registers are modified.
+ */
+static inline void set_root_gid_to_guest_gid(void)
+{
+       unsigned int guestctl1;
+
+       if (cpu_has_guestid) {
+               back_to_back_c0_hazard();
+               guestctl1 = read_c0_guestctl1();
+               guestctl1 = (guestctl1 & ~MIPS_GCTL1_RID) |
+                       ((guestctl1 & MIPS_GCTL1_ID) >> MIPS_GCTL1_ID_SHIFT)
+                                                    << MIPS_GCTL1_RID_SHIFT;
+               write_c0_guestctl1(guestctl1);
+               mtc0_tlbw_hazard();
+       }
+}
+
+int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
+{
+       int idx;
+       unsigned long flags, old_entryhi;
+
+       local_irq_save(flags);
+       htw_stop();
+
+       /* Set root GuestID for root probe and write of guest TLB entry */
+       set_root_gid_to_guest_gid();
+
+       old_entryhi = read_c0_entryhi();
+
+       idx = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+                                    kvm_mips_get_root_asid(vcpu));
+
+       write_c0_entryhi(old_entryhi);
+       clear_root_gid();
+       mtc0_tlbw_hazard();
+
+       htw_start();
+       local_irq_restore(flags);
+
+       /*
+        * We don't want to get reserved instruction exceptions for missing tlb
+        * entries.
+        */
+       if (cpu_has_vtag_icache)
+               flush_icache_all();
+
+       if (idx > 0)
+               kvm_debug("%s: Invalidated root entryhi %#lx @ idx %d\n",
+                         __func__, (va & VPN2_MASK) |
+                                   kvm_mips_get_root_asid(vcpu), idx);
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vz_host_tlb_inv);
+
+/**
+ * kvm_vz_guest_tlb_lookup() - Lookup a guest VZ TLB mapping.
+ * @vcpu:      KVM VCPU pointer.
+ * @gva:       Guest virtual address in a TLB mapped guest segment.
+ * @gpa:       Pointer to output guest physical address it maps to.
+ *
+ * Converts a guest virtual address in a guest TLB mapped segment to a guest
+ * physical address, by probing the guest TLB.
+ *
+ * Returns:    0 if guest TLB mapping exists for @gva. *@gpa will have been
+ *             written.
+ *             -EFAULT if no guest TLB mapping exists for @gva. *@gpa may not
+ *             have been written.
+ */
+int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva,
+                           unsigned long *gpa)
+{
+       unsigned long o_entryhi, o_entrylo[2], o_pagemask;
+       unsigned int o_index;
+       unsigned long entrylo[2], pagemask, pagemaskbit, pa;
+       unsigned long flags;
+       int index;
+
+       /* Probe the guest TLB for a mapping */
+       local_irq_save(flags);
+       /* Set root GuestID for root probe of guest TLB entry */
+       htw_stop();
+       set_root_gid_to_guest_gid();
+
+       o_entryhi = read_gc0_entryhi();
+       o_index = read_gc0_index();
+
+       write_gc0_entryhi((o_entryhi & 0x3ff) | (gva & ~0xfffl));
+       mtc0_tlbw_hazard();
+       guest_tlb_probe();
+       tlb_probe_hazard();
+
+       index = read_gc0_index();
+       if (index < 0) {
+               /* No match, fail */
+               write_gc0_entryhi(o_entryhi);
+               write_gc0_index(o_index);
+
+               clear_root_gid();
+               htw_start();
+               local_irq_restore(flags);
+               return -EFAULT;
+       }
+
+       /* Match! read the TLB entry */
+       o_entrylo[0] = read_gc0_entrylo0();
+       o_entrylo[1] = read_gc0_entrylo1();
+       o_pagemask = read_gc0_pagemask();
+
+       mtc0_tlbr_hazard();
+       guest_tlb_read();
+       tlb_read_hazard();
+
+       entrylo[0] = read_gc0_entrylo0();
+       entrylo[1] = read_gc0_entrylo1();
+       pagemask = ~read_gc0_pagemask() & ~0x1fffl;
+
+       write_gc0_entryhi(o_entryhi);
+       write_gc0_index(o_index);
+       write_gc0_entrylo0(o_entrylo[0]);
+       write_gc0_entrylo1(o_entrylo[1]);
+       write_gc0_pagemask(o_pagemask);
+
+       clear_root_gid();
+       htw_start();
+       local_irq_restore(flags);
+
+       /* Select one of the EntryLo values and interpret the GPA */
+       pagemaskbit = (pagemask ^ (pagemask & (pagemask - 1))) >> 1;
+       pa = entrylo[!!(gva & pagemaskbit)];
+
+       /*
+        * TLB entry may have become invalid since TLB probe if physical FTLB
+        * entries are shared between threads (e.g. I6400).
+        */
+       if (!(pa & ENTRYLO_V))
+               return -EFAULT;
+
+       /*
+        * Note, this doesn't take guest MIPS32 XPA into account, where PFN is
+        * split with XI/RI in the middle.
+        */
+       pa = (pa << 6) & ~0xfffl;
+       pa |= gva & ~(pagemask | pagemaskbit);
+
+       *gpa = pa;
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vz_guest_tlb_lookup);
+
+/**
+ * kvm_vz_local_flush_roottlb_all_guests() - Flush all root TLB entries for
+ * guests.
+ *
+ * Invalidate all entries in root tlb which are GPA mappings.
+ */
+void kvm_vz_local_flush_roottlb_all_guests(void)
+{
+       unsigned long flags;
+       unsigned long old_entryhi, old_pagemask, old_guestctl1;
+       int entry;
+
+       if (WARN_ON(!cpu_has_guestid))
+               return;
+
+       local_irq_save(flags);
+       htw_stop();
+
+       /* TLBR may clobber EntryHi.ASID, PageMask, and GuestCtl1.RID */
+       old_entryhi = read_c0_entryhi();
+       old_pagemask = read_c0_pagemask();
+       old_guestctl1 = read_c0_guestctl1();
+
+       /*
+        * Invalidate guest entries in root TLB while leaving root entries
+        * intact when possible.
+        */
+       for (entry = 0; entry < current_cpu_data.tlbsize; entry++) {
+               write_c0_index(entry);
+               mtc0_tlbw_hazard();
+               tlb_read();
+               tlb_read_hazard();
+
+               /* Don't invalidate non-guest (RVA) mappings in the root TLB */
+               if (!(read_c0_guestctl1() & MIPS_GCTL1_RID))
+                       continue;
+
+               /* Make sure all entries differ. */
+               write_c0_entryhi(UNIQUE_ENTRYHI(entry));
+               write_c0_entrylo0(0);
+               write_c0_entrylo1(0);
+               write_c0_guestctl1(0);
+               mtc0_tlbw_hazard();
+               tlb_write_indexed();
+       }
+
+       write_c0_entryhi(old_entryhi);
+       write_c0_pagemask(old_pagemask);
+       write_c0_guestctl1(old_guestctl1);
+       tlbw_use_hazard();
+
+       htw_start();
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(kvm_vz_local_flush_roottlb_all_guests);
+
+/**
+ * kvm_vz_local_flush_guesttlb_all() - Flush all guest TLB entries.
+ *
+ * Invalidate all entries in guest tlb irrespective of guestid.
+ */
+void kvm_vz_local_flush_guesttlb_all(void)
+{
+       unsigned long flags;
+       unsigned long old_index;
+       unsigned long old_entryhi;
+       unsigned long old_entrylo[2];
+       unsigned long old_pagemask;
+       int entry;
+       u64 cvmmemctl2 = 0;
+
+       local_irq_save(flags);
+
+       /* Preserve all clobbered guest registers */
+       old_index = read_gc0_index();
+       old_entryhi = read_gc0_entryhi();
+       old_entrylo[0] = read_gc0_entrylo0();
+       old_entrylo[1] = read_gc0_entrylo1();
+       old_pagemask = read_gc0_pagemask();
+
+       switch (current_cpu_type()) {
+       case CPU_CAVIUM_OCTEON3:
+               /* Inhibit machine check due to multiple matching TLB entries */
+               cvmmemctl2 = read_c0_cvmmemctl2();
+               cvmmemctl2 |= CVMMEMCTL2_INHIBITTS;
+               write_c0_cvmmemctl2(cvmmemctl2);
+               break;
+       };
+
+       /* Invalidate guest entries in guest TLB */
+       write_gc0_entrylo0(0);
+       write_gc0_entrylo1(0);
+       write_gc0_pagemask(0);
+       for (entry = 0; entry < current_cpu_data.guest.tlbsize; entry++) {
+               /* Make sure all entries differ. */
+               write_gc0_index(entry);
+               write_gc0_entryhi(UNIQUE_GUEST_ENTRYHI(entry));
+               mtc0_tlbw_hazard();
+               guest_tlb_write_indexed();
+       }
+
+       if (cvmmemctl2) {
+               cvmmemctl2 &= ~CVMMEMCTL2_INHIBITTS;
+               write_c0_cvmmemctl2(cvmmemctl2);
+       };
+
+       write_gc0_index(old_index);
+       write_gc0_entryhi(old_entryhi);
+       write_gc0_entrylo0(old_entrylo[0]);
+       write_gc0_entrylo1(old_entrylo[1]);
+       write_gc0_pagemask(old_pagemask);
+       tlbw_use_hazard();
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(kvm_vz_local_flush_guesttlb_all);
+
+/**
+ * kvm_vz_save_guesttlb() - Save a range of guest TLB entries.
+ * @buf:       Buffer to write TLB entries into.
+ * @index:     Start index.
+ * @count:     Number of entries to save.
+ *
+ * Save a range of guest TLB entries. The caller must ensure interrupts are
+ * disabled. Entries which are invalid or tagged with another GuestID are
+ * saved as unique dummy entries so they can be safely re-loaded later.
+ */
+void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index,
+                         unsigned int count)
+{
+       unsigned int end = index + count;
+       unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask;
+       unsigned int guestctl1 = 0;
+       int old_index, i;
+
+       /* Save registers we're about to clobber */
+       old_index = read_gc0_index();
+       old_entryhi = read_gc0_entryhi();
+       old_entrylo0 = read_gc0_entrylo0();
+       old_entrylo1 = read_gc0_entrylo1();
+       old_pagemask = read_gc0_pagemask();
+
+       /* Set root GuestID for root probe */
+       htw_stop();
+       set_root_gid_to_guest_gid();
+       /* Snapshot GuestCtl1 so a changed RID field can be detected below */
+       if (cpu_has_guestid)
+               guestctl1 = read_c0_guestctl1();
+
+       /* Read each entry from guest TLB */
+       for (i = index; i < end; ++i, ++buf) {
+               write_gc0_index(i);
+
+               mtc0_tlbr_hazard();
+               guest_tlb_read();
+               tlb_read_hazard();
+
+               /* A changed GuestCtl1.RID means the entry isn't this guest's */
+               if (cpu_has_guestid &&
+                   (read_c0_guestctl1() ^ guestctl1) & MIPS_GCTL1_RID) {
+                       /* Entry invalid or belongs to another guest */
+                       buf->tlb_hi = UNIQUE_GUEST_ENTRYHI(i);
+                       buf->tlb_lo[0] = 0;
+                       buf->tlb_lo[1] = 0;
+                       buf->tlb_mask = 0;
+               } else {
+                       /* Entry belongs to the right guest */
+                       buf->tlb_hi = read_gc0_entryhi();
+                       buf->tlb_lo[0] = read_gc0_entrylo0();
+                       buf->tlb_lo[1] = read_gc0_entrylo1();
+                       buf->tlb_mask = read_gc0_pagemask();
+               }
+       }
+
+       /* Clear root GuestID again */
+       clear_root_gid();
+       htw_start();
+
+       /* Restore clobbered registers */
+       write_gc0_index(old_index);
+       write_gc0_entryhi(old_entryhi);
+       write_gc0_entrylo0(old_entrylo0);
+       write_gc0_entrylo1(old_entrylo1);
+       write_gc0_pagemask(old_pagemask);
+
+       tlbw_use_hazard();
+}
+EXPORT_SYMBOL_GPL(kvm_vz_save_guesttlb);
+
+/**
+ * kvm_vz_load_guesttlb() - Load a range of guest TLB entries.
+ * @buf:       Buffer to read TLB entries from.
+ * @index:     Start index.
+ * @count:     Number of entries to load.
+ *
+ * Load a range of guest TLB entries. The caller must ensure interrupts are
+ * disabled.
+ */
+void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index,
+                         unsigned int count)
+{
+       unsigned int end = index + count;
+       unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask;
+       int old_index, i;
+
+       /* Save registers we're about to clobber */
+       old_index = read_gc0_index();
+       old_entryhi = read_gc0_entryhi();
+       old_entrylo0 = read_gc0_entrylo0();
+       old_entrylo1 = read_gc0_entrylo1();
+       old_pagemask = read_gc0_pagemask();
+
+       /* Set root GuestID for root probe */
+       htw_stop();
+       set_root_gid_to_guest_gid();
+
+       /* Write each entry to guest TLB */
+       for (i = index; i < end; ++i, ++buf) {
+               write_gc0_index(i);
+               write_gc0_entryhi(buf->tlb_hi);
+               write_gc0_entrylo0(buf->tlb_lo[0]);
+               write_gc0_entrylo1(buf->tlb_lo[1]);
+               write_gc0_pagemask(buf->tlb_mask);
+
+               mtc0_tlbw_hazard();
+               guest_tlb_write_indexed();
+       }
+
+       /* Clear root GuestID again */
+       clear_root_gid();
+       htw_start();
+
+       /* Restore clobbered registers */
+       write_gc0_index(old_index);
+       write_gc0_entryhi(old_entryhi);
+       write_gc0_entrylo0(old_entrylo0);
+       write_gc0_entrylo1(old_entrylo1);
+       write_gc0_pagemask(old_pagemask);
+
+       tlbw_use_hazard();
+}
+EXPORT_SYMBOL_GPL(kvm_vz_load_guesttlb);
+
+#endif
+
 /**
  * kvm_mips_suspend_mm() - Suspend the active mm.
  * @cpu                The CPU we're running on.
index c858cf168078401931d762542928857414d8b200..a8c7fd7bf6d267bccfbba172f58ae300826330c1 100644 (file)
 #define TRACE_INCLUDE_PATH .
 #define TRACE_INCLUDE_FILE trace
 
+/*
+ * arch/mips/kvm/mips.c
+ */
+extern bool kvm_trace_guest_mode_change;
+int kvm_guest_mode_change_trace_reg(void);
+void kvm_guest_mode_change_trace_unreg(void);
+
 /*
  * Tracepoints for VM enters
  */
@@ -62,10 +69,20 @@ DEFINE_EVENT(kvm_transition, kvm_out,
 #define KVM_TRACE_EXIT_MSA_FPE         14
 #define KVM_TRACE_EXIT_FPE             15
 #define KVM_TRACE_EXIT_MSA_DISABLED    21
+#define KVM_TRACE_EXIT_GUEST_EXIT      27
 /* Further exit reasons */
 #define KVM_TRACE_EXIT_WAIT            32
 #define KVM_TRACE_EXIT_CACHE           33
 #define KVM_TRACE_EXIT_SIGNAL          34
+/* 32 exit reasons correspond to GuestCtl0.GExcCode (VZ) */
+#define KVM_TRACE_EXIT_GEXCCODE_BASE   64
+#define KVM_TRACE_EXIT_GPSI            64      /*  0 */
+#define KVM_TRACE_EXIT_GSFC            65      /*  1 */
+#define KVM_TRACE_EXIT_HC              66      /*  2 */
+#define KVM_TRACE_EXIT_GRR             67      /*  3 */
+#define KVM_TRACE_EXIT_GVA             72      /*  8 */
+#define KVM_TRACE_EXIT_GHFC            73      /*  9 */
+#define KVM_TRACE_EXIT_GPA             74      /* 10 */
 
 /* Tracepoints for VM exits */
 #define kvm_trace_symbol_exit_types                            \
@@ -83,9 +100,17 @@ DEFINE_EVENT(kvm_transition, kvm_out,
        { KVM_TRACE_EXIT_MSA_FPE,       "MSA FPE" },            \
        { KVM_TRACE_EXIT_FPE,           "FPE" },                \
        { KVM_TRACE_EXIT_MSA_DISABLED,  "MSA Disabled" },       \
+       { KVM_TRACE_EXIT_GUEST_EXIT,    "Guest Exit" },         \
        { KVM_TRACE_EXIT_WAIT,          "WAIT" },               \
        { KVM_TRACE_EXIT_CACHE,         "CACHE" },              \
-       { KVM_TRACE_EXIT_SIGNAL,        "Signal" }
+       { KVM_TRACE_EXIT_SIGNAL,        "Signal" },             \
+       { KVM_TRACE_EXIT_GPSI,          "GPSI" },               \
+       { KVM_TRACE_EXIT_GSFC,          "GSFC" },               \
+       { KVM_TRACE_EXIT_HC,            "HC" },                 \
+       { KVM_TRACE_EXIT_GRR,           "GRR" },                \
+       { KVM_TRACE_EXIT_GVA,           "GVA" },                \
+       { KVM_TRACE_EXIT_GHFC,          "GHFC" },               \
+       { KVM_TRACE_EXIT_GPA,           "GPA" }
 
 TRACE_EVENT(kvm_exit,
            TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -158,6 +183,8 @@ TRACE_EVENT(kvm_exit,
        { KVM_TRACE_COP0(16, 4),        "Config4" },            \
        { KVM_TRACE_COP0(16, 5),        "Config5" },            \
        { KVM_TRACE_COP0(16, 7),        "Config7" },            \
+       { KVM_TRACE_COP0(17, 1),        "MAAR" },               \
+       { KVM_TRACE_COP0(17, 2),        "MAARI" },              \
        { KVM_TRACE_COP0(26, 0),        "ECC" },                \
        { KVM_TRACE_COP0(30, 0),        "ErrorEPC" },           \
        { KVM_TRACE_COP0(31, 2),        "KScratch1" },          \
@@ -268,6 +295,51 @@ TRACE_EVENT(kvm_asid_change,
                      __entry->new_asid)
 );
 
+TRACE_EVENT(kvm_guestid_change,
+           TP_PROTO(struct kvm_vcpu *vcpu, unsigned int guestid),
+           TP_ARGS(vcpu, guestid),
+           TP_STRUCT__entry(
+                       __field(unsigned int, guestid)
+           ),
+
+           TP_fast_assign(
+                       __entry->guestid = guestid;
+           ),
+
+           TP_printk("GuestID: 0x%02x",
+                     __entry->guestid)
+);
+
+TRACE_EVENT_FN(kvm_guest_mode_change,
+           TP_PROTO(struct kvm_vcpu *vcpu),
+           TP_ARGS(vcpu),
+           TP_STRUCT__entry(
+                       __field(unsigned long, epc)
+                       __field(unsigned long, pc)
+                       __field(unsigned long, badvaddr)
+                       __field(unsigned int, status)
+                       __field(unsigned int, cause)
+           ),
+
+           TP_fast_assign(
+                       __entry->epc = kvm_read_c0_guest_epc(vcpu->arch.cop0);
+                       __entry->pc = vcpu->arch.pc;
+                       __entry->badvaddr = kvm_read_c0_guest_badvaddr(vcpu->arch.cop0);
+                       __entry->status = kvm_read_c0_guest_status(vcpu->arch.cop0);
+                       __entry->cause = kvm_read_c0_guest_cause(vcpu->arch.cop0);
+           ),
+
+           TP_printk("EPC: 0x%08lx PC: 0x%08lx Status: 0x%08x Cause: 0x%08x BadVAddr: 0x%08lx",
+                     __entry->epc,
+                     __entry->pc,
+                     __entry->status,
+                     __entry->cause,
+                     __entry->badvaddr),
+
+           kvm_guest_mode_change_trace_reg,
+           kvm_guest_mode_change_trace_unreg
+);
+
 #endif /* _TRACE_KVM_H */
 
 /* This part must be outside protection */
index b1fa53b252eab2e94a93ef24c6a9183abff0d37e..a563759fd142c6b18ca2fd3b06a8a7703d8714d3 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
+#include <linux/log2.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <asm/mmu_context.h>
@@ -40,6 +41,29 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
        return gpa;
 }
 
+/*
+ * kvm_trap_emul_no_handler() - Fallback for exit causes with no T&E handler.
+ *
+ * Dumps diagnostics (exception code, faulting instruction, BadVAddr, guest
+ * Status) and fails the run with KVM_EXIT_INTERNAL_ERROR so userspace can
+ * see that the exit was unhandled.
+ */
+static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu)
+{
+       u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
+       unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
+       u32 inst = 0;
+
+       /*
+        *  Fetch the instruction. In a branch delay slot the faulting
+        *  instruction is the one following the branch at PC.
+        */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       kvm_get_badinstr(opc, vcpu, &inst);
+
+       kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n",
+               exccode, opc, inst, badvaddr,
+               kvm_read_c0_guest_status(vcpu->arch.cop0));
+       kvm_arch_vcpu_dump_regs(vcpu);
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       return RESUME_HOST;
+}
+
 static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
 {
        struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -82,6 +106,10 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
                ret = RESUME_HOST;
                break;
 
+       case EMULATE_HYPERCALL:
+               ret = kvm_mips_handle_hypcall(vcpu);
+               break;
+
        default:
                BUG();
        }
@@ -484,6 +512,31 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
        return ret;
 }
 
+/* Trap & Emulate needs no per-CPU hardware virtualization setup */
+static int kvm_trap_emul_hardware_enable(void)
+{
+       return 0;
+}
+
+static void kvm_trap_emul_hardware_disable(void)
+{
+}
+
+/* Report the capabilities specific to the trap & emulate implementation */
+static int kvm_trap_emul_check_extension(struct kvm *kvm, long ext)
+{
+       int r;
+
+       switch (ext) {
+       case KVM_CAP_MIPS_TE:
+               r = 1;
+               break;
+       default:
+               r = 0;
+               break;
+       }
+
+       return r;
+}
+
 static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
 {
        struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
@@ -561,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        u32 config, config1;
        int vcpu_id = vcpu->vcpu_id;
 
+       /* Start off the timer at 100 MHz */
+       kvm_mips_init_count(vcpu, 100*1000*1000);
+
        /*
         * Arch specific stuff, set up config registers properly so that the
         * guest will come up as expected
@@ -589,6 +645,13 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
        /* Read the cache characteristics from the host Config1 Register */
        config1 = (read_c0_config1() & ~0x7f);
 
+       /* DCache line size not correctly reported in Config1 on Octeon CPUs */
+       if (cpu_dcache_line_size()) {
+               config1 &= ~MIPS_CONF1_DL;
+               config1 |= ((ilog2(cpu_dcache_line_size()) - 1) <<
+                           MIPS_CONF1_DL_SHF) & MIPS_CONF1_DL;
+       }
+
        /* Set up MMU size */
        config1 &= ~(0x3f << 25);
        config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
@@ -892,10 +955,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
                        if (v & CAUSEF_DC) {
                                /* disable timer first */
                                kvm_mips_count_disable_cause(vcpu);
-                               kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+                               kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC,
+                                                         v);
                        } else {
                                /* enable timer last */
-                               kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v);
+                               kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC,
+                                                         v);
                                kvm_mips_count_enable_cause(vcpu);
                        }
                } else {
@@ -1230,7 +1295,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
        .handle_msa_fpe = kvm_trap_emul_handle_msa_fpe,
        .handle_fpe = kvm_trap_emul_handle_fpe,
        .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
+       .handle_guest_exit = kvm_trap_emul_no_handler,
 
+       .hardware_enable = kvm_trap_emul_hardware_enable,
+       .hardware_disable = kvm_trap_emul_hardware_disable,
+       .check_extension = kvm_trap_emul_check_extension,
        .vcpu_init = kvm_trap_emul_vcpu_init,
        .vcpu_uninit = kvm_trap_emul_vcpu_uninit,
        .vcpu_setup = kvm_trap_emul_vcpu_setup,
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
new file mode 100644 (file)
index 0000000..71d8856
--- /dev/null
@@ -0,0 +1,3223 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS: Support for hardware virtualization extensions
+ *
+ * Copyright (C) 2012  MIPS Technologies, Inc.  All rights reserved.
+ * Authors: Yann Le Du <ledu@kymasys.com>
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/vmalloc.h>
+#include <asm/cacheflush.h>
+#include <asm/cacheops.h>
+#include <asm/cmpxchg.h>
+#include <asm/fpu.h>
+#include <asm/hazards.h>
+#include <asm/inst.h>
+#include <asm/mmu_context.h>
+#include <asm/r4kcache.h>
+#include <asm/time.h>
+#include <asm/tlb.h>
+#include <asm/tlbex.h>
+
+#include <linux/kvm_host.h>
+
+#include "interrupt.h"
+
+#include "trace.h"
+
+/* Pointers to last VCPU loaded on each physical CPU */
+static struct kvm_vcpu *last_vcpu[NR_CPUS];
+/* Pointers to last VCPU executed on each physical CPU */
+static struct kvm_vcpu *last_exec_vcpu[NR_CPUS];
+
+/*
+ * Number of guest VTLB entries to use, so we can catch inconsistency between
+ * CPUs.
+ */
+static unsigned int kvm_vz_guest_vtlb_size;
+
+/* Read Guest.EBase, using the 64-bit accessor when EBase.WG is available */
+static inline long kvm_vz_read_gc0_ebase(void)
+{
+       if (sizeof(long) == 8 && cpu_has_ebase_wg)
+               return read_gc0_ebase_64();
+       else
+               return read_gc0_ebase();
+}
+
+/* Write Guest.EBase, preserving upper bits where WG write-gating allows */
+static inline void kvm_vz_write_gc0_ebase(long v)
+{
+       /*
+        * First write with WG=1 to write upper bits, then write again in case
+        * WG should be left at 0.
+        * write_gc0_ebase_64() is no longer UNDEFINED since R6.
+        */
+       if (sizeof(long) == 8 &&
+           (cpu_has_mips64r6 || cpu_has_ebase_wg)) {
+               write_gc0_ebase_64(v | MIPS_EBASE_WG);
+               write_gc0_ebase_64(v);
+       } else {
+               write_gc0_ebase(v | MIPS_EBASE_WG);
+               write_gc0_ebase(v);
+       }
+}
+
+/*
+ * These Config bits may be writable by the guest:
+ * Config:     [K23, KU] (!TLB), K0
+ * Config1:    (none)
+ * Config2:    [TU, SU] (impl)
+ * Config3:    ISAOnExc
+ * Config4:    FTLBPageSize
+ * Config5:    K, CV, MSAEn, UFE, FRE, SBRI, UFR
+ */
+
+/* Guest.Config: only the K0 cacheability field is guest-writable */
+static inline unsigned int kvm_vz_config_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       return CONF_CM_CMASK;
+}
+
+/* Guest.Config1: entirely read-only to the guest */
+static inline unsigned int kvm_vz_config1_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+/* Guest.Config2: no implementation-dependent bits exposed as writable */
+static inline unsigned int kvm_vz_config2_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       return 0;
+}
+
+/* Guest.Config3: only ISAOnExc may be changed by the guest */
+static inline unsigned int kvm_vz_config3_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       return MIPS_CONF3_ISA_OE;
+}
+
+/* Guest.Config4: the guest may tweak the FTLB page size field */
+static inline unsigned int kvm_vz_config4_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       /* no need to be exact */
+       return MIPS_CONF4_VFTLBPAGESIZE;
+}
+
+/* Guest.Config5: base bits plus MSA/FPU mode bits when those units exist */
+static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu)
+{
+       unsigned int mask = MIPS_CONF5_K | MIPS_CONF5_CV | MIPS_CONF5_SBRI;
+
+       /* Permit MSAEn changes if MSA supported and enabled */
+       if (kvm_mips_guest_has_msa(&vcpu->arch))
+               mask |= MIPS_CONF5_MSAEN;
+
+       /*
+        * Permit guest FPU mode changes if FPU is enabled and the relevant
+        * feature exists according to FIR register.
+        */
+       if (kvm_mips_guest_has_fpu(&vcpu->arch)) {
+               if (cpu_has_ufr)
+                       mask |= MIPS_CONF5_UFR;
+               if (cpu_has_fre)
+                       mask |= MIPS_CONF5_FRE | MIPS_CONF5_UFE;
+       }
+
+       return mask;
+}
+
+/*
+ * VZ optionally allows these additional Config bits to be written by root:
+ * Config:     M, [MT]
+ * Config1:    M, [MMUSize-1, C2, MD, PC, WR, CA], FP
+ * Config2:    M
+ * Config3:    M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC,
+ *             VInt, SP, CDMM, MT, SM, TL]
+ * Config4:    M, [VTLBSizeExt, MMUSizeExt]
+ * Config5:    MRP
+ */
+
+static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       return kvm_vz_config_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config1_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       unsigned int mask = kvm_vz_config1_guest_wrmask(vcpu) | MIPS_CONF_M;
+
+       /* Permit FPU to be present if FPU is supported */
+       if (kvm_mips_guest_can_have_fpu(&vcpu->arch))
+               mask |= MIPS_CONF1_FP;
+
+       return mask;
+}
+
+static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       return kvm_vz_config2_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M |
+               MIPS_CONF3_ULRI | MIPS_CONF3_CTXTC;
+
+       /* Permit MSA to be present if MSA is supported */
+       if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+               mask |= MIPS_CONF3_MSA;
+
+       return mask;
+}
+
+static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       return kvm_vz_config4_guest_wrmask(vcpu) | MIPS_CONF_M;
+}
+
+static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu)
+{
+       return kvm_vz_config5_guest_wrmask(vcpu) | MIPS_CONF5_MRP;
+}
+
+static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva)
+{
+       /* VZ guest has already converted gva to gpa */
+       return gva;
+}
+
+/*
+ * Mark an exception priority pending for delivery; also cancel any
+ * outstanding request to clear the same priority.
+ */
+static void kvm_vz_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+       set_bit(priority, &vcpu->arch.pending_exceptions);
+       clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+/*
+ * Mark an exception priority for clearing; also cancel any outstanding
+ * request to deliver the same priority.
+ */
+static void kvm_vz_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
+{
+       clear_bit(priority, &vcpu->arch.pending_exceptions);
+       set_bit(priority, &vcpu->arch.pending_exceptions_clr);
+}
+
+static void kvm_vz_queue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+       /*
+        * timer expiry is asynchronous to vcpu execution therefore defer guest
+        * cp0 accesses
+        */
+       kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+static void kvm_vz_dequeue_timer_int_cb(struct kvm_vcpu *vcpu)
+{
+       /*
+        * timer expiry is asynchronous to vcpu execution therefore defer guest
+        * cp0 accesses
+        */
+       kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+/*
+ * Queue an external interrupt for the guest. irq numbers map to priorities:
+ * 2 = I/O, 3 = IPI 1, 4 = IPI 2; anything else is silently ignored.
+ */
+static void kvm_vz_queue_io_int_cb(struct kvm_vcpu *vcpu,
+                                  struct kvm_mips_interrupt *irq)
+{
+       int intr = (int)irq->irq;
+
+       /*
+        * interrupts are asynchronous to vcpu execution therefore defer guest
+        * cp0 accesses
+        */
+       switch (intr) {
+       case 2:
+               kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IO);
+               break;
+
+       case 3:
+               kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+               break;
+
+       case 4:
+               kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+               break;
+
+       default:
+               break;
+       }
+
+}
+
+/*
+ * Dequeue an external interrupt. Negated irq numbers mirror the queue
+ * mapping: -2 = I/O, -3 = IPI 1, -4 = IPI 2.
+ */
+static void kvm_vz_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
+                                    struct kvm_mips_interrupt *irq)
+{
+       int intr = (int)irq->irq;
+
+       /*
+        * interrupts are asynchronous to vcpu execution therefore defer guest
+        * cp0 accesses
+        */
+       switch (intr) {
+       case -2:
+               kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IO);
+               break;
+
+       case -3:
+               kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_1);
+               break;
+
+       case -4:
+               kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_2);
+               break;
+
+       default:
+               break;
+       }
+
+}
+
+/* Map each exception priority to the Cause.IP bit used to assert it */
+static u32 kvm_vz_priority_to_irq[MIPS_EXC_MAX] = {
+       [MIPS_EXC_INT_TIMER] = C_IRQ5,
+       [MIPS_EXC_INT_IO]    = C_IRQ0,
+       [MIPS_EXC_INT_IPI_1] = C_IRQ1,
+       [MIPS_EXC_INT_IPI_2] = C_IRQ2,
+};
+
+/*
+ * Deliver a queued interrupt to the guest: the timer asserts Cause.TI,
+ * while I/O and IPIs go via GuestCtl2 virtual interrupts when available,
+ * falling back to setting the guest Cause.IP bit directly.
+ */
+static int kvm_vz_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+                                u32 cause)
+{
+       u32 irq = (priority < MIPS_EXC_MAX) ?
+               kvm_vz_priority_to_irq[priority] : 0;
+
+       switch (priority) {
+       case MIPS_EXC_INT_TIMER:
+               set_gc0_cause(C_TI);
+               break;
+
+       case MIPS_EXC_INT_IO:
+       case MIPS_EXC_INT_IPI_1:
+       case MIPS_EXC_INT_IPI_2:
+               if (cpu_has_guestctl2)
+                       set_c0_guestctl2(irq);
+               else
+                       set_gc0_cause(irq);
+               break;
+
+       default:
+               break;
+       }
+
+       clear_bit(priority, &vcpu->arch.pending_exceptions);
+       return 1;
+}
+
+/*
+ * Clear a previously asserted guest interrupt. All priorities share the
+ * same mechanics, so the timer case deliberately falls through to the
+ * common GuestCtl2/Cause clearing code.
+ */
+static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
+                              u32 cause)
+{
+       u32 irq = (priority < MIPS_EXC_MAX) ?
+               kvm_vz_priority_to_irq[priority] : 0;
+
+       switch (priority) {
+       case MIPS_EXC_INT_TIMER:
+               /*
+                * Call to kvm_write_c0_guest_compare() clears Cause.TI in
+                * kvm_mips_emulate_CP0(). Explicitly clear irq associated with
+                * Cause.IP[IPTI] if GuestCtl2 virtual interrupt register not
+                * supported or if not using GuestCtl2 Hardware Clear.
+                */
+               /* fall through */
+       case MIPS_EXC_INT_IO:
+       case MIPS_EXC_INT_IPI_1:
+       case MIPS_EXC_INT_IPI_2:
+               /*
+                * Clear GuestCtl2.VIP irq if not using Hardware Clear; the
+                * (irq << 14) test checks the corresponding GuestCtl2.HC bit.
+                */
+               if (cpu_has_guestctl2) {
+                       if (!(read_c0_guestctl2() & (irq << 14)))
+                               clear_c0_guestctl2(irq);
+               } else {
+                       clear_gc0_cause(irq);
+               }
+               break;
+
+       default:
+               break;
+       }
+
+       clear_bit(priority, &vcpu->arch.pending_exceptions_clr);
+       return 1;
+}
+
+/*
+ * VZ guest timer handling.
+ */
+
+/**
+ * kvm_vz_should_use_htimer() - Find whether to use the VZ hard guest timer.
+ * @vcpu:      Virtual CPU.
+ *
+ * Returns:    true if the VZ GTOffset & real guest CP0_Count should be used
+ *             instead of software emulation of guest timer.
+ *             false otherwise.
+ */
+static bool kvm_vz_should_use_htimer(struct kvm_vcpu *vcpu)
+{
+       if (kvm_mips_count_disabled(vcpu))
+               return false;
+
+       /* Chosen frequency must match real frequency */
+       if (mips_hpt_frequency != vcpu->arch.count_hz)
+               return false;
+
+       /* We don't support a CP0_GTOffset with fewer bits than CP0_Count */
+       if (current_cpu_data.gtoffset_mask != 0xffffffff)
+               return false;
+
+       return true;
+}
+
+/**
+ * _kvm_vz_restore_stimer() - Restore soft timer state.
+ * @vcpu:      Virtual CPU.
+ * @compare:   CP0_Compare register value, restored by caller.
+ * @cause:     CP0_Cause register to restore.
+ *
+ * Restore VZ state relating to the soft timer. The hard timer can be enabled
+ * later.
+ */
+static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare,
+                                  u32 cause)
+{
+       /*
+        * Avoid spurious counter interrupts by setting Guest CP0_Count to just
+        * after Guest CP0_Compare.
+        */
+       write_c0_gtoffset(compare - read_c0_count());
+
+       back_to_back_c0_hazard();
+       write_gc0_cause(cause);
+}
+
+/**
+ * _kvm_vz_restore_htimer() - Restore hard timer state.
+ * @vcpu:      Virtual CPU.
+ * @compare:   CP0_Compare register value, restored by caller.
+ * @cause:     CP0_Cause register to restore.
+ *
+ * Restore hard timer Guest.Count & Guest.Cause taking care to preserve the
+ * value of Guest.CP0_Cause.TI while restoring Guest.CP0_Cause.
+ */
+static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu,
+                                  u32 compare, u32 cause)
+{
+       u32 start_count, after_count;
+       ktime_t freeze_time;
+       unsigned long flags;
+
+       /*
+        * Freeze the soft-timer and sync the guest CP0_Count with it. We do
+        * this with interrupts disabled to avoid latency.
+        */
+       local_irq_save(flags);
+       freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count);
+       write_c0_gtoffset(start_count - read_c0_count());
+       local_irq_restore(flags);
+
+       /* restore guest CP0_Cause, as TI may already be set */
+       back_to_back_c0_hazard();
+       write_gc0_cause(cause);
+
+       /*
+        * The above sequence isn't atomic and would result in lost timer
+        * interrupts if we're not careful. Detect if a timer interrupt is due
+        * and assert it.
+        */
+       back_to_back_c0_hazard();
+       after_count = read_gc0_count();
+       if (after_count - start_count > compare - start_count - 1)
+               kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+}
+
+/**
+ * kvm_vz_restore_timer() - Restore timer state.
+ * @vcpu:      Virtual CPU.
+ *
+ * Restore soft timer state from saved context.
+ */
+static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       u32 cause, compare;
+
+       compare = kvm_read_sw_gc0_compare(cop0);
+       cause = kvm_read_sw_gc0_cause(cop0);
+
+       write_gc0_compare(compare);
+       _kvm_vz_restore_stimer(vcpu, compare, cause);
+}
+
+/**
+ * kvm_vz_acquire_htimer() - Switch to hard timer state.
+ * @vcpu:      Virtual CPU.
+ *
+ * Restore hard timer state on top of existing soft timer state if possible.
+ *
+ * Since hard timer won't remain active over preemption, preemption should be
+ * disabled by the caller.
+ */
+void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu)
+{
+       u32 gctl0;
+
+       gctl0 = read_c0_guestctl0();
+       if (!(gctl0 & MIPS_GCTL0_GT) && kvm_vz_should_use_htimer(vcpu)) {
+               /* enable guest access to hard timer */
+               write_c0_guestctl0(gctl0 | MIPS_GCTL0_GT);
+
+               _kvm_vz_restore_htimer(vcpu, read_gc0_compare(),
+                                      read_gc0_cause());
+       }
+}
+
+/**
+ * _kvm_vz_save_htimer() - Switch to software emulation of guest timer.
+ * @vcpu:      Virtual CPU.
+ * @compare:   Pointer to write compare value to.
+ * @cause:     Pointer to write cause value to.
+ *
+ * Save VZ guest timer state and switch to software emulation of guest CP0
+ * timer. The hard timer must already be in use, so preemption should be
+ * disabled.
+ */
+static void _kvm_vz_save_htimer(struct kvm_vcpu *vcpu,
+                               u32 *out_compare, u32 *out_cause)
+{
+       u32 cause, compare, before_count, end_count;
+       ktime_t before_time;
+
+       compare = read_gc0_compare();
+       *out_compare = compare;
+
+       before_time = ktime_get();
+
+       /*
+        * Record the CP0_Count *prior* to saving CP0_Cause, so we have a time
+        * at which no pending timer interrupt is missing.
+        */
+       before_count = read_gc0_count();
+       back_to_back_c0_hazard();
+       cause = read_gc0_cause();
+       *out_cause = cause;
+
+       /*
+        * Record a final CP0_Count which we will transfer to the soft-timer.
+        * This is recorded *after* saving CP0_Cause, so we don't get any timer
+        * interrupts from just after the final CP0_Count point.
+        */
+       back_to_back_c0_hazard();
+       end_count = read_gc0_count();
+
+       /*
+        * The above sequence isn't atomic, so we could miss a timer interrupt
+        * between reading CP0_Cause and end_count. Detect and record any timer
+        * interrupt due between before_count and end_count.
+        */
+       if (end_count - before_count > compare - before_count - 1)
+               kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER);
+
+       /*
+        * Restore soft-timer, ignoring a small amount of negative drift due to
+        * delay between freeze_hrtimer and setting CP0_GTOffset.
+        */
+       kvm_mips_restore_hrtimer(vcpu, before_time, end_count, -0x10000);
+}
+
+/**
+ * kvm_vz_save_timer() - Save guest timer state.
+ * @vcpu:      Virtual CPU.
+ *
+ * Save VZ guest timer state and switch to soft guest timer if hard timer was in
+ * use.
+ */
+static void kvm_vz_save_timer(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       u32 gctl0, compare, cause;
+
+       gctl0 = read_c0_guestctl0();
+       if (gctl0 & MIPS_GCTL0_GT) {
+               /* disable guest use of hard timer */
+               write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT);
+
+               /* save hard timer state */
+               _kvm_vz_save_htimer(vcpu, &compare, &cause);
+       } else {
+               compare = read_gc0_compare();
+               cause = read_gc0_cause();
+       }
+
+       /* save timer-related state to VCPU context */
+       kvm_write_sw_gc0_cause(cop0, cause);
+       kvm_write_sw_gc0_compare(cop0, compare);
+}
+
+/**
+ * kvm_vz_lose_htimer() - Ensure hard guest timer is not in use.
+ * @vcpu:      Virtual CPU.
+ *
+ * Transfers the state of the hard guest timer to the soft guest timer, leaving
+ * guest state intact so it can continue to be used with the soft timer.
+ */
+void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu)
+{
+       u32 gctl0, compare, cause;
+
+       preempt_disable();
+       gctl0 = read_c0_guestctl0();
+       if (gctl0 & MIPS_GCTL0_GT) {
+               /* disable guest use of timer */
+               write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT);
+
+               /* switch to soft timer */
+               _kvm_vz_save_htimer(vcpu, &compare, &cause);
+
+               /* leave soft timer in usable state */
+               _kvm_vz_restore_stimer(vcpu, compare, cause);
+       }
+       preempt_enable();
+}
+
+/**
+ * is_eva_access() - Find whether an instruction is an EVA memory accessor.
+ * @inst:      32-bit instruction encoding.
+ *
+ * Check whether @inst is one of the Enhanced Virtual Addressing (EVA)
+ * load/store/cache/prefetch instructions. Emulation of those must target the
+ * user address space rather than the kernel one, which matters for MUSUK
+ * segments (TLB mapped for user mode, unmapped for kernel mode).
+ *
+ * Returns:    Whether @inst encodes an EVA accessor instruction.
+ */
+static bool is_eva_access(union mips_instruction inst)
+{
+       /* All EVA accessors live in the SPECIAL3 opcode space */
+       if (inst.spec3_format.opcode != spec3_op)
+               return false;
+
+       switch (inst.spec3_format.func) {
+       /* loads */
+       case lbe_op:
+       case lbue_op:
+       case lhe_op:
+       case lhue_op:
+       case lwe_op:
+       case lwle_op:
+       case lwre_op:
+       case lle_op:
+       /* stores */
+       case sbe_op:
+       case she_op:
+       case swe_op:
+       case swle_op:
+       case swre_op:
+       case sce_op:
+       /* cache maintenance / prefetch */
+       case cachee_op:
+       case prefe_op:
+               return true;
+       default:
+               return false;
+       }
+}
+
+/**
+ * is_eva_am_mapped() - Find whether an access mode is mapped.
+ * @vcpu:      KVM VCPU state.
+ * @am:                3-bit encoded access mode.
+ * @eu:                Segment becomes unmapped and uncached when Status.ERL=1.
+ *
+ * Decode @am to find whether it encodes a mapped segment for the current VCPU
+ * state. Where necessary @eu and the actual instruction causing the fault are
+ * taken into account to make the decision.
+ *
+ * Returns:    Whether the VCPU faulted on a TLB mapped address.
+ */
+static bool is_eva_am_mapped(struct kvm_vcpu *vcpu, unsigned int am, bool eu)
+{
+       u32 am_lookup;
+       int err;
+
+       /*
+        * Interpret access control mode. We assume address errors will already
+        * have been caught by the guest, leaving us with:
+        *      AM      UM  SM  KM  31..24 23..16
+        * UK    0 000          Unm   0      0
+        * MK    1 001          TLB   1
+        * MSK   2 010      TLB TLB   1
+        * MUSK  3 011  TLB TLB TLB   1
+        * MUSUK 4 100  TLB TLB Unm   0      1
+        * USK   5 101      Unm Unm   0      0
+        * -     6 110                0      0
+        * UUSK  7 111  Unm Unm Unm   0      0
+        *
+        * We shift a magic value by AM across the sign bit to find if always
+        * TLB mapped, and if not shift by 8 again to find if it depends on KM.
+        */
+       am_lookup = 0x70080000 << am;
+       if ((s32)am_lookup < 0) {
+               /*
+                * MK, MSK, MUSK
+                * Always TLB mapped, unless SegCtl.EU && ERL
+                */
+               if (!eu || !(read_gc0_status() & ST0_ERL))
+                       return true;
+       } else {
+               /* shift bit 23..16 of the table above into the sign bit */
+               am_lookup <<= 8;
+               if ((s32)am_lookup < 0) {
+                       union mips_instruction inst;
+                       unsigned int status;
+                       u32 *opc;
+
+                       /*
+                        * MUSUK
+                        * TLB mapped if not in kernel mode
+                        */
+                       status = read_gc0_status();
+                       if (!(status & (ST0_EXL | ST0_ERL)) &&
+                           (status & ST0_KSU))
+                               return true;
+                       /*
+                        * EVA access instructions in kernel
+                        * mode access user address space.
+                        */
+                       opc = (u32 *)vcpu->arch.pc;
+                       if (vcpu->arch.host_cp0_cause & CAUSEF_BD)
+                               opc += 1;
+                       err = kvm_get_badinstr(opc, vcpu, &inst.word);
+                       if (!err && is_eva_access(inst))
+                               return true;
+               }
+       }
+
+       return false;
+}
+
+/**
+ * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA.
+ * @vcpu:      KVM VCPU state.
+ * @gva:       Guest virtual address to convert.
+ * @gpa:       Output guest physical address.
+ *
+ * Convert a guest virtual address (GVA) which is valid according to the guest
+ * context, to a guest physical address (GPA).
+ *
+ * Returns:    0 on success.
+ *             -errno on failure.
+ */
+static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+                            unsigned long *gpa)
+{
+       u32 gva32 = gva;
+       unsigned long segctl;
+
+       if ((long)gva == (s32)gva32) {
+               /* Handle canonical 32-bit virtual address */
+               if (cpu_guest_has_segments) {
+                       unsigned long mask, pa;
+
+                       /*
+                        * Each 512MB region has a 16-bit SegCtl configuration;
+                        * even-numbered CFGs sit in the upper halfword of the
+                        * SegCtl register, hence the >> 16 below.
+                        */
+                       switch (gva32 >> 29) {
+                       case 0:
+                       case 1: /* CFG5 (1GB) */
+                               segctl = read_gc0_segctl2() >> 16;
+                               mask = (unsigned long)0xfc0000000ull;
+                               break;
+                       case 2:
+                       case 3: /* CFG4 (1GB) */
+                               segctl = read_gc0_segctl2();
+                               mask = (unsigned long)0xfc0000000ull;
+                               break;
+                       case 4: /* CFG3 (512MB) */
+                               segctl = read_gc0_segctl1() >> 16;
+                               mask = (unsigned long)0xfe0000000ull;
+                               break;
+                       case 5: /* CFG2 (512MB) */
+                               segctl = read_gc0_segctl1();
+                               mask = (unsigned long)0xfe0000000ull;
+                               break;
+                       case 6: /* CFG1 (512MB) */
+                               segctl = read_gc0_segctl0() >> 16;
+                               mask = (unsigned long)0xfe0000000ull;
+                               break;
+                       case 7: /* CFG0 (512MB) */
+                               segctl = read_gc0_segctl0();
+                               mask = (unsigned long)0xfe0000000ull;
+                               break;
+                       default:
+                               /*
+                                * GCC 4.9 isn't smart enough to figure out that
+                                * segctl and mask are always initialised.
+                                */
+                               unreachable();
+                       }
+
+                       /* AM is bits 6:4 of the CFG, EU is bit 3 */
+                       if (is_eva_am_mapped(vcpu, (segctl >> 4) & 0x7,
+                                            segctl & 0x0008))
+                               goto tlb_mapped;
+
+                       /* Unmapped, find guest physical address */
+                       pa = (segctl << 20) & mask;
+                       pa |= gva32 & ~mask;
+                       *gpa = pa;
+                       return 0;
+               } else if ((s32)gva32 < (s32)0xc0000000) {
+                       /* legacy unmapped KSeg0 or KSeg1 */
+                       *gpa = gva32 & 0x1fffffff;
+                       return 0;
+               }
+#ifdef CONFIG_64BIT
+       } else if ((gva & 0xc000000000000000) == 0x8000000000000000) {
+               /* XKPHYS */
+               if (cpu_guest_has_segments) {
+                       /*
+                        * Each of the 8 regions can be overridden by SegCtl2.XR
+                        * to use SegCtl1.XAM.
+                        */
+                       segctl = read_gc0_segctl2();
+                       if (segctl & (1ull << (56 + ((gva >> 59) & 0x7)))) {
+                               segctl = read_gc0_segctl1();
+                               if (is_eva_am_mapped(vcpu, (segctl >> 59) & 0x7,
+                                                    0))
+                                       goto tlb_mapped;
+                       }
+
+               }
+               /*
+                * Traditionally fully unmapped.
+                * Bits 61:59 specify the CCA, which we can just mask off here.
+                * Bits 58:PABITS should be zero, but we shouldn't have got here
+                * if it wasn't.
+                */
+               *gpa = gva & 0x07ffffffffffffff;
+               return 0;
+#endif
+       }
+
+tlb_mapped:
+       return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa);
+}
+
+/**
+ * kvm_vz_badvaddr_to_gpa() - Convert GVA BadVAddr from root exception to GPA.
+ * @vcpu:      KVM VCPU state.
+ * @badvaddr:  Root BadVAddr.
+ * @gpa:       Output guest physical address.
+ *
+ * VZ implementations are permitted to report guest virtual addresses (GVA) in
+ * BadVAddr on a root exception during guest execution, instead of the more
+ * convenient guest physical addresses (GPA). When we get a GVA, this function
+ * converts it to a GPA, taking into account guest segmentation and guest TLB
+ * state.
+ *
+ * Returns:    0 on success.
+ *             -errno on failure.
+ */
+static int kvm_vz_badvaddr_to_gpa(struct kvm_vcpu *vcpu, unsigned long badvaddr,
+                                 unsigned long *gpa)
+{
+       /* GExcCode tells us how the hardware reported the faulting address */
+       unsigned int gexccode = (vcpu->arch.host_cp0_guestctl0 &
+                                MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT;
+
+       /* If BadVAddr is GPA, then all is well in the world */
+       if (likely(gexccode == MIPS_GCTL0_GEXC_GPA)) {
+               *gpa = badvaddr;
+               return 0;
+       }
+
+       /* Otherwise we'd expect it to be GVA ... */
+       if (WARN(gexccode != MIPS_GCTL0_GEXC_GVA,
+                "Unexpected gexccode %#x\n", gexccode))
+               return -EINVAL;
+
+       /* ... and we need to perform the GVA->GPA translation in software */
+       return kvm_vz_gva_to_gpa(vcpu, badvaddr, gpa);
+}
+
+/**
+ * kvm_trap_vz_no_handler() - Fallback for unhandled root exception codes.
+ * @vcpu:      Virtual CPU.
+ *
+ * Dump the faulting instruction and VCPU registers for diagnosis, then report
+ * an internal error to userspace.
+ *
+ * Returns:    RESUME_HOST (exit_reason set to KVM_EXIT_INTERNAL_ERROR).
+ */
+static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu)
+{
+       u32 *opc = (u32 *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE;
+       unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
+       u32 inst = 0;
+
+       /*
+        *  Fetch the instruction.
+        */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       kvm_get_badinstr(opc, vcpu, &inst);
+
+       kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n",
+               exccode, opc, inst, badvaddr,
+               read_gc0_status());
+       kvm_arch_vcpu_dump_regs(vcpu);
+       vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+       return RESUME_HOST;
+}
+
+/**
+ * mips_process_maar() - Sanitise a guest write to a MAAR register.
+ * @op:                CP0 rs field of the instruction (mtc_op or dmtc_op).
+ * @val:       Value the guest wrote.
+ *
+ * Mask off bits the guest may not set and derive the VH bit: a 32-bit MTC0
+ * clears VH, while a 64-bit DMTC0 makes VH mirror VL.
+ *
+ * Returns:    The masked value to store in the software MAAR.
+ */
+static unsigned long mips_process_maar(unsigned int op, unsigned long val)
+{
+       /* Mask off unused bits */
+       unsigned long mask = 0xfffff000 | MIPS_MAAR_S | MIPS_MAAR_VL;
+
+       /* upper address bits are only writable with large physical addresses */
+       if (read_gc0_pagegrain() & PG_ELPA)
+               mask |= 0x00ffffff00000000ull;
+       if (cpu_guest_has_mvh)
+               mask |= MIPS_MAAR_VH;
+
+       /* Set or clear VH */
+       if (op == mtc_op) {
+               /* clear VH */
+               val &= ~MIPS_MAAR_VH;
+       } else if (op == dmtc_op) {
+               /* set VH to match VL */
+               val &= ~MIPS_MAAR_VH;
+               if (val & MIPS_MAAR_VL)
+                       val |= MIPS_MAAR_VH;
+       }
+
+       return val & mask;
+}
+
+/**
+ * kvm_write_maari() - Emulate a guest write to the MAARI index register.
+ * @vcpu:      Virtual CPU.
+ * @val:       Value the guest wrote.
+ *
+ * Writing all ones selects the highest implemented MAAR index; other values
+ * are accepted only if they are within range, otherwise the write is dropped.
+ */
+static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+       val &= MIPS_MAARI_INDEX;
+       if (val == MIPS_MAARI_INDEX)
+               kvm_write_sw_gc0_maari(cop0, ARRAY_SIZE(vcpu->arch.maar) - 1);
+       else if (val < ARRAY_SIZE(vcpu->arch.maar))
+               kvm_write_sw_gc0_maari(cop0, val);
+}
+
+/**
+ * kvm_vz_gpsi_cop0() - Emulate a guest COP0 instruction that caused a GPSI.
+ * @inst:      32-bit instruction encoding.
+ * @opc:       Guest PC of the faulting instruction.
+ * @cause:     Guest Cause register (for branch-delay-slot PC handling).
+ * @run:       KVM run structure.
+ * @vcpu:      Virtual CPU.
+ *
+ * Emulate WAIT and the MFC0/DMFC0/MTC0/DMTC0 accesses to the CP0 registers
+ * this implementation virtualises in software (Count, Compare, LLAddr, MAAR,
+ * MAARI, ErrCtl, and a few read-only registers). The PC is advanced first and
+ * rolled back if emulation fails.
+ *
+ * Returns:    EMULATE_DONE on success, EMULATE_FAIL otherwise.
+ */
+static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst,
+                                             u32 *opc, u32 cause,
+                                             struct kvm_run *run,
+                                             struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       enum emulation_result er = EMULATE_DONE;
+       u32 rt, rd, sel;
+       unsigned long curr_pc;
+       unsigned long val;
+
+       /*
+        * Update PC and hold onto current PC in case there is
+        * an error and we want to rollback the PC
+        */
+       curr_pc = vcpu->arch.pc;
+       er = update_pc(vcpu, cause);
+       if (er == EMULATE_FAIL)
+               return er;
+
+       if (inst.co_format.co) {
+               switch (inst.co_format.func) {
+               case wait_op:
+                       er = kvm_mips_emul_wait(vcpu);
+                       break;
+               default:
+                       er = EMULATE_FAIL;
+               }
+       } else {
+               rt = inst.c0r_format.rt;
+               rd = inst.c0r_format.rd;
+               sel = inst.c0r_format.sel;
+
+               switch (inst.c0r_format.rs) {
+               case dmfc_op:
+               case mfc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+                       cop0->stat[rd][sel]++;
+#endif
+                       if (rd == MIPS_CP0_COUNT &&
+                           sel == 0) {                 /* Count */
+                               val = kvm_mips_read_count(vcpu);
+                       } else if (rd == MIPS_CP0_COMPARE &&
+                                  sel == 0) {          /* Compare */
+                               val = read_gc0_compare();
+                       } else if (rd == MIPS_CP0_LLADDR &&
+                                  sel == 0) {          /* LLAddr */
+                               /* only the LLB bit is visible to the guest */
+                               if (cpu_guest_has_rw_llb)
+                                       val = read_gc0_lladdr() &
+                                               MIPS_LLADDR_LLB;
+                               else
+                                       val = 0;
+                       } else if (rd == MIPS_CP0_LLADDR &&
+                                  sel == 1 &&          /* MAAR */
+                                  cpu_guest_has_maar &&
+                                  !cpu_guest_has_dyn_maar) {
+                               /* MAARI must be in range */
+                               BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+                                               ARRAY_SIZE(vcpu->arch.maar));
+                               val = vcpu->arch.maar[
+                                       kvm_read_sw_gc0_maari(cop0)];
+                       } else if ((rd == MIPS_CP0_PRID &&
+                                   (sel == 0 ||        /* PRid */
+                                    sel == 2 ||        /* CDMMBase */
+                                    sel == 3)) ||      /* CMGCRBase */
+                                  (rd == MIPS_CP0_STATUS &&
+                                   (sel == 2 ||        /* SRSCtl */
+                                    sel == 3)) ||      /* SRSMap */
+                                  (rd == MIPS_CP0_CONFIG &&
+                                   (sel == 7)) ||      /* Config7 */
+                                  (rd == MIPS_CP0_LLADDR &&
+                                   (sel == 2) &&       /* MAARI */
+                                   cpu_guest_has_maar &&
+                                   !cpu_guest_has_dyn_maar) ||
+                                  (rd == MIPS_CP0_ERRCTL &&
+                                   (sel == 0))) {      /* ErrCtl */
+                               /* read-only or software-shadowed registers */
+                               val = cop0->reg[rd][sel];
+                       } else {
+                               val = 0;
+                               er = EMULATE_FAIL;
+                       }
+
+                       if (er != EMULATE_FAIL) {
+                               /* Sign extend */
+                               if (inst.c0r_format.rs == mfc_op)
+                                       val = (int)val;
+                               vcpu->arch.gprs[rt] = val;
+                       }
+
+                       trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mfc_op) ?
+                                       KVM_TRACE_MFC0 : KVM_TRACE_DMFC0,
+                                     KVM_TRACE_COP0(rd, sel), val);
+                       break;
+
+               case dmtc_op:
+               case mtc_op:
+#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
+                       cop0->stat[rd][sel]++;
+#endif
+                       val = vcpu->arch.gprs[rt];
+                       trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mtc_op) ?
+                                       KVM_TRACE_MTC0 : KVM_TRACE_DMTC0,
+                                     KVM_TRACE_COP0(rd, sel), val);
+
+                       if (rd == MIPS_CP0_COUNT &&
+                           sel == 0) {                 /* Count */
+                               /* writing Count invalidates the hard timer */
+                               kvm_vz_lose_htimer(vcpu);
+                               kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
+                       } else if (rd == MIPS_CP0_COMPARE &&
+                                  sel == 0) {          /* Compare */
+                               kvm_mips_write_compare(vcpu,
+                                                      vcpu->arch.gprs[rt],
+                                                      true);
+                       } else if (rd == MIPS_CP0_LLADDR &&
+                                  sel == 0) {          /* LLAddr */
+                               /*
+                                * P5600 generates GPSI on guest MTC0 LLAddr.
+                                * Only allow the guest to clear LLB.
+                                */
+                               if (cpu_guest_has_rw_llb &&
+                                   !(val & MIPS_LLADDR_LLB))
+                                       write_gc0_lladdr(0);
+                       } else if (rd == MIPS_CP0_LLADDR &&
+                                  sel == 1 &&          /* MAAR */
+                                  cpu_guest_has_maar &&
+                                  !cpu_guest_has_dyn_maar) {
+                               val = mips_process_maar(inst.c0r_format.rs,
+                                                       val);
+
+                               /* MAARI must be in range */
+                               BUG_ON(kvm_read_sw_gc0_maari(cop0) >=
+                                               ARRAY_SIZE(vcpu->arch.maar));
+                               vcpu->arch.maar[kvm_read_sw_gc0_maari(cop0)] =
+                                                                       val;
+                       } else if (rd == MIPS_CP0_LLADDR &&
+                                  (sel == 2) &&        /* MAARI */
+                                  cpu_guest_has_maar &&
+                                  !cpu_guest_has_dyn_maar) {
+                               kvm_write_maari(vcpu, val);
+                       } else if (rd == MIPS_CP0_ERRCTL &&
+                                  (sel == 0)) {        /* ErrCtl */
+                               /* ignore the written value */
+                       } else {
+                               er = EMULATE_FAIL;
+                       }
+                       break;
+
+               default:
+                       er = EMULATE_FAIL;
+                       break;
+               }
+       }
+       /* Rollback PC only if emulation was unsuccessful */
+       if (er == EMULATE_FAIL) {
+               kvm_err("[%#lx]%s: unsupported cop0 instruction 0x%08x\n",
+                       curr_pc, __func__, inst.word);
+
+               vcpu->arch.pc = curr_pc;
+       }
+
+       return er;
+}
+
+/**
+ * kvm_vz_gpsi_cache() - Emulate a guest CACHE instruction that caused a GPSI.
+ * @inst:      32-bit instruction encoding.
+ * @opc:       Guest PC of the faulting instruction.
+ * @cause:     Guest Cause register (for branch-delay-slot PC handling).
+ * @run:       KVM run structure.
+ * @vcpu:      Virtual CPU.
+ *
+ * Emulate index invalidate and hit invalidate/writeback cache operations on
+ * the primary caches; secondary/tertiary cache ops are silently ignored. The
+ * PC is advanced first and rolled back if emulation fails.
+ *
+ * Returns:    EMULATE_DONE on success, EMULATE_FAIL otherwise.
+ */
+static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst,
+                                              u32 *opc, u32 cause,
+                                              struct kvm_run *run,
+                                              struct kvm_vcpu *vcpu)
+{
+       enum emulation_result er = EMULATE_DONE;
+       u32 cache, op_inst, op, base;
+       s16 offset;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       unsigned long va, curr_pc;
+
+       /*
+        * Update PC and hold onto current PC in case there is
+        * an error and we want to rollback the PC
+        */
+       curr_pc = vcpu->arch.pc;
+       er = update_pc(vcpu, cause);
+       if (er == EMULATE_FAIL)
+               return er;
+
+       /* R6 moved the CACHE immediate into the SPECIAL3 encoding */
+       base = inst.i_format.rs;
+       op_inst = inst.i_format.rt;
+       if (cpu_has_mips_r6)
+               offset = inst.spec3_format.simmediate;
+       else
+               offset = inst.i_format.simmediate;
+       cache = op_inst & CacheOp_Cache;
+       op = op_inst & CacheOp_Op;
+
+       va = arch->gprs[base] + offset;
+
+       kvm_debug("CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+                 cache, op, base, arch->gprs[base], offset);
+
+       /* Secondary or tertiary cache ops ignored */
+       if (cache != Cache_I && cache != Cache_D)
+               return EMULATE_DONE;
+
+       switch (op_inst) {
+       case Index_Invalidate_I:
+               flush_icache_line_indexed(va);
+               return EMULATE_DONE;
+       case Index_Writeback_Inv_D:
+               flush_dcache_line_indexed(va);
+               return EMULATE_DONE;
+       case Hit_Invalidate_I:
+       case Hit_Invalidate_D:
+       case Hit_Writeback_Inv_D:
+               if (boot_cpu_type() == CPU_CAVIUM_OCTEON3) {
+                       /* We can just flush entire icache */
+                       local_flush_icache_range(0, 0);
+                       return EMULATE_DONE;
+               }
+
+               /* So far, other platforms support guest hit cache ops */
+               break;
+       default:
+               break;
+       }
+
+       kvm_err("@ %#lx/%#lx CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
+               curr_pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base],
+               offset);
+       /* Rollback PC */
+       vcpu->arch.pc = curr_pc;
+
+       return EMULATE_FAIL;
+}
+
+static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc,
+                                                    struct kvm_vcpu *vcpu)
+{
+       enum emulation_result er = EMULATE_DONE;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       struct kvm_run *run = vcpu->run;
+       union mips_instruction inst;
+       int rd, rt, sel;
+       int err;
+
+       /*
+        *  Fetch the instruction.
+        */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       err = kvm_get_badinstr(opc, vcpu, &inst.word);
+       if (err)
+               return EMULATE_FAIL;
+
+       switch (inst.r_format.opcode) {
+       case cop0_op:
+               er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu);
+               break;
+#ifndef CONFIG_CPU_MIPSR6
+       case cache_op:
+               trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+               er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+               break;
+#endif
+       case spec3_op:
+               switch (inst.spec3_format.func) {
+#ifdef CONFIG_CPU_MIPSR6
+               case cache6_op:
+                       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+                       er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu);
+                       break;
+#endif
+               case rdhwr_op:
+                       if (inst.r_format.rs || (inst.r_format.re >> 3))
+                               goto unknown;
+
+                       rd = inst.r_format.rd;
+                       rt = inst.r_format.rt;
+                       sel = inst.r_format.re & 0x7;
+
+                       switch (rd) {
+                       case MIPS_HWR_CC:       /* Read count register */
+                               arch->gprs[rt] =
+                                       (long)(int)kvm_mips_read_count(vcpu);
+                               break;
+                       default:
+                               trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+                                             KVM_TRACE_HWR(rd, sel), 0);
+                               goto unknown;
+                       };
+
+                       trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR,
+                                     KVM_TRACE_HWR(rd, sel), arch->gprs[rt]);
+
+                       er = update_pc(vcpu, cause);
+                       break;
+               default:
+                       goto unknown;
+               };
+               break;
+unknown:
+
+       default:
+               kvm_err("GPSI exception not supported (%p/%#x)\n",
+                               opc, inst.word);
+               kvm_arch_vcpu_dump_regs(vcpu);
+               er = EMULATE_FAIL;
+               break;
+       }
+
+       return er;
+}
+
+/**
+ * kvm_trap_vz_handle_gsfc() - Handle a Guest Software Field Change exception.
+ * @cause:     Cause register from the root exception.
+ * @opc:       Root PC of the faulting instruction.
+ * @vcpu:      Virtual CPU.
+ *
+ * Complete a guest MTC0 to a CP0 field whose modification the hardware traps
+ * on (Status, Cause, IntCtl, Config5), applying only changes the guest is
+ * permitted to make, then advance the guest PC.
+ *
+ * Returns:    EMULATE_DONE on success, EMULATE_FAIL for unrecognized
+ *             instructions or unsupported field changes.
+ */
+static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc,
+                                                    struct kvm_vcpu *vcpu)
+{
+       enum emulation_result er = EMULATE_DONE;
+       struct kvm_vcpu_arch *arch = &vcpu->arch;
+       union mips_instruction inst;
+       int err;
+
+       /*
+        *  Fetch the instruction.
+        */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       err = kvm_get_badinstr(opc, vcpu, &inst.word);
+       if (err)
+               return EMULATE_FAIL;
+
+       /* complete MTC0 on behalf of guest and advance EPC */
+       if (inst.c0r_format.opcode == cop0_op &&
+           inst.c0r_format.rs == mtc_op &&
+           inst.c0r_format.z == 0) {
+               int rt = inst.c0r_format.rt;
+               int rd = inst.c0r_format.rd;
+               int sel = inst.c0r_format.sel;
+               unsigned int val = arch->gprs[rt];
+               unsigned int old_val, change;
+
+               trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, KVM_TRACE_COP0(rd, sel),
+                             val);
+
+               if ((rd == MIPS_CP0_STATUS) && (sel == 0)) {
+                       /* FR bit should read as zero if no FPU */
+                       if (!kvm_mips_guest_has_fpu(&vcpu->arch))
+                               val &= ~(ST0_CU1 | ST0_FR);
+
+                       /*
+                        * Also don't allow FR to be set if host doesn't support
+                        * it.
+                        */
+                       if (!(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+                               val &= ~ST0_FR;
+
+                       old_val = read_gc0_status();
+                       change = val ^ old_val;
+
+                       if (change & ST0_FR) {
+                               /*
+                                * FPU and Vector register state is made
+                                * UNPREDICTABLE by a change of FR, so don't
+                                * even bother saving it.
+                                */
+                               kvm_drop_fpu(vcpu);
+                       }
+
+                       /*
+                        * If MSA state is already live, it is undefined how it
+                        * interacts with FR=0 FPU state, and we don't want to
+                        * hit reserved instruction exceptions trying to save
+                        * the MSA state later when CU=1 && FR=1, so play it
+                        * safe and save it first.
+                        */
+                       if (change & ST0_CU1 && !(val & ST0_FR) &&
+                           vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
+                               kvm_lose_fpu(vcpu);
+
+                       write_gc0_status(val);
+               } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
+                       u32 old_cause = read_gc0_cause();
+                       u32 change = old_cause ^ val;
+
+                       /* DC bit enabling/disabling timer? */
+                       if (change & CAUSEF_DC) {
+                               if (val & CAUSEF_DC) {
+                                       kvm_vz_lose_htimer(vcpu);
+                                       kvm_mips_count_disable_cause(vcpu);
+                               } else {
+                                       kvm_mips_count_enable_cause(vcpu);
+                               }
+                       }
+
+                       /* Only certain bits are RW to the guest */
+                       change &= (CAUSEF_DC | CAUSEF_IV | CAUSEF_WP |
+                                  CAUSEF_IP0 | CAUSEF_IP1);
+
+                       /* WP can only be cleared */
+                       change &= ~CAUSEF_WP | old_cause;
+
+                       write_gc0_cause(old_cause ^ change);
+               } else if ((rd == MIPS_CP0_STATUS) && (sel == 1)) { /* IntCtl */
+                       write_gc0_intctl(val);
+               } else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) {
+                       old_val = read_gc0_config5();
+                       change = val ^ old_val;
+                       /* Handle changes in FPU/MSA modes */
+                       preempt_disable();
+
+                       /*
+                        * Propagate FRE changes immediately if the FPU
+                        * context is already loaded.
+                        */
+                       if (change & MIPS_CONF5_FRE &&
+                           vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
+                               change_c0_config5(MIPS_CONF5_FRE, val);
+
+                       preempt_enable();
+
+                       /* apply only the writable Config5 bits to the guest */
+                       val = old_val ^
+                               (change & kvm_vz_config5_guest_wrmask(vcpu));
+                       write_gc0_config5(val);
+               } else {
+                       kvm_err("Handle GSFC, unsupported field change @ %p: %#x\n",
+                           opc, inst.word);
+                       er = EMULATE_FAIL;
+               }
+
+               if (er != EMULATE_FAIL)
+                       er = update_pc(vcpu, cause);
+       } else {
+               kvm_err("Handle GSFC, unrecognized instruction @ %p: %#x\n",
+                       opc, inst.word);
+               er = EMULATE_FAIL;
+       }
+
+       return er;
+}
+
+/**
+ * kvm_trap_vz_handle_ghfc() - Handle a Guest Hardware Field Change exception.
+ * @cause:     Cause register from the root exception (unused).
+ * @opc:       Root PC of the faulting instruction (unused).
+ * @vcpu:      Virtual CPU.
+ *
+ * No emulation is needed; just record the event for tracing.
+ *
+ * Returns:    EMULATE_DONE.
+ */
+static enum emulation_result kvm_trap_vz_handle_ghfc(u32 cause, u32 *opc,
+                                                    struct kvm_vcpu *vcpu)
+{
+       /*
+        * Presumably this is due to MC (guest mode change), so lets trace some
+        * relevant info.
+        */
+       trace_kvm_guest_mode_change(vcpu);
+
+       return EMULATE_DONE;
+}
+
+/**
+ * kvm_trap_vz_handle_hc() - Handle a guest hypercall exception.
+ * @cause:     Cause register from the root exception.
+ * @opc:       Root PC of the faulting instruction.
+ * @vcpu:      Virtual CPU.
+ *
+ * Fetch the HYPCALL instruction, advance the guest PC, and pass it on to the
+ * hypercall emulation; the PC is rolled back on failure.
+ *
+ * Returns:    Result of kvm_mips_emul_hypcall(), or EMULATE_FAIL if the
+ *             instruction could not be fetched.
+ */
+static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc,
+                                                  struct kvm_vcpu *vcpu)
+{
+       enum emulation_result er;
+       union mips_instruction inst;
+       unsigned long curr_pc;
+       int err;
+
+       /* Fetch the instruction (accounting for branch delay slots). */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       err = kvm_get_badinstr(opc, vcpu, &inst.word);
+       if (err)
+               return EMULATE_FAIL;
+
+       /*
+        * Update PC and hold onto current PC in case there is
+        * an error and we want to rollback the PC
+        */
+       curr_pc = vcpu->arch.pc;
+       er = update_pc(vcpu, cause);
+       if (er == EMULATE_FAIL)
+               return er;
+
+       er = kvm_mips_emul_hypcall(vcpu, inst);
+       if (er == EMULATE_FAIL)
+               vcpu->arch.pc = curr_pc;
+
+       return er;
+}
+
+/**
+ * kvm_trap_vz_no_handler_guest_exit() - Fallback for unhandled guest exits.
+ * @gexccode:  Guest exception code (GuestCtl0.GExcCode).
+ * @cause:     Cause register from the root exception.
+ * @opc:       Root PC of the faulting instruction.
+ * @vcpu:      Virtual CPU.
+ *
+ * Log the unhandled guest exception together with the faulting instruction.
+ *
+ * Returns:    EMULATE_FAIL.
+ */
+static enum emulation_result kvm_trap_vz_no_handler_guest_exit(u32 gexccode,
+                                                       u32 cause,
+                                                       u32 *opc,
+                                                       struct kvm_vcpu *vcpu)
+{
+       u32 inst;
+
+       /*
+        *  Fetch the instruction.
+        */
+       if (cause & CAUSEF_BD)
+               opc += 1;
+       kvm_get_badinstr(opc, vcpu, &inst);
+
+       kvm_err("Guest Exception Code: %d not yet handled @ PC: %p, inst: 0x%08x  Status: %#x\n",
+               gexccode, opc, inst, read_gc0_status());
+
+       return EMULATE_FAIL;
+}
+
+/*
+ * kvm_trap_vz_handle_guest_exit() - Dispatch a hypervisor guest exit.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Decode the guest exception code (GExcCode) from the saved GuestCtl0 and
+ * route the exit to the matching handler, bumping the corresponding exit
+ * statistic.  GRR, GVA and GPA exits (and reserved codes) currently have no
+ * handler and are reported as internal errors.
+ *
+ * Return: RESUME_GUEST to re-enter the guest, or RESUME_HOST to return to
+ * userspace (hypercall completion or internal error).
+ */
+static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu)
+{
+       u32 *opc = (u32 *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_DONE;
+       /* GExcCode field of GuestCtl0 identifies the reason for the exit */
+       u32 gexccode = (vcpu->arch.host_cp0_guestctl0 &
+                       MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT;
+       int ret = RESUME_GUEST;
+
+       trace_kvm_exit(vcpu, KVM_TRACE_EXIT_GEXCCODE_BASE + gexccode);
+       switch (gexccode) {
+       case MIPS_GCTL0_GEXC_GPSI:
+               /* Guest Privileged Sensitive Instruction */
+               ++vcpu->stat.vz_gpsi_exits;
+               er = kvm_trap_vz_handle_gpsi(cause, opc, vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_GSFC:
+               /* Guest Software Field Change */
+               ++vcpu->stat.vz_gsfc_exits;
+               er = kvm_trap_vz_handle_gsfc(cause, opc, vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_HC:
+               /* Hypercall */
+               ++vcpu->stat.vz_hc_exits;
+               er = kvm_trap_vz_handle_hc(cause, opc, vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_GRR:
+               /* Guest Reserved instruction Redirect - unhandled */
+               ++vcpu->stat.vz_grr_exits;
+               er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+                                                      vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_GVA:
+               /* Guest Virtual Address available - unhandled */
+               ++vcpu->stat.vz_gva_exits;
+               er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+                                                      vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_GHFC:
+               /* Guest Hardware Field Change */
+               ++vcpu->stat.vz_ghfc_exits;
+               er = kvm_trap_vz_handle_ghfc(cause, opc, vcpu);
+               break;
+       case MIPS_GCTL0_GEXC_GPA:
+               /* Guest Physical Address available - unhandled */
+               ++vcpu->stat.vz_gpa_exits;
+               er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+                                                      vcpu);
+               break;
+       default:
+               /* Reserved exception codes */
+               ++vcpu->stat.vz_resvd_exits;
+               er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc,
+                                                      vcpu);
+               break;
+
+       }
+
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else if (er == EMULATE_HYPERCALL) {
+               /* Exit to userspace to complete the hypercall */
+               ret = kvm_mips_handle_hypcall(vcpu);
+       } else {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Handle when the guest attempts to use a coprocessor which hasn't been allowed
+ * by the root context.
+ *
+ * Return: RESUME_GUEST once the FPU has been handed to the guest, RESUME_HOST
+ * (internal error) for any other coprocessor.
+ */
+static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       enum emulation_result er = EMULATE_FAIL;
+       int ret = RESUME_GUEST;
+
+       /* Coprocessor 1 (FPU)?  CE field of Cause says which CP faulted. */
+       if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) {
+               /*
+                * If guest FPU not present, the FPU operation should have been
+                * treated as a reserved instruction!
+                * If FPU already in use, we shouldn't get this at all.
+                */
+               if (WARN_ON(!kvm_mips_guest_has_fpu(&vcpu->arch) ||
+                           vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
+                       /*
+                        * NOTE(review): this returns EMULATE_FAIL from a
+                        * function whose other paths return RESUME_* codes,
+                        * and the matching preempt_disable() is not visible
+                        * here -- confirm intent against the callers.
+                        */
+                       preempt_enable();
+                       return EMULATE_FAIL;
+               }
+
+               /* Give the guest the FPU and restore its FP context */
+               kvm_own_fpu(vcpu);
+               er = EMULATE_DONE;
+       }
+       /* other coprocessors not handled */
+
+       switch (er) {
+       case EMULATE_DONE:
+               ret = RESUME_GUEST;
+               break;
+
+       case EMULATE_FAIL:
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+               break;
+
+       default:
+               BUG();
+       }
+       return ret;
+}
+
+/**
+ * kvm_trap_vz_handle_msa_disabled() - Guest used MSA while disabled in root.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Handle when the guest attempts to use MSA when it is disabled in the root
+ * context.
+ *
+ * Return: RESUME_GUEST once MSA has been handed to the guest, RESUME_HOST
+ * (internal error) if the exit should have been impossible.
+ */
+static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+
+       /*
+        * If MSA not present or not exposed to guest or FR=0, the MSA operation
+        * should have been treated as a reserved instruction!
+        * Same if CU1=1, FR=0.
+        * If MSA already in use, we shouldn't get this at all.
+        */
+       if (!kvm_mips_guest_has_msa(&vcpu->arch) ||
+           (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 ||
+           !(read_gc0_config5() & MIPS_CONF5_MSAEN) ||
+           vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               return RESUME_HOST;
+       }
+
+       /* Give the guest MSA and restore its vector context */
+       kvm_own_msa(vcpu);
+
+       return RESUME_GUEST;
+}
+
+/*
+ * kvm_trap_vz_handle_tlb_ld_miss() - Handle a root TLB miss on a guest load.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Try to resolve the faulting address in the root TLB.  If it cannot be
+ * mapped, treat the access as MMIO: fetch the faulting load instruction and
+ * emulate it, exiting to userspace to complete the MMIO access if required.
+ *
+ * Return: RESUME_GUEST on success, RESUME_HOST for MMIO completion in
+ * userspace or on internal error.
+ */
+static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       u32 *opc = (u32 *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+       union mips_instruction inst;
+       enum emulation_result er = EMULATE_DONE;
+       int err, ret = RESUME_GUEST;
+
+       if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, false)) {
+               /* A code fetch fault doesn't count as an MMIO */
+               if (kvm_is_ifetch_fault(&vcpu->arch)) {
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       return RESUME_HOST;
+               }
+
+               /* Fetch the instruction */
+               if (cause & CAUSEF_BD)
+                       opc += 1;
+               err = kvm_get_badinstr(opc, vcpu, &inst.word);
+               if (err) {
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       return RESUME_HOST;
+               }
+
+               /* Treat as MMIO */
+               er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+               if (er == EMULATE_FAIL) {
+                       kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+                               opc, badvaddr);
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               }
+       }
+
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else if (er == EMULATE_DO_MMIO) {
+               /* Userspace must perform the MMIO load and resume us */
+               run->exit_reason = KVM_EXIT_MMIO;
+               ret = RESUME_HOST;
+       } else {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+/*
+ * kvm_trap_vz_handle_tlb_st_miss() - Handle a root TLB miss on a guest store.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Translate the bad virtual address to a guest physical address (retrying the
+ * access if translation fails), then try to resolve it in the root TLB.  If
+ * it cannot be mapped, treat the access as MMIO: fetch the faulting store
+ * instruction and emulate it, exiting to userspace if required.
+ *
+ * Return: RESUME_GUEST on success, RESUME_HOST for MMIO completion in
+ * userspace or on internal error.
+ */
+static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+       struct kvm_run *run = vcpu->run;
+       u32 *opc = (u32 *) vcpu->arch.pc;
+       u32 cause = vcpu->arch.host_cp0_cause;
+       ulong badvaddr = vcpu->arch.host_cp0_badvaddr;
+       union mips_instruction inst;
+       enum emulation_result er = EMULATE_DONE;
+       int err;
+       int ret = RESUME_GUEST;
+
+       /* Just try the access again if we couldn't do the translation */
+       if (kvm_vz_badvaddr_to_gpa(vcpu, badvaddr, &badvaddr))
+               return RESUME_GUEST;
+       vcpu->arch.host_cp0_badvaddr = badvaddr;
+
+       if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, true)) {
+               /* Fetch the instruction */
+               if (cause & CAUSEF_BD)
+                       opc += 1;
+               err = kvm_get_badinstr(opc, vcpu, &inst.word);
+               if (err) {
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+                       return RESUME_HOST;
+               }
+
+               /* Treat as MMIO */
+               er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+               if (er == EMULATE_FAIL) {
+                       kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n",
+                               opc, badvaddr);
+                       run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               }
+       }
+
+       if (er == EMULATE_DONE) {
+               ret = RESUME_GUEST;
+       } else if (er == EMULATE_DO_MMIO) {
+               /* Userspace must perform the MMIO store and resume us */
+               run->exit_reason = KVM_EXIT_MMIO;
+               ret = RESUME_HOST;
+       } else {
+               run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               ret = RESUME_HOST;
+       }
+       return ret;
+}
+
+/* ONE_REG register indices always exposed for VZ guests (KVM_GET_REG_LIST) */
+static u64 kvm_vz_get_one_regs[] = {
+       KVM_REG_MIPS_CP0_INDEX,
+       KVM_REG_MIPS_CP0_ENTRYLO0,
+       KVM_REG_MIPS_CP0_ENTRYLO1,
+       KVM_REG_MIPS_CP0_CONTEXT,
+       KVM_REG_MIPS_CP0_PAGEMASK,
+       KVM_REG_MIPS_CP0_PAGEGRAIN,
+       KVM_REG_MIPS_CP0_WIRED,
+       KVM_REG_MIPS_CP0_HWRENA,
+       KVM_REG_MIPS_CP0_BADVADDR,
+       KVM_REG_MIPS_CP0_COUNT,
+       KVM_REG_MIPS_CP0_ENTRYHI,
+       KVM_REG_MIPS_CP0_COMPARE,
+       KVM_REG_MIPS_CP0_STATUS,
+       KVM_REG_MIPS_CP0_INTCTL,
+       KVM_REG_MIPS_CP0_CAUSE,
+       KVM_REG_MIPS_CP0_EPC,
+       KVM_REG_MIPS_CP0_PRID,
+       KVM_REG_MIPS_CP0_EBASE,
+       KVM_REG_MIPS_CP0_CONFIG,
+       KVM_REG_MIPS_CP0_CONFIG1,
+       KVM_REG_MIPS_CP0_CONFIG2,
+       KVM_REG_MIPS_CP0_CONFIG3,
+       KVM_REG_MIPS_CP0_CONFIG4,
+       KVM_REG_MIPS_CP0_CONFIG5,
+#ifdef CONFIG_64BIT
+       KVM_REG_MIPS_CP0_XCONTEXT,
+#endif
+       KVM_REG_MIPS_CP0_ERROREPC,
+
+       /* KVM-specific timer state registers */
+       KVM_REG_MIPS_COUNT_CTL,
+       KVM_REG_MIPS_COUNT_RESUME,
+       KVM_REG_MIPS_COUNT_HZ,
+};
+
+/* Extra indices exposed only when the guest implements ContextConfig */
+static u64 kvm_vz_get_one_regs_contextconfig[] = {
+       KVM_REG_MIPS_CP0_CONTEXTCONFIG,
+#ifdef CONFIG_64BIT
+       KVM_REG_MIPS_CP0_XCONTEXTCONFIG,
+#endif
+};
+
+/* Extra indices exposed only when the guest has segmentation control */
+static u64 kvm_vz_get_one_regs_segments[] = {
+       KVM_REG_MIPS_CP0_SEGCTL0,
+       KVM_REG_MIPS_CP0_SEGCTL1,
+       KVM_REG_MIPS_CP0_SEGCTL2,
+};
+
+/* Extra indices exposed only when the guest has a hardware page table walker */
+static u64 kvm_vz_get_one_regs_htw[] = {
+       KVM_REG_MIPS_CP0_PWBASE,
+       KVM_REG_MIPS_CP0_PWFIELD,
+       KVM_REG_MIPS_CP0_PWSIZE,
+       KVM_REG_MIPS_CP0_PWCTL,
+};
+
+/* KScratch indices; each is exposed only if the corresponding register exists */
+static u64 kvm_vz_get_one_regs_kscratch[] = {
+       KVM_REG_MIPS_CP0_KSCRATCH1,
+       KVM_REG_MIPS_CP0_KSCRATCH2,
+       KVM_REG_MIPS_CP0_KSCRATCH3,
+       KVM_REG_MIPS_CP0_KSCRATCH4,
+       KVM_REG_MIPS_CP0_KSCRATCH5,
+       KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+/*
+ * kvm_vz_num_regs() - Count the ONE_REG indices exposed for this VCPU.
+ * @vcpu:      Virtual CPU context.
+ *
+ * Must stay in sync with kvm_vz_copy_reg_indices(), which emits exactly this
+ * many indices.
+ *
+ * Return: number of register indices available via KVM_GET_REG_LIST.
+ */
+static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu)
+{
+       unsigned long n = ARRAY_SIZE(kvm_vz_get_one_regs);
+
+       /* Optional single registers */
+       if (cpu_guest_has_userlocal)
+               n += 1;
+       if (cpu_guest_has_badinstr)
+               n += 1;
+       if (cpu_guest_has_badinstrp)
+               n += 1;
+       /* Optional register groups */
+       if (cpu_guest_has_contextconfig)
+               n += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+       if (cpu_guest_has_segments)
+               n += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+       if (cpu_guest_has_htw)
+               n += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+       /* All MAAR registers plus the MAARI index register */
+       if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar)
+               n += ARRAY_SIZE(vcpu->arch.maar) + 1;
+       /* One index per implemented KScratch register */
+       n += __arch_hweight8(cpu_data[0].guest.kscratch_mask);
+
+       return n;
+}
+
+/*
+ * kvm_vz_copy_reg_indices() - Copy the ONE_REG index list to userspace.
+ * @vcpu:      Virtual CPU context.
+ * @indices:   Userspace buffer to receive the register indices.
+ *
+ * The set and order of indices written here must match the count returned by
+ * kvm_vz_num_regs().
+ *
+ * Return: 0 on success, -EFAULT if the userspace copy fails.
+ */
+static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+       u64 index;
+       unsigned int i;
+
+       /* Registers unconditionally exposed */
+       if (copy_to_user(indices, kvm_vz_get_one_regs,
+                        sizeof(kvm_vz_get_one_regs)))
+               return -EFAULT;
+       indices += ARRAY_SIZE(kvm_vz_get_one_regs);
+
+       if (cpu_guest_has_userlocal) {
+               index = KVM_REG_MIPS_CP0_USERLOCAL;
+               if (copy_to_user(indices, &index, sizeof(index)))
+                       return -EFAULT;
+               ++indices;
+       }
+       if (cpu_guest_has_badinstr) {
+               index = KVM_REG_MIPS_CP0_BADINSTR;
+               if (copy_to_user(indices, &index, sizeof(index)))
+                       return -EFAULT;
+               ++indices;
+       }
+       if (cpu_guest_has_badinstrp) {
+               index = KVM_REG_MIPS_CP0_BADINSTRP;
+               if (copy_to_user(indices, &index, sizeof(index)))
+                       return -EFAULT;
+               ++indices;
+       }
+       if (cpu_guest_has_contextconfig) {
+               if (copy_to_user(indices, kvm_vz_get_one_regs_contextconfig,
+                                sizeof(kvm_vz_get_one_regs_contextconfig)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig);
+       }
+       if (cpu_guest_has_segments) {
+               if (copy_to_user(indices, kvm_vz_get_one_regs_segments,
+                                sizeof(kvm_vz_get_one_regs_segments)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments);
+       }
+       if (cpu_guest_has_htw) {
+               if (copy_to_user(indices, kvm_vz_get_one_regs_htw,
+                                sizeof(kvm_vz_get_one_regs_htw)))
+                       return -EFAULT;
+               indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw);
+       }
+       if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) {
+               for (i = 0; i < ARRAY_SIZE(vcpu->arch.maar); ++i) {
+                       index = KVM_REG_MIPS_CP0_MAAR(i);
+                       if (copy_to_user(indices, &index, sizeof(index)))
+                               return -EFAULT;
+                       ++indices;
+               }
+
+               index = KVM_REG_MIPS_CP0_MAARI;
+               if (copy_to_user(indices, &index, sizeof(index)))
+                       return -EFAULT;
+               ++indices;
+       }
+       /* KScratch registers 2..7; only those implemented by the guest */
+       for (i = 0; i < 6; ++i) {
+               if (!cpu_guest_has_kscr(i + 2))
+                       continue;
+
+               if (copy_to_user(indices, &kvm_vz_get_one_regs_kscratch[i],
+                                sizeof(kvm_vz_get_one_regs_kscratch[i])))
+                       return -EFAULT;
+               ++indices;
+       }
+
+       return 0;
+}
+
+/*
+ * entrylo_kvm_to_user() - Convert a hardware EntryLo value to ONE_REG form.
+ * @v:         Native EntryLo register value.
+ *
+ * Return: the 64-bit EntryLo layout expected by the KVM API; on 32-bit
+ * kernels the RI/XI bits are relocated to their 64-bit positions.
+ */
+static inline s64 entrylo_kvm_to_user(unsigned long v)
+{
+       s64 ret = v;
+
+       if (BITS_PER_LONG == 32) {
+               s64 rixi = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+
+               /*
+                * KVM API exposes 64-bit version of the register, so move the
+                * RI/XI bits up into place.
+                */
+               ret = (ret & ~rixi) | (((s64)v & rixi) << 32);
+       }
+       return ret;
+}
+
+/*
+ * entrylo_user_to_kvm() - Convert a ONE_REG EntryLo value to hardware form.
+ * @v:         64-bit EntryLo layout as used by the KVM API.
+ *
+ * Return: the native EntryLo register value; on 32-bit kernels the RI/XI
+ * bits are relocated down from their 64-bit positions.
+ */
+static inline unsigned long entrylo_user_to_kvm(s64 v)
+{
+       unsigned long mask, ret = v;
+
+       if (BITS_PER_LONG == 32) {
+               /*
+                * KVM API exposes 64-bit version of the register, so move the
+                * RI/XI bits down into place.
+                */
+               mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI;
+               ret &= ~mask;
+               ret |= (v >> 32) & mask;
+       }
+       return ret;
+}
+
+static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu,
+                             const struct kvm_one_reg *reg,
+                             s64 *v)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned int idx;
+
+       switch (reg->id) {
+       case KVM_REG_MIPS_CP0_INDEX:
+               *v = (long)read_gc0_index();
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYLO0:
+               *v = entrylo_kvm_to_user(read_gc0_entrylo0());
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYLO1:
+               *v = entrylo_kvm_to_user(read_gc0_entrylo1());
+               break;
+       case KVM_REG_MIPS_CP0_CONTEXT:
+               *v = (long)read_gc0_context();
+               break;
+       case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+               if (!cpu_guest_has_contextconfig)
+                       return -EINVAL;
+               *v = read_gc0_contextconfig();
+               break;
+       case KVM_REG_MIPS_CP0_USERLOCAL:
+               if (!cpu_guest_has_userlocal)
+                       return -EINVAL;
+               *v = read_gc0_userlocal();
+               break;
+#ifdef CONFIG_64BIT
+       case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+               if (!cpu_guest_has_contextconfig)
+                       return -EINVAL;
+               *v = read_gc0_xcontextconfig();
+               break;
+#endif
+       case KVM_REG_MIPS_CP0_PAGEMASK:
+               *v = (long)read_gc0_pagemask();
+               break;
+       case KVM_REG_MIPS_CP0_PAGEGRAIN:
+               *v = (long)read_gc0_pagegrain();
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL0:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               *v = read_gc0_segctl0();
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL1:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               *v = read_gc0_segctl1();
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL2:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               *v = read_gc0_segctl2();
+               break;
+       case KVM_REG_MIPS_CP0_PWBASE:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               *v = read_gc0_pwbase();
+               break;
+       case KVM_REG_MIPS_CP0_PWFIELD:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               *v = read_gc0_pwfield();
+               break;
+       case KVM_REG_MIPS_CP0_PWSIZE:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               *v = read_gc0_pwsize();
+               break;
+       case KVM_REG_MIPS_CP0_WIRED:
+               *v = (long)read_gc0_wired();
+               break;
+       case KVM_REG_MIPS_CP0_PWCTL:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               *v = read_gc0_pwctl();
+               break;
+       case KVM_REG_MIPS_CP0_HWRENA:
+               *v = (long)read_gc0_hwrena();
+               break;
+       case KVM_REG_MIPS_CP0_BADVADDR:
+               *v = (long)read_gc0_badvaddr();
+               break;
+       case KVM_REG_MIPS_CP0_BADINSTR:
+               if (!cpu_guest_has_badinstr)
+                       return -EINVAL;
+               *v = read_gc0_badinstr();
+               break;
+       case KVM_REG_MIPS_CP0_BADINSTRP:
+               if (!cpu_guest_has_badinstrp)
+                       return -EINVAL;
+               *v = read_gc0_badinstrp();
+               break;
+       case KVM_REG_MIPS_CP0_COUNT:
+               *v = kvm_mips_read_count(vcpu);
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYHI:
+               *v = (long)read_gc0_entryhi();
+               break;
+       case KVM_REG_MIPS_CP0_COMPARE:
+               *v = (long)read_gc0_compare();
+               break;
+       case KVM_REG_MIPS_CP0_STATUS:
+               *v = (long)read_gc0_status();
+               break;
+       case KVM_REG_MIPS_CP0_INTCTL:
+               *v = read_gc0_intctl();
+               break;
+       case KVM_REG_MIPS_CP0_CAUSE:
+               *v = (long)read_gc0_cause();
+               break;
+       case KVM_REG_MIPS_CP0_EPC:
+               *v = (long)read_gc0_epc();
+               break;
+       case KVM_REG_MIPS_CP0_PRID:
+               switch (boot_cpu_type()) {
+               case CPU_CAVIUM_OCTEON3:
+                       /* Octeon III has a read-only guest.PRid */
+                       *v = read_gc0_prid();
+                       break;
+               default:
+                       *v = (long)kvm_read_c0_guest_prid(cop0);
+                       break;
+               };
+               break;
+       case KVM_REG_MIPS_CP0_EBASE:
+               *v = kvm_vz_read_gc0_ebase();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG:
+               *v = read_gc0_config();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG1:
+               if (!cpu_guest_has_conf1)
+                       return -EINVAL;
+               *v = read_gc0_config1();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG2:
+               if (!cpu_guest_has_conf2)
+                       return -EINVAL;
+               *v = read_gc0_config2();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG3:
+               if (!cpu_guest_has_conf3)
+                       return -EINVAL;
+               *v = read_gc0_config3();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG4:
+               if (!cpu_guest_has_conf4)
+                       return -EINVAL;
+               *v = read_gc0_config4();
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG5:
+               if (!cpu_guest_has_conf5)
+                       return -EINVAL;
+               *v = read_gc0_config5();
+               break;
+       case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+               if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+               if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+                       return -EINVAL;
+               *v = vcpu->arch.maar[idx];
+               break;
+       case KVM_REG_MIPS_CP0_MAARI:
+               if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+                       return -EINVAL;
+               *v = kvm_read_sw_gc0_maari(vcpu->arch.cop0);
+               break;
+#ifdef CONFIG_64BIT
+       case KVM_REG_MIPS_CP0_XCONTEXT:
+               *v = read_gc0_xcontext();
+               break;
+#endif
+       case KVM_REG_MIPS_CP0_ERROREPC:
+               *v = (long)read_gc0_errorepc();
+               break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!cpu_guest_has_kscr(idx))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       *v = (long)read_gc0_kscratch1();
+                       break;
+               case 3:
+                       *v = (long)read_gc0_kscratch2();
+                       break;
+               case 4:
+                       *v = (long)read_gc0_kscratch3();
+                       break;
+               case 5:
+                       *v = (long)read_gc0_kscratch4();
+                       break;
+               case 6:
+                       *v = (long)read_gc0_kscratch5();
+                       break;
+               case 7:
+                       *v = (long)read_gc0_kscratch6();
+                       break;
+               }
+               break;
+       case KVM_REG_MIPS_COUNT_CTL:
+               *v = vcpu->arch.count_ctl;
+               break;
+       case KVM_REG_MIPS_COUNT_RESUME:
+               *v = ktime_to_ns(vcpu->arch.count_resume);
+               break;
+       case KVM_REG_MIPS_COUNT_HZ:
+               *v = vcpu->arch.count_hz;
+               break;
+       default:
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu,
+                             const struct kvm_one_reg *reg,
+                             s64 v)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned int idx;
+       int ret = 0;
+       unsigned int cur, change;
+
+       switch (reg->id) {
+       case KVM_REG_MIPS_CP0_INDEX:
+               write_gc0_index(v);
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYLO0:
+               write_gc0_entrylo0(entrylo_user_to_kvm(v));
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYLO1:
+               write_gc0_entrylo1(entrylo_user_to_kvm(v));
+               break;
+       case KVM_REG_MIPS_CP0_CONTEXT:
+               write_gc0_context(v);
+               break;
+       case KVM_REG_MIPS_CP0_CONTEXTCONFIG:
+               if (!cpu_guest_has_contextconfig)
+                       return -EINVAL;
+               write_gc0_contextconfig(v);
+               break;
+       case KVM_REG_MIPS_CP0_USERLOCAL:
+               if (!cpu_guest_has_userlocal)
+                       return -EINVAL;
+               write_gc0_userlocal(v);
+               break;
+#ifdef CONFIG_64BIT
+       case KVM_REG_MIPS_CP0_XCONTEXTCONFIG:
+               if (!cpu_guest_has_contextconfig)
+                       return -EINVAL;
+               write_gc0_xcontextconfig(v);
+               break;
+#endif
+       case KVM_REG_MIPS_CP0_PAGEMASK:
+               write_gc0_pagemask(v);
+               break;
+       case KVM_REG_MIPS_CP0_PAGEGRAIN:
+               write_gc0_pagegrain(v);
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL0:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               write_gc0_segctl0(v);
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL1:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               write_gc0_segctl1(v);
+               break;
+       case KVM_REG_MIPS_CP0_SEGCTL2:
+               if (!cpu_guest_has_segments)
+                       return -EINVAL;
+               write_gc0_segctl2(v);
+               break;
+       case KVM_REG_MIPS_CP0_PWBASE:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               write_gc0_pwbase(v);
+               break;
+       case KVM_REG_MIPS_CP0_PWFIELD:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               write_gc0_pwfield(v);
+               break;
+       case KVM_REG_MIPS_CP0_PWSIZE:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               write_gc0_pwsize(v);
+               break;
+       case KVM_REG_MIPS_CP0_WIRED:
+               change_gc0_wired(MIPSR6_WIRED_WIRED, v);
+               break;
+       case KVM_REG_MIPS_CP0_PWCTL:
+               if (!cpu_guest_has_htw)
+                       return -EINVAL;
+               write_gc0_pwctl(v);
+               break;
+       case KVM_REG_MIPS_CP0_HWRENA:
+               write_gc0_hwrena(v);
+               break;
+       case KVM_REG_MIPS_CP0_BADVADDR:
+               write_gc0_badvaddr(v);
+               break;
+       case KVM_REG_MIPS_CP0_BADINSTR:
+               if (!cpu_guest_has_badinstr)
+                       return -EINVAL;
+               write_gc0_badinstr(v);
+               break;
+       case KVM_REG_MIPS_CP0_BADINSTRP:
+               if (!cpu_guest_has_badinstrp)
+                       return -EINVAL;
+               write_gc0_badinstrp(v);
+               break;
+       case KVM_REG_MIPS_CP0_COUNT:
+               kvm_mips_write_count(vcpu, v);
+               break;
+       case KVM_REG_MIPS_CP0_ENTRYHI:
+               write_gc0_entryhi(v);
+               break;
+       case KVM_REG_MIPS_CP0_COMPARE:
+               kvm_mips_write_compare(vcpu, v, false);
+               break;
+       case KVM_REG_MIPS_CP0_STATUS:
+               write_gc0_status(v);
+               break;
+       case KVM_REG_MIPS_CP0_INTCTL:
+               write_gc0_intctl(v);
+               break;
+       case KVM_REG_MIPS_CP0_CAUSE:
+               /*
+                * If the timer is stopped or started (DC bit) it must look
+                * atomic with changes to the timer interrupt pending bit (TI).
+                * A timer interrupt should not happen in between.
+                */
+               if ((read_gc0_cause() ^ v) & CAUSEF_DC) {
+                       if (v & CAUSEF_DC) {
+                               /* disable timer first */
+                               kvm_mips_count_disable_cause(vcpu);
+                               change_gc0_cause((u32)~CAUSEF_DC, v);
+                       } else {
+                               /* enable timer last */
+                               change_gc0_cause((u32)~CAUSEF_DC, v);
+                               kvm_mips_count_enable_cause(vcpu);
+                       }
+               } else {
+                       write_gc0_cause(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_EPC:
+               write_gc0_epc(v);
+               break;
+       case KVM_REG_MIPS_CP0_PRID:
+               switch (boot_cpu_type()) {
+               case CPU_CAVIUM_OCTEON3:
+                       /* Octeon III has a guest.PRid, but its read-only */
+                       break;
+               default:
+                       kvm_write_c0_guest_prid(cop0, v);
+                       break;
+               };
+               break;
+       case KVM_REG_MIPS_CP0_EBASE:
+               kvm_vz_write_gc0_ebase(v);
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG:
+               cur = read_gc0_config();
+               change = (cur ^ v) & kvm_vz_config_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG1:
+               if (!cpu_guest_has_conf1)
+                       break;
+               cur = read_gc0_config1();
+               change = (cur ^ v) & kvm_vz_config1_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config1(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG2:
+               if (!cpu_guest_has_conf2)
+                       break;
+               cur = read_gc0_config2();
+               change = (cur ^ v) & kvm_vz_config2_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config2(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG3:
+               if (!cpu_guest_has_conf3)
+                       break;
+               cur = read_gc0_config3();
+               change = (cur ^ v) & kvm_vz_config3_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config3(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG4:
+               if (!cpu_guest_has_conf4)
+                       break;
+               cur = read_gc0_config4();
+               change = (cur ^ v) & kvm_vz_config4_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config4(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_CONFIG5:
+               if (!cpu_guest_has_conf5)
+                       break;
+               cur = read_gc0_config5();
+               change = (cur ^ v) & kvm_vz_config5_user_wrmask(vcpu);
+               if (change) {
+                       v = cur ^ change;
+                       write_gc0_config5(v);
+               }
+               break;
+       case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f):
+               if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+                       return -EINVAL;
+               idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0);
+               if (idx >= ARRAY_SIZE(vcpu->arch.maar))
+                       return -EINVAL;
+               vcpu->arch.maar[idx] = mips_process_maar(dmtc_op, v);
+               break;
+       case KVM_REG_MIPS_CP0_MAARI:
+               if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar)
+                       return -EINVAL;
+               kvm_write_maari(vcpu, v);
+               break;
+#ifdef CONFIG_64BIT
+       case KVM_REG_MIPS_CP0_XCONTEXT:
+               write_gc0_xcontext(v);
+               break;
+#endif
+       case KVM_REG_MIPS_CP0_ERROREPC:
+               write_gc0_errorepc(v);
+               break;
+       case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+               idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+               if (!cpu_guest_has_kscr(idx))
+                       return -EINVAL;
+               switch (idx) {
+               case 2:
+                       write_gc0_kscratch1(v);
+                       break;
+               case 3:
+                       write_gc0_kscratch2(v);
+                       break;
+               case 4:
+                       write_gc0_kscratch3(v);
+                       break;
+               case 5:
+                       write_gc0_kscratch4(v);
+                       break;
+               case 6:
+                       write_gc0_kscratch5(v);
+                       break;
+               case 7:
+                       write_gc0_kscratch6(v);
+                       break;
+               }
+               break;
+       case KVM_REG_MIPS_COUNT_CTL:
+               ret = kvm_mips_set_count_ctl(vcpu, v);
+               break;
+       case KVM_REG_MIPS_COUNT_RESUME:
+               ret = kvm_mips_set_count_resume(vcpu, v);
+               break;
+       case KVM_REG_MIPS_COUNT_HZ:
+               ret = kvm_mips_set_count_hz(vcpu, v);
+               break;
+       default:
+               return -EINVAL;
+       }
+       return ret;
+}
+
+#define guestid_cache(cpu)     (cpu_data[cpu].guestid_cache)
+/*
+ * Allocate the next GuestID on this physical CPU. When the GuestID counter
+ * wraps within GUESTID_MASK, a new "version" cycle begins: all stale guest
+ * mappings on this CPU must be flushed before any old GuestID can be reused.
+ */
+static void kvm_vz_get_new_guestid(unsigned long cpu, struct kvm_vcpu *vcpu)
+{
+       unsigned long guestid = guestid_cache(cpu);
+
+       /* Did incrementing wrap the GuestID field to zero? */
+       if (!(++guestid & GUESTID_MASK)) {
+               /* Virtually tagged icaches may hold stale guest lines */
+               if (cpu_has_vtag_icache)
+                       flush_icache_all();
+
+               if (!guestid)           /* fix version if needed */
+                       guestid = GUESTID_FIRST_VERSION;
+
+               ++guestid;              /* guestid 0 reserved for root */
+
+               /* start new guestid cycle */
+               kvm_vz_local_flush_roottlb_all_guests();
+               kvm_vz_local_flush_guesttlb_all();
+       }
+
+       guestid_cache(cpu) = guestid;
+}
+
+/* Returns 1 if the guest TLB may be clobbered */
+/*
+ * Process pending KVM requests for @vcpu before (re)entering the guest on
+ * @cpu. Currently only KVM_REQ_TLB_FLUSH is handled: with GuestIDs, all of
+ * this VCPU's per-CPU GuestIDs are invalidated so fresh ones get allocated
+ * (which also clobbers guest TLB contents, hence the return value).
+ */
+static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu)
+{
+       int ret = 0;
+       int i;
+
+       /* Fast path: no requests pending at all */
+       if (!vcpu->requests)
+               return 0;
+
+       if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+               if (cpu_has_guestid) {
+                       /* Drop all GuestIDs for this VCPU */
+                       for_each_possible_cpu(i)
+                               vcpu->arch.vzguestid[i] = 0;
+                       /* This will clobber guest TLB contents too */
+                       ret = 1;
+               }
+               /*
+                * For Root ASID Dealias (RAD) we don't do anything here, but we
+                * still need the request to ensure we recheck asid_flush_mask.
+                * We can still return 0 as only the root TLB will be affected
+                * by a root ASID flush.
+                */
+       }
+
+       return ret;
+}
+
+/*
+ * Save the guest's wired TLB entries into vcpu->arch.wired_tlb so they can be
+ * restored on the next entry, growing the backing array on demand. Entries
+ * that were wired last time but aren't any more are overwritten with unique
+ * invalid EntryHi values so they can't match on reload.
+ */
+static void kvm_vz_vcpu_save_wired(struct kvm_vcpu *vcpu)
+{
+       unsigned int wired = read_gc0_wired();
+       struct kvm_mips_tlb *tlbs;
+       int i;
+
+       /* Expand the wired TLB array if necessary */
+       wired &= MIPSR6_WIRED_WIRED;
+       if (wired > vcpu->arch.wired_tlb_limit) {
+               /* GFP_ATOMIC: may be called with preemption disabled */
+               tlbs = krealloc(vcpu->arch.wired_tlb, wired *
+                               sizeof(*vcpu->arch.wired_tlb), GFP_ATOMIC);
+               if (WARN_ON(!tlbs)) {
+                       /* Save whatever we can */
+                       wired = vcpu->arch.wired_tlb_limit;
+               } else {
+                       vcpu->arch.wired_tlb = tlbs;
+                       vcpu->arch.wired_tlb_limit = wired;
+               }
+       }
+
+       if (wired)
+               /* Save wired entries from the guest TLB */
+               kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0, wired);
+       /* Invalidate any dropped entries since last time */
+       for (i = wired; i < vcpu->arch.wired_tlb_used; ++i) {
+               vcpu->arch.wired_tlb[i].tlb_hi = UNIQUE_GUEST_ENTRYHI(i);
+               vcpu->arch.wired_tlb[i].tlb_lo[0] = 0;
+               vcpu->arch.wired_tlb[i].tlb_lo[1] = 0;
+               vcpu->arch.wired_tlb[i].tlb_mask = 0;
+       }
+       vcpu->arch.wired_tlb_used = wired;
+}
+
+/*
+ * Restore the wired guest TLB entries previously captured by
+ * kvm_vz_vcpu_save_wired(). A NULL array means none were ever saved.
+ */
+static void kvm_vz_vcpu_load_wired(struct kvm_vcpu *vcpu)
+{
+       /* Load wired entries into the guest TLB */
+       if (vcpu->arch.wired_tlb)
+               kvm_vz_load_guesttlb(vcpu->arch.wired_tlb, 0,
+                                    vcpu->arch.wired_tlb_used);
+}
+
+/*
+ * Prepare guest TLB state for entering guest context on @cpu: refresh this
+ * VCPU's GuestID (or flush the shared guest TLB when GuestIDs aren't
+ * implemented) and renew the root ASID used for GPA mappings when needed.
+ */
+static void kvm_vz_vcpu_load_tlb(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct kvm *kvm = vcpu->kvm;
+       struct mm_struct *gpa_mm = &kvm->arch.gpa_mm;
+       bool migrated;
+
+       /*
+        * Are we entering guest context on a different CPU to last time?
+        * If so, the VCPU's guest TLB state on this CPU may be stale.
+        */
+       migrated = (vcpu->arch.last_exec_cpu != cpu);
+       vcpu->arch.last_exec_cpu = cpu;
+
+       /*
+        * A vcpu's GuestID is set in GuestCtl1.ID when the vcpu is loaded and
+        * remains set until another vcpu is loaded in.  As a rule GuestRID
+        * remains zeroed when in root context unless the kernel is busy
+        * manipulating guest tlb entries.
+        */
+       if (cpu_has_guestid) {
+               /*
+                * Check if our GuestID is of an older version and thus invalid.
+                *
+                * We also discard the stored GuestID if we've executed on
+                * another CPU, as the guest mappings may have changed without
+                * hypervisor knowledge.
+                */
+               if (migrated ||
+                   (vcpu->arch.vzguestid[cpu] ^ guestid_cache(cpu)) &
+                                       GUESTID_VERSION_MASK) {
+                       kvm_vz_get_new_guestid(cpu, vcpu);
+                       vcpu->arch.vzguestid[cpu] = guestid_cache(cpu);
+                       trace_kvm_guestid_change(vcpu,
+                                                vcpu->arch.vzguestid[cpu]);
+               }
+
+               /* Restore GuestID */
+               change_c0_guestctl1(GUESTID_MASK, vcpu->arch.vzguestid[cpu]);
+       } else {
+               /*
+                * The Guest TLB only stores a single guest's TLB state, so
+                * flush it if another VCPU has executed on this CPU.
+                *
+                * We also flush if we've executed on another CPU, as the guest
+                * mappings may have changed without hypervisor knowledge.
+                */
+               if (migrated || last_exec_vcpu[cpu] != vcpu)
+                       kvm_vz_local_flush_guesttlb_all();
+               last_exec_vcpu[cpu] = vcpu;
+
+               /*
+                * Root ASID dealiases guest GPA mappings in the root TLB.
+                * Allocate new root ASID if needed.
+                */
+               if (cpumask_test_and_clear_cpu(cpu, &kvm->arch.asid_flush_mask)
+                   || (cpu_context(cpu, gpa_mm) ^ asid_cache(cpu)) &
+                                               asid_version_mask(cpu))
+                       get_new_mmu_context(gpa_mm, cpu);
+       }
+}
+
+/*
+ * vcpu_load callback: restore guest CP0 context onto the hardware when this
+ * VCPU is scheduled in on @cpu. The full register restore is skipped when
+ * this VCPU was also the last one to run here (nothing was clobbered).
+ */
+static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       bool migrated, all;
+
+       /*
+        * Have we migrated to a different CPU?
+        * If so, any old guest TLB state may be stale.
+        */
+       migrated = (vcpu->arch.last_sched_cpu != cpu);
+
+       /*
+        * Was this the last VCPU to run on this CPU?
+        * If not, any old guest state from this VCPU will have been clobbered.
+        */
+       all = migrated || (last_vcpu[cpu] != vcpu);
+       last_vcpu[cpu] = vcpu;
+
+       /*
+        * Restore CP0_Wired unconditionally as we clear it after use, and
+        * restore wired guest TLB entries (while in guest context).
+        */
+       kvm_restore_gc0_wired(cop0);
+       if (current->flags & PF_VCPU) {
+               tlbw_use_hazard();
+               kvm_vz_vcpu_load_tlb(vcpu, cpu);
+               kvm_vz_vcpu_load_wired(vcpu);
+       }
+
+       /*
+        * Restore timer state regardless, as e.g. Cause.TI can change over time
+        * if left unmaintained.
+        */
+       kvm_vz_restore_timer(vcpu);
+
+       /* Set MC bit if we want to trace guest mode changes */
+       if (kvm_trace_guest_mode_change)
+               set_c0_guestctl0(MIPS_GCTL0_MC);
+       else
+               clear_c0_guestctl0(MIPS_GCTL0_MC);
+
+       /* Don't bother restoring registers multiple times unless necessary */
+       if (!all)
+               return 0;
+
+       /*
+        * Restore config registers first, as some implementations restrict
+        * writes to other registers when the corresponding feature bits aren't
+        * set. For example Status.CU1 cannot be set unless Config1.FP is set.
+        */
+       kvm_restore_gc0_config(cop0);
+       if (cpu_guest_has_conf1)
+               kvm_restore_gc0_config1(cop0);
+       if (cpu_guest_has_conf2)
+               kvm_restore_gc0_config2(cop0);
+       if (cpu_guest_has_conf3)
+               kvm_restore_gc0_config3(cop0);
+       if (cpu_guest_has_conf4)
+               kvm_restore_gc0_config4(cop0);
+       if (cpu_guest_has_conf5)
+               kvm_restore_gc0_config5(cop0);
+       if (cpu_guest_has_conf6)
+               kvm_restore_gc0_config6(cop0);
+       if (cpu_guest_has_conf7)
+               kvm_restore_gc0_config7(cop0);
+
+       kvm_restore_gc0_index(cop0);
+       kvm_restore_gc0_entrylo0(cop0);
+       kvm_restore_gc0_entrylo1(cop0);
+       kvm_restore_gc0_context(cop0);
+       if (cpu_guest_has_contextconfig)
+               kvm_restore_gc0_contextconfig(cop0);
+#ifdef CONFIG_64BIT
+       kvm_restore_gc0_xcontext(cop0);
+       if (cpu_guest_has_contextconfig)
+               kvm_restore_gc0_xcontextconfig(cop0);
+#endif
+       kvm_restore_gc0_pagemask(cop0);
+       kvm_restore_gc0_pagegrain(cop0);
+       kvm_restore_gc0_hwrena(cop0);
+       kvm_restore_gc0_badvaddr(cop0);
+       kvm_restore_gc0_entryhi(cop0);
+       kvm_restore_gc0_status(cop0);
+       kvm_restore_gc0_intctl(cop0);
+       kvm_restore_gc0_epc(cop0);
+       /* EBase is tracked in software; write it through the VZ accessor */
+       kvm_vz_write_gc0_ebase(kvm_read_sw_gc0_ebase(cop0));
+       if (cpu_guest_has_userlocal)
+               kvm_restore_gc0_userlocal(cop0);
+
+       kvm_restore_gc0_errorepc(cop0);
+
+       /* restore KScratch registers if enabled in guest */
+       if (cpu_guest_has_conf4) {
+               if (cpu_guest_has_kscr(2))
+                       kvm_restore_gc0_kscratch1(cop0);
+               if (cpu_guest_has_kscr(3))
+                       kvm_restore_gc0_kscratch2(cop0);
+               if (cpu_guest_has_kscr(4))
+                       kvm_restore_gc0_kscratch3(cop0);
+               if (cpu_guest_has_kscr(5))
+                       kvm_restore_gc0_kscratch4(cop0);
+               if (cpu_guest_has_kscr(6))
+                       kvm_restore_gc0_kscratch5(cop0);
+               if (cpu_guest_has_kscr(7))
+                       kvm_restore_gc0_kscratch6(cop0);
+       }
+
+       if (cpu_guest_has_badinstr)
+               kvm_restore_gc0_badinstr(cop0);
+       if (cpu_guest_has_badinstrp)
+               kvm_restore_gc0_badinstrp(cop0);
+
+       if (cpu_guest_has_segments) {
+               kvm_restore_gc0_segctl0(cop0);
+               kvm_restore_gc0_segctl1(cop0);
+               kvm_restore_gc0_segctl2(cop0);
+       }
+
+       /* restore HTW registers */
+       if (cpu_guest_has_htw) {
+               kvm_restore_gc0_pwbase(cop0);
+               kvm_restore_gc0_pwfield(cop0);
+               kvm_restore_gc0_pwsize(cop0);
+               kvm_restore_gc0_pwctl(cop0);
+       }
+
+       /* restore Root.GuestCtl2 from unused Guest guestctl2 register */
+       if (cpu_has_guestctl2)
+               write_c0_guestctl2(
+                       cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]);
+
+       /*
+        * We should clear linked load bit to break interrupted atomics. This
+        * prevents a SC on the next VCPU from succeeding by matching a LL on
+        * the previous VCPU.
+        */
+       if (cpu_guest_has_rw_llb)
+               write_gc0_lladdr(0);
+
+       return 0;
+}
+
+/*
+ * vcpu_put callback: save guest CP0 context from the hardware when this VCPU
+ * is scheduled out. Mirrors kvm_vz_vcpu_load(); only implemented registers
+ * are saved, and CP0_Wired is cleared so root may reuse wired TLB slots.
+ */
+static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+       if (current->flags & PF_VCPU)
+               kvm_vz_vcpu_save_wired(vcpu);
+
+       kvm_lose_fpu(vcpu);
+
+       kvm_save_gc0_index(cop0);
+       kvm_save_gc0_entrylo0(cop0);
+       kvm_save_gc0_entrylo1(cop0);
+       kvm_save_gc0_context(cop0);
+       if (cpu_guest_has_contextconfig)
+               kvm_save_gc0_contextconfig(cop0);
+#ifdef CONFIG_64BIT
+       kvm_save_gc0_xcontext(cop0);
+       if (cpu_guest_has_contextconfig)
+               kvm_save_gc0_xcontextconfig(cop0);
+#endif
+       kvm_save_gc0_pagemask(cop0);
+       kvm_save_gc0_pagegrain(cop0);
+       kvm_save_gc0_wired(cop0);
+       /* allow wired TLB entries to be overwritten */
+       clear_gc0_wired(MIPSR6_WIRED_WIRED);
+       kvm_save_gc0_hwrena(cop0);
+       kvm_save_gc0_badvaddr(cop0);
+       kvm_save_gc0_entryhi(cop0);
+       kvm_save_gc0_status(cop0);
+       kvm_save_gc0_intctl(cop0);
+       kvm_save_gc0_epc(cop0);
+       /* EBase is shadowed in software state */
+       kvm_write_sw_gc0_ebase(cop0, kvm_vz_read_gc0_ebase());
+       if (cpu_guest_has_userlocal)
+               kvm_save_gc0_userlocal(cop0);
+
+       /* only save implemented config registers */
+       kvm_save_gc0_config(cop0);
+       if (cpu_guest_has_conf1)
+               kvm_save_gc0_config1(cop0);
+       if (cpu_guest_has_conf2)
+               kvm_save_gc0_config2(cop0);
+       if (cpu_guest_has_conf3)
+               kvm_save_gc0_config3(cop0);
+       if (cpu_guest_has_conf4)
+               kvm_save_gc0_config4(cop0);
+       if (cpu_guest_has_conf5)
+               kvm_save_gc0_config5(cop0);
+       if (cpu_guest_has_conf6)
+               kvm_save_gc0_config6(cop0);
+       if (cpu_guest_has_conf7)
+               kvm_save_gc0_config7(cop0);
+
+       kvm_save_gc0_errorepc(cop0);
+
+       /* save KScratch registers if enabled in guest */
+       if (cpu_guest_has_conf4) {
+               if (cpu_guest_has_kscr(2))
+                       kvm_save_gc0_kscratch1(cop0);
+               if (cpu_guest_has_kscr(3))
+                       kvm_save_gc0_kscratch2(cop0);
+               if (cpu_guest_has_kscr(4))
+                       kvm_save_gc0_kscratch3(cop0);
+               if (cpu_guest_has_kscr(5))
+                       kvm_save_gc0_kscratch4(cop0);
+               if (cpu_guest_has_kscr(6))
+                       kvm_save_gc0_kscratch5(cop0);
+               if (cpu_guest_has_kscr(7))
+                       kvm_save_gc0_kscratch6(cop0);
+       }
+
+       if (cpu_guest_has_badinstr)
+               kvm_save_gc0_badinstr(cop0);
+       if (cpu_guest_has_badinstrp)
+               kvm_save_gc0_badinstrp(cop0);
+
+       if (cpu_guest_has_segments) {
+               kvm_save_gc0_segctl0(cop0);
+               kvm_save_gc0_segctl1(cop0);
+               kvm_save_gc0_segctl2(cop0);
+       }
+
+       /* save HTW registers if enabled in guest */
+       if (cpu_guest_has_htw &&
+           kvm_read_sw_gc0_config3(cop0) & MIPS_CONF3_PW) {
+               kvm_save_gc0_pwbase(cop0);
+               kvm_save_gc0_pwfield(cop0);
+               kvm_save_gc0_pwsize(cop0);
+               kvm_save_gc0_pwctl(cop0);
+       }
+
+       kvm_vz_save_timer(vcpu);
+
+       /* save Root.GuestCtl2 in unused Guest guestctl2 register */
+       if (cpu_has_guestctl2)
+               cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] =
+                       read_c0_guestctl2();
+
+       return 0;
+}
+
+/**
+ * kvm_vz_resize_guest_vtlb() - Attempt to resize guest VTLB.
+ * @size:      Number of guest VTLB entries (0 < @size <= root VTLB entries).
+ *
+ * Attempt to resize the guest VTLB by writing guest Config registers. This is
+ * necessary for cores with a shared root/guest TLB to avoid overlap with wired
+ * entries in the root VTLB.
+ *
+ * Returns:    The resulting guest VTLB size.
+ */
+static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size)
+{
+       unsigned int config4 = 0, ret = 0, limit;
+
+       /* Write MMUSize - 1 into guest Config registers */
+       if (cpu_guest_has_conf1)
+               change_gc0_config1(MIPS_CONF1_TLBS,
+                                  (size - 1) << MIPS_CONF1_TLBS_SHIFT);
+       if (cpu_guest_has_conf4) {
+               config4 = read_gc0_config4();
+               /* High bits of MMUSize-1 go in whichever extension field the
+                * core implements (VTLBSizeExt on r6, else per MMUExtDef) */
+               if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+                   MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) {
+                       config4 &= ~MIPS_CONF4_VTLBSIZEEXT;
+                       config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+                               MIPS_CONF4_VTLBSIZEEXT_SHIFT;
+               } else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+                          MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) {
+                       config4 &= ~MIPS_CONF4_MMUSIZEEXT;
+                       config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) <<
+                               MIPS_CONF4_MMUSIZEEXT_SHIFT;
+               }
+               write_gc0_config4(config4);
+       }
+
+       /*
+        * Set Guest.Wired.Limit = 0 (no limit up to Guest.MMUSize-1), unless it
+        * would exceed Root.Wired.Limit (clearing Guest.Wired.Wired so write
+        * not dropped)
+        */
+       if (cpu_has_mips_r6) {
+               limit = (read_c0_wired() & MIPSR6_WIRED_LIMIT) >>
+                                               MIPSR6_WIRED_LIMIT_SHIFT;
+               if (size - 1 <= limit)
+                       limit = 0;
+               write_gc0_wired(limit << MIPSR6_WIRED_LIMIT_SHIFT);
+       }
+
+       /* Read back MMUSize - 1 */
+       back_to_back_c0_hazard();
+       if (cpu_guest_has_conf1)
+               ret = (read_gc0_config1() & MIPS_CONF1_TLBS) >>
+                                               MIPS_CONF1_TLBS_SHIFT;
+       /* Recombine the extension bits that were written above */
+       if (config4) {
+               if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) ==
+                   MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT)
+                       ret |= ((config4 & MIPS_CONF4_VTLBSIZEEXT) >>
+                               MIPS_CONF4_VTLBSIZEEXT_SHIFT) <<
+                               MIPS_CONF1_TLBS_SIZE;
+               else if ((config4 & MIPS_CONF4_MMUEXTDEF) ==
+                        MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT)
+                       ret |= ((config4 & MIPS_CONF4_MMUSIZEEXT) >>
+                               MIPS_CONF4_MMUSIZEEXT_SHIFT) <<
+                               MIPS_CONF1_TLBS_SIZE;
+       }
+       return ret + 1;
+}
+
+/*
+ * Per-CPU VZ hardware setup: partition the TLB between root and guest
+ * (Octeon III has explicit partitioning; other cores resize the shared guest
+ * VTLB), configure GuestCtl0/0Ext virtualization features, and initialize
+ * the GuestID allocator when GuestIDs are implemented.
+ *
+ * Returns 0 on success, -EINVAL on inconsistent guest VTLB sizing across
+ * CPUs.
+ */
+static int kvm_vz_hardware_enable(void)
+{
+       unsigned int mmu_size, guest_mmu_size, ftlb_size;
+       u64 guest_cvmctl, cvmvmconfig;
+
+       switch (current_cpu_type()) {
+       case CPU_CAVIUM_OCTEON3:
+               /* Set up guest timer/perfcount IRQ lines */
+               guest_cvmctl = read_gc0_cvmctl();
+               guest_cvmctl &= ~CVMCTL_IPTI;
+               guest_cvmctl |= 7ull << CVMCTL_IPTI_SHIFT;
+               guest_cvmctl &= ~CVMCTL_IPPCI;
+               guest_cvmctl |= 6ull << CVMCTL_IPPCI_SHIFT;
+               write_gc0_cvmctl(guest_cvmctl);
+
+               cvmvmconfig = read_c0_cvmvmconfig();
+               /* No I/O hole translation. */
+               cvmvmconfig |= CVMVMCONF_DGHT;
+               /* Halve the root MMU size */
+               mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1)
+                           >> CVMVMCONF_MMUSIZEM1_S) + 1;
+               guest_mmu_size = mmu_size / 2;
+               mmu_size -= guest_mmu_size;
+               cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1;
+               cvmvmconfig |= mmu_size - 1;
+               write_c0_cvmvmconfig(cvmvmconfig);
+
+               /* Update our records */
+               current_cpu_data.tlbsize = mmu_size;
+               current_cpu_data.tlbsizevtlb = mmu_size;
+               current_cpu_data.guest.tlbsize = guest_mmu_size;
+
+               /* Flush moved entries in new (guest) context */
+               kvm_vz_local_flush_guesttlb_all();
+               break;
+       default:
+               /*
+                * ImgTec cores tend to use a shared root/guest TLB. To avoid
+                * overlap of root wired and guest entries, the guest TLB may
+                * need resizing.
+                */
+               mmu_size = current_cpu_data.tlbsizevtlb;
+               ftlb_size = current_cpu_data.tlbsize - mmu_size;
+
+               /* Try switching to maximum guest VTLB size for flush */
+               guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size);
+               current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+               kvm_vz_local_flush_guesttlb_all();
+
+               /*
+                * Reduce to make space for root wired entries and at least 2
+                * root non-wired entries. This does assume that long-term wired
+                * entries won't be added later.
+                */
+               guest_mmu_size = mmu_size - num_wired_entries() - 2;
+               guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size);
+               current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size;
+
+               /*
+                * Write the VTLB size, but if another CPU has already written,
+                * check it matches or we won't provide a consistent view to the
+                * guest. If this ever happens it suggests an asymmetric number
+                * of wired entries.
+                */
+               if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) &&
+                   WARN(guest_mmu_size != kvm_vz_guest_vtlb_size,
+                        "Available guest VTLB size mismatch"))
+                       return -EINVAL;
+               break;
+       }
+
+       /*
+        * Enable virtualization features granting guest direct control of
+        * certain features:
+        * CP0=1:       Guest coprocessor 0 context.
+        * AT=Guest:    Guest MMU.
+        * CG=1:        Hit (virtual address) CACHE operations (optional).
+        * CF=1:        Guest Config registers.
+        * CGI=1:       Indexed flush CACHE operations (optional).
+        */
+       write_c0_guestctl0(MIPS_GCTL0_CP0 |
+                          (MIPS_GCTL0_AT_GUEST << MIPS_GCTL0_AT_SHIFT) |
+                          MIPS_GCTL0_CG | MIPS_GCTL0_CF);
+       if (cpu_has_guestctl0ext)
+               set_c0_guestctl0ext(MIPS_GCTL0EXT_CGI);
+
+       if (cpu_has_guestid) {
+               write_c0_guestctl1(0);
+               kvm_vz_local_flush_roottlb_all_guests();
+
+               GUESTID_MASK = current_cpu_data.guestid_mask;
+               GUESTID_FIRST_VERSION = GUESTID_MASK + 1;
+               GUESTID_VERSION_MASK = ~GUESTID_MASK;
+
+               current_cpu_data.guestid_cache = GUESTID_FIRST_VERSION;
+       }
+
+       /* clear any pending injected virtual guest interrupts */
+       /* NOTE(review): 0x3f << 10 looks like the GuestCtl2.VIP field mask —
+        * a named constant would be clearer; confirm against the VZ spec */
+       if (cpu_has_guestctl2)
+               clear_c0_guestctl2(0x3f << 10);
+
+       return 0;
+}
+
+/*
+ * Per-CPU VZ hardware teardown: flush remaining guest TLB state and, on
+ * Octeon III, return the whole partitioned TLB to root ownership.
+ */
+static void kvm_vz_hardware_disable(void)
+{
+       u64 cvmvmconfig;
+       unsigned int mmu_size;
+
+       /* Flush any remaining guest TLB entries */
+       kvm_vz_local_flush_guesttlb_all();
+
+       switch (current_cpu_type()) {
+       case CPU_CAVIUM_OCTEON3:
+               /*
+                * Allocate whole TLB for root. Existing guest TLB entries will
+                * change ownership to the root TLB. We should be safe though as
+                * they've already been flushed above while in guest TLB.
+                */
+               cvmvmconfig = read_c0_cvmvmconfig();
+               mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1)
+                           >> CVMVMCONF_MMUSIZEM1_S) + 1;
+               cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1;
+               cvmvmconfig |= mmu_size - 1;
+               write_c0_cvmvmconfig(cvmvmconfig);
+
+               /* Update our records */
+               current_cpu_data.tlbsize = mmu_size;
+               current_cpu_data.tlbsizevtlb = mmu_size;
+               current_cpu_data.guest.tlbsize = 0;
+
+               /* Flush moved entries in new (root) context */
+               local_flush_tlb_all();
+               break;
+       }
+
+       if (cpu_has_guestid) {
+               write_c0_guestctl1(0);
+               kvm_vz_local_flush_roottlb_all_guests();
+       }
+}
+
+static int kvm_vz_check_extension(struct kvm *kvm, long ext)
+{
+       int r;
+
+       switch (ext) {
+       case KVM_CAP_MIPS_VZ:
+               /* we wouldn't be here unless cpu_has_vz */
+               r = 1;
+               break;
+#ifdef CONFIG_64BIT
+       case KVM_CAP_MIPS_64BIT:
+               /* We support 64-bit registers/operations and addresses */
+               r = 2;
+               break;
+#endif
+       default:
+               r = 0;
+               break;
+       }
+
+       return r;
+}
+
+/*
+ * VCPU init callback: start with no GuestID allocated on any CPU, so the
+ * first load on each CPU allocates a fresh one. Always returns 0.
+ */
+static int kvm_vz_vcpu_init(struct kvm_vcpu *vcpu)
+{
+       int i;
+
+       for_each_possible_cpu(i)
+               vcpu->arch.vzguestid[i] = 0;
+
+       return 0;
+}
+
+/*
+ * VCPU uninit callback: scrub per-CPU caches of this VCPU pointer so a
+ * recycled allocation can't be mistaken for the old VCPU.
+ */
+static void kvm_vz_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+       int cpu;
+
+       /*
+        * If the VCPU is freed and reused as another VCPU, we don't want the
+        * matching pointer wrongly hanging around in last_vcpu[] or
+        * last_exec_vcpu[].
+        */
+       for_each_possible_cpu(cpu) {
+               if (last_vcpu[cpu] == vcpu)
+                       last_vcpu[cpu] = NULL;
+               if (last_exec_vcpu[cpu] == vcpu)
+                       last_exec_vcpu[cpu] = NULL;
+       }
+}
+
+/*
+ * VCPU setup callback: initialize the guest timer and the software-visible
+ * guest CP0 register state to architectural reset values, masking out
+ * feature bits the guest must not see or control. Always returns 0.
+ */
+static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+       struct mips_coproc *cop0 = vcpu->arch.cop0;
+       unsigned long count_hz = 100*1000*1000; /* default to 100 MHz */
+
+       /*
+        * Start off the timer at the same frequency as the host timer, but the
+        * soft timer doesn't handle frequencies greater than 1GHz yet.
+        */
+       if (mips_hpt_frequency && mips_hpt_frequency <= NSEC_PER_SEC)
+               count_hz = mips_hpt_frequency;
+       kvm_mips_init_count(vcpu, count_hz);
+
+       /*
+        * Initialize guest register state to valid architectural reset state.
+        */
+
+       /* PageGrain */
+       if (cpu_has_mips_r6)
+               kvm_write_sw_gc0_pagegrain(cop0, PG_RIE | PG_XIE | PG_IEC);
+       /* Wired */
+       if (cpu_has_mips_r6)
+               kvm_write_sw_gc0_wired(cop0,
+                                      read_gc0_wired() & MIPSR6_WIRED_LIMIT);
+       /* Status */
+       kvm_write_sw_gc0_status(cop0, ST0_BEV | ST0_ERL);
+       if (cpu_has_mips_r6)
+               kvm_change_sw_gc0_status(cop0, ST0_FR, read_gc0_status());
+       /* IntCtl */
+       kvm_write_sw_gc0_intctl(cop0, read_gc0_intctl() &
+                               (INTCTLF_IPFDC | INTCTLF_IPPCI | INTCTLF_IPTI));
+       /* PRId */
+       kvm_write_sw_gc0_prid(cop0, boot_cpu_data.processor_id)<
+       /* EBase: exception base at KSEG0, CPUNum = vcpu id */
+       kvm_write_sw_gc0_ebase(cop0, (s32)0x80000000 | vcpu->vcpu_id);
+       /* Config */
+       kvm_save_gc0_config(cop0);
+       /* architecturally writable (e.g. from guest) */
+       kvm_change_sw_gc0_config(cop0, CONF_CM_CMASK,
+                                _page_cachable_default >> _CACHE_SHIFT);
+       /* architecturally read only, but maybe writable from root */
+       kvm_change_sw_gc0_config(cop0, MIPS_CONF_MT, read_c0_config());
+       if (cpu_guest_has_conf1) {
+               kvm_set_sw_gc0_config(cop0, MIPS_CONF_M);
+               /* Config1 */
+               kvm_save_gc0_config1(cop0);
+               /* architecturally read only, but maybe writable from root */
+               kvm_clear_sw_gc0_config1(cop0, MIPS_CONF1_C2    |
+                                              MIPS_CONF1_MD    |
+                                              MIPS_CONF1_PC    |
+                                              MIPS_CONF1_WR    |
+                                              MIPS_CONF1_CA    |
+                                              MIPS_CONF1_FP);
+       }
+       if (cpu_guest_has_conf2) {
+               kvm_set_sw_gc0_config1(cop0, MIPS_CONF_M);
+               /* Config2 */
+               kvm_save_gc0_config2(cop0);
+       }
+       if (cpu_guest_has_conf3) {
+               kvm_set_sw_gc0_config2(cop0, MIPS_CONF_M);
+               /* Config3 */
+               kvm_save_gc0_config3(cop0);
+               /* architecturally writable (e.g. from guest) */
+               kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_ISA_OE);
+               /* architecturally read only, but maybe writable from root */
+               kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_MSA   |
+                                              MIPS_CONF3_BPG   |
+                                              MIPS_CONF3_ULRI  |
+                                              MIPS_CONF3_DSP   |
+                                              MIPS_CONF3_CTXTC |
+                                              MIPS_CONF3_ITL   |
+                                              MIPS_CONF3_LPA   |
+                                              MIPS_CONF3_VEIC  |
+                                              MIPS_CONF3_VINT  |
+                                              MIPS_CONF3_SP    |
+                                              MIPS_CONF3_CDMM  |
+                                              MIPS_CONF3_MT    |
+                                              MIPS_CONF3_SM    |
+                                              MIPS_CONF3_TL);
+       }
+       if (cpu_guest_has_conf4) {
+               kvm_set_sw_gc0_config3(cop0, MIPS_CONF_M);
+               /* Config4 */
+               kvm_save_gc0_config4(cop0);
+       }
+       if (cpu_guest_has_conf5) {
+               kvm_set_sw_gc0_config4(cop0, MIPS_CONF_M);
+               /* Config5 */
+               kvm_save_gc0_config5(cop0);
+               /* architecturally writable (e.g. from guest) */
+               kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_K     |
+                                              MIPS_CONF5_CV    |
+                                              MIPS_CONF5_MSAEN |
+                                              MIPS_CONF5_UFE   |
+                                              MIPS_CONF5_FRE   |
+                                              MIPS_CONF5_SBRI  |
+                                              MIPS_CONF5_UFR);
+               /* architecturally read only, but maybe writable from root */
+               kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP);
+       }
+
+       if (cpu_guest_has_contextconfig) {
+               /* ContextConfig */
+               kvm_write_sw_gc0_contextconfig(cop0, 0x007ffff0);
+#ifdef CONFIG_64BIT
+               /* XContextConfig */
+               /* bits SEGBITS-13+3:4 set */
+               kvm_write_sw_gc0_xcontextconfig(cop0,
+                                       ((1ull << (cpu_vmbits - 13)) - 1) << 4);
+#endif
+       }
+
+       /* Implementation dependent, use the legacy layout */
+       if (cpu_guest_has_segments) {
+               /* SegCtl0, SegCtl1, SegCtl2 */
+               kvm_write_sw_gc0_segctl0(cop0, 0x00200010);
+               kvm_write_sw_gc0_segctl1(cop0, 0x00000002 |
+                               (_page_cachable_default >> _CACHE_SHIFT) <<
+                                               (16 + MIPS_SEGCFG_C_SHIFT));
+               kvm_write_sw_gc0_segctl2(cop0, 0x00380438);
+       }
+
+       /* reset HTW registers */
+       if (cpu_guest_has_htw && cpu_has_mips_r6) {
+               /* PWField */
+               kvm_write_sw_gc0_pwfield(cop0, 0x0c30c302);
+               /* PWSize */
+               kvm_write_sw_gc0_pwsize(cop0, 1 << MIPS_PWSIZE_PTW_SHIFT);
+       }
+
+       /* start with no pending virtual guest interrupts */
+       if (cpu_has_guestctl2)
+               cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0;
+
+       /* Put PC at reset vector */
+       vcpu->arch.pc = CKSEG1ADDR(0x1fc00000);
+
+       return 0;
+}
+
+/*
+ * Invalidate all shadow (GPA) mappings for @kvm on every CPU, either via
+ * per-VCPU GuestID invalidation or via the shared per-CPU GPA ASIDs.
+ */
+static void kvm_vz_flush_shadow_all(struct kvm *kvm)
+{
+       if (cpu_has_guestid) {
+               /* Flush GuestID for each VCPU individually */
+               kvm_flush_remote_tlbs(kvm);
+       } else {
+               /*
+                * For each CPU there is a single GPA ASID used by all VCPUs in
+                * the VM, so it doesn't make sense for the VCPUs to handle
+                * invalidation of these ASIDs individually.
+                *
+                * Instead mark all CPUs as needing ASID invalidation in
+                * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to
+                * kick any running VCPUs so they check asid_flush_mask.
+                */
+               cpumask_setall(&kvm->arch.asid_flush_mask);
+               kvm_flush_remote_tlbs(kvm);
+       }
+}
+
+static void kvm_vz_flush_shadow_memslot(struct kvm *kvm,
+                                       const struct kvm_memory_slot *slot)
+{
+       kvm_vz_flush_shadow_all(kvm);
+}
+
+static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+       int preserve_guest_tlb;
+
+       preserve_guest_tlb = kvm_vz_check_requests(vcpu, cpu);
+
+       if (preserve_guest_tlb)
+               kvm_vz_vcpu_save_wired(vcpu);
+
+       kvm_vz_vcpu_load_tlb(vcpu, cpu);
+
+       if (preserve_guest_tlb)
+               kvm_vz_vcpu_load_wired(vcpu);
+}
+
+static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+       int cpu = smp_processor_id();
+       int r;
+
+       kvm_vz_acquire_htimer(vcpu);
+       /* Check if we have any exceptions/interrupts pending */
+       kvm_mips_deliver_interrupts(vcpu, read_gc0_cause());
+
+       kvm_vz_check_requests(vcpu, cpu);
+       kvm_vz_vcpu_load_tlb(vcpu, cpu);
+       kvm_vz_vcpu_load_wired(vcpu);
+
+       r = vcpu->arch.vcpu_run(run, vcpu);
+
+       kvm_vz_vcpu_save_wired(vcpu);
+
+       return r;
+}
+
+static struct kvm_mips_callbacks kvm_vz_callbacks = {
+       .handle_cop_unusable = kvm_trap_vz_handle_cop_unusable,
+       .handle_tlb_mod = kvm_trap_vz_handle_tlb_st_miss,
+       .handle_tlb_ld_miss = kvm_trap_vz_handle_tlb_ld_miss,
+       .handle_tlb_st_miss = kvm_trap_vz_handle_tlb_st_miss,
+       .handle_addr_err_st = kvm_trap_vz_no_handler,
+       .handle_addr_err_ld = kvm_trap_vz_no_handler,
+       .handle_syscall = kvm_trap_vz_no_handler,
+       .handle_res_inst = kvm_trap_vz_no_handler,
+       .handle_break = kvm_trap_vz_no_handler,
+       .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled,
+       .handle_guest_exit = kvm_trap_vz_handle_guest_exit,
+
+       .hardware_enable = kvm_vz_hardware_enable,
+       .hardware_disable = kvm_vz_hardware_disable,
+       .check_extension = kvm_vz_check_extension,
+       .vcpu_init = kvm_vz_vcpu_init,
+       .vcpu_uninit = kvm_vz_vcpu_uninit,
+       .vcpu_setup = kvm_vz_vcpu_setup,
+       .flush_shadow_all = kvm_vz_flush_shadow_all,
+       .flush_shadow_memslot = kvm_vz_flush_shadow_memslot,
+       .gva_to_gpa = kvm_vz_gva_to_gpa_cb,
+       .queue_timer_int = kvm_vz_queue_timer_int_cb,
+       .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb,
+       .queue_io_int = kvm_vz_queue_io_int_cb,
+       .dequeue_io_int = kvm_vz_dequeue_io_int_cb,
+       .irq_deliver = kvm_vz_irq_deliver_cb,
+       .irq_clear = kvm_vz_irq_clear_cb,
+       .num_regs = kvm_vz_num_regs,
+       .copy_reg_indices = kvm_vz_copy_reg_indices,
+       .get_one_reg = kvm_vz_get_one_reg,
+       .set_one_reg = kvm_vz_set_one_reg,
+       .vcpu_load = kvm_vz_vcpu_load,
+       .vcpu_put = kvm_vz_vcpu_put,
+       .vcpu_run = kvm_vz_vcpu_run,
+       .vcpu_reenter = kvm_vz_vcpu_reenter,
+};
+
+int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
+{
+       if (!cpu_has_vz)
+               return -ENODEV;
+
+       /*
+        * VZ requires at least 2 KScratch registers, so it should have been
+        * possible to allocate pgd_reg.
+        */
+       if (WARN(pgd_reg == -1,
+                "pgd_reg not allocated even though cpu_has_vz\n"))
+               return -ENODEV;
+
+       pr_info("Starting KVM with MIPS VZ extensions\n");
+
+       *install_callbacks = &kvm_vz_callbacks;
+       return 0;
+}
index 6db3413472023dbf586c433c2808b9c62ca67a8f..899e46279902819dcaee11e427411ee11edca9d8 100644 (file)
@@ -24,6 +24,7 @@
 /* Cache operations. */
 void (*flush_cache_all)(void);
 void (*__flush_cache_all)(void);
+EXPORT_SYMBOL_GPL(__flush_cache_all);
 void (*flush_cache_mm)(struct mm_struct *mm);
 void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start,
        unsigned long end);
index aa75849c36bcdd79489e4933d334232985d298c5..3ca20283b31eaf3e9a35a165fe59ed298c5691e2 100644 (file)
@@ -348,7 +348,7 @@ void maar_init(void)
                upper = ((upper & MIPS_MAAR_ADDR) << 4) | 0xffff;
 
                pr_info("  [%d]: ", i / 2);
-               if (!(attr & MIPS_MAAR_V)) {
+               if (!(attr & MIPS_MAAR_VL)) {
                        pr_cont("disabled\n");
                        continue;
                }
index 7bba8f41562705c5eb6c01161979e33ae816ddb9..01d05c76f1c7ddc5da990c5d7eaf312bc77d1017 100644 (file)
@@ -45,9 +45,6 @@
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
-#ifdef CONFIG_KVM_MMIO
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
-#endif
 #define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */
 
 /* These values are internal and can be increased later */
index 4edbe4bb0e8b0cdd2553c74df9988823990833d9..07fbeb927834f3a96278414aedaa59ea580ae8de 100644 (file)
@@ -29,6 +29,9 @@
 #define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_GUEST_DEBUG
 
+/* Not always available, but if it is, this is the correct offset.  */
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
 struct kvm_regs {
        __u64 pc;
        __u64 cr;
index 95c91a9de351c4b8b67f23c299279faa283efef7..0e42aa8a279f323d7728c993251c65b788ef27ae 100644 (file)
@@ -524,11 +524,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                /* We support this only for PR */
                r = !hv_enabled;
                break;
-#ifdef CONFIG_KVM_MMIO
-       case KVM_CAP_COALESCED_MMIO:
-               r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-               break;
-#endif
 #ifdef CONFIG_KVM_MPIC
        case KVM_CAP_IRQ_MPIC:
                r = 1;
index 74ef58c8ff53301ba1cb798aa4f47922812b53be..d962fa998a6fc523e0e9ed5c90aad9d452c6601d 100644 (file)
@@ -43,8 +43,6 @@
 #define KVM_PRIVATE_MEM_SLOTS 3
 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 
-#define KVM_PIO_PAGE_OFFSET 1
-#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
 #define KVM_HALT_POLL_NS_DEFAULT 400000
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
@@ -343,9 +341,10 @@ struct kvm_mmu {
        void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
                           u64 *spte, const void *pte);
        hpa_t root_hpa;
-       int root_level;
-       int shadow_root_level;
        union kvm_mmu_page_role base_role;
+       u8 root_level;
+       u8 shadow_root_level;
+       u8 ept_ad;
        bool direct_map;
 
        /*
index d74747b031ecd2e20dcf437944195a37e7c6bb3b..c4eda791f877b6c67808546ce072da07b9bb8002 100644 (file)
@@ -46,6 +46,7 @@ struct kvm_page_track_notifier_node {
 };
 
 void kvm_page_track_init(struct kvm *kvm);
+void kvm_page_track_cleanup(struct kvm *kvm);
 
 void kvm_page_track_free_memslot(struct kvm_memory_slot *free,
                                 struct kvm_memory_slot *dont);
index cc54b70265674e8149239b667d63a42e3aa4b58d..35cd06f636abc9a357532f67ec1d1b41fbc8aa85 100644 (file)
 #define SECONDARY_EXEC_APIC_REGISTER_VIRT       0x00000100
 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY    0x00000200
 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING      0x00000400
+#define SECONDARY_EXEC_RDRAND                  0x00000800
 #define SECONDARY_EXEC_ENABLE_INVPCID          0x00001000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
+#define SECONDARY_EXEC_RDSEED                  0x00010000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES                  0x00100000
 #define SECONDARY_EXEC_TSC_SCALING              0x02000000
@@ -516,12 +518,14 @@ struct vmx_msr_entry {
 #define EPT_VIOLATION_READABLE_BIT     3
 #define EPT_VIOLATION_WRITABLE_BIT     4
 #define EPT_VIOLATION_EXECUTABLE_BIT   5
+#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
 #define EPT_VIOLATION_ACC_READ         (1 << EPT_VIOLATION_ACC_READ_BIT)
 #define EPT_VIOLATION_ACC_WRITE                (1 << EPT_VIOLATION_ACC_WRITE_BIT)
 #define EPT_VIOLATION_ACC_INSTR                (1 << EPT_VIOLATION_ACC_INSTR_BIT)
 #define EPT_VIOLATION_READABLE         (1 << EPT_VIOLATION_READABLE_BIT)
 #define EPT_VIOLATION_WRITABLE         (1 << EPT_VIOLATION_WRITABLE_BIT)
 #define EPT_VIOLATION_EXECUTABLE       (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_GVA_TRANSLATED   (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
 
 /*
  * VM-instruction error numbers
index 739c0c5940226d7af38d2ab4bc068f7772f1a8c7..c2824d02ba3762553b62a07709058102050e34da 100644 (file)
@@ -9,6 +9,9 @@
 #include <linux/types.h>
 #include <linux/ioctl.h>
 
+#define KVM_PIO_PAGE_OFFSET 1
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+
 #define DE_VECTOR 0
 #define DB_VECTOR 1
 #define BP_VECTOR 3
index 14458658e988bb6c9333e73701ea55e7b6525a92..690a2dcf407860cf54793f03e69d8761b06f068d 100644 (file)
 #define EXIT_REASON_WBINVD              54
 #define EXIT_REASON_XSETBV              55
 #define EXIT_REASON_APIC_WRITE          56
+#define EXIT_REASON_RDRAND              57
 #define EXIT_REASON_INVPCID             58
+#define EXIT_REASON_VMFUNC              59
+#define EXIT_REASON_ENCLS               60
+#define EXIT_REASON_RDSEED              61
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
@@ -90,6 +94,7 @@
        { EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \
        { EXIT_REASON_CPUID,                 "CPUID" }, \
        { EXIT_REASON_HLT,                   "HLT" }, \
+       { EXIT_REASON_INVD,                  "INVD" }, \
        { EXIT_REASON_INVLPG,                "INVLPG" }, \
        { EXIT_REASON_RDPMC,                 "RDPMC" }, \
        { EXIT_REASON_RDTSC,                 "RDTSC" }, \
        { EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \
        { EXIT_REASON_MSR_READ,              "MSR_READ" }, \
        { EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \
+       { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
+       { EXIT_REASON_MSR_LOAD_FAIL,         "MSR_LOAD_FAIL" }, \
        { EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \
        { EXIT_REASON_MONITOR_TRAP_FLAG,     "MONITOR_TRAP_FLAG" }, \
        { EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \
        { EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \
        { EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \
        { EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \
-       { EXIT_REASON_GDTR_IDTR,             "GDTR_IDTR" }, \
-       { EXIT_REASON_LDTR_TR,               "LDTR_TR" }, \
+       { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
+       { EXIT_REASON_GDTR_IDTR,             "GDTR_IDTR" }, \
+       { EXIT_REASON_LDTR_TR,               "LDTR_TR" }, \
        { EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \
        { EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \
        { EXIT_REASON_INVEPT,                "INVEPT" }, \
+       { EXIT_REASON_RDTSCP,                "RDTSCP" }, \
        { EXIT_REASON_PREEMPTION_TIMER,      "PREEMPTION_TIMER" }, \
+       { EXIT_REASON_INVVPID,               "INVVPID" }, \
        { EXIT_REASON_WBINVD,                "WBINVD" }, \
+       { EXIT_REASON_XSETBV,                "XSETBV" }, \
        { EXIT_REASON_APIC_WRITE,            "APIC_WRITE" }, \
-       { EXIT_REASON_EOI_INDUCED,           "EOI_INDUCED" }, \
-       { EXIT_REASON_INVALID_STATE,         "INVALID_STATE" }, \
-       { EXIT_REASON_MSR_LOAD_FAIL,         "MSR_LOAD_FAIL" }, \
-       { EXIT_REASON_INVD,                  "INVD" }, \
-       { EXIT_REASON_INVVPID,               "INVVPID" }, \
+       { EXIT_REASON_RDRAND,                "RDRAND" }, \
        { EXIT_REASON_INVPCID,               "INVPCID" }, \
+       { EXIT_REASON_VMFUNC,                "VMFUNC" }, \
+       { EXIT_REASON_ENCLS,                 "ENCLS" }, \
+       { EXIT_REASON_RDSEED,                "RDSEED" }, \
+       { EXIT_REASON_PML_FULL,              "PML_FULL" }, \
        { EXIT_REASON_XSAVES,                "XSAVES" }, \
        { EXIT_REASON_XRSTORS,               "XRSTORS" }
 
index ab8e32f7b9a868c7ea8ed146d93a39a0c6a39aaa..760433b2574a502a907adcd1ddd71ad6b7f4f9fd 100644 (file)
@@ -86,18 +86,6 @@ config KVM_MMU_AUDIT
         This option adds a R/W kVM module parameter 'mmu_audit', which allows
         auditing of KVM MMU events at runtime.
 
-config KVM_DEVICE_ASSIGNMENT
-       bool "KVM legacy PCI device assignment support (DEPRECATED)"
-       depends on KVM && PCI && IOMMU_API
-       default n
-       ---help---
-         Provide support for legacy PCI device assignment through KVM.  The
-         kernel now also supports a full featured userspace device driver
-         framework through VFIO, which supersedes this support and provides
-         better security.
-
-         If unsure, say N.
-
 # OK, it's a little counter-intuitive to do this, but it puts it neatly under
 # the virtualization menu.
 source drivers/vhost/Kconfig
index 3bff20710471468ad6fa3c1e373810448d50c338..09d4b17be0226613990f1cefc4cd2e0b3a272d8c 100644 (file)
@@ -15,8 +15,6 @@ kvm-y                 += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
                           i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
                           hyperv.o page_track.o debugfs.o
 
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT)    += assigned-dev.o iommu.o
-
 kvm-intel-y            += vmx.o pmu_intel.o
 kvm-amd-y              += svm.o pmu_amd.o
 
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
deleted file mode 100644 (file)
index 308b859..0000000
+++ /dev/null
@@ -1,1058 +0,0 @@
-/*
- * Kernel-based Virtual Machine - device assignment support
- *
- * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
- *
- * This work is licensed under the terms of the GNU GPL, version 2.  See
- * the COPYING file in the top-level directory.
- *
- */
-
-#include <linux/kvm_host.h>
-#include <linux/kvm.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/namei.h>
-#include <linux/fs.h>
-#include "irq.h"
-#include "assigned-dev.h"
-#include "trace/events/kvm.h"
-
-struct kvm_assigned_dev_kernel {
-       struct kvm_irq_ack_notifier ack_notifier;
-       struct list_head list;
-       int assigned_dev_id;
-       int host_segnr;
-       int host_busnr;
-       int host_devfn;
-       unsigned int entries_nr;
-       int host_irq;
-       bool host_irq_disabled;
-       bool pci_2_3;
-       struct msix_entry *host_msix_entries;
-       int guest_irq;
-       struct msix_entry *guest_msix_entries;
-       unsigned long irq_requested_type;
-       int irq_source_id;
-       int flags;
-       struct pci_dev *dev;
-       struct kvm *kvm;
-       spinlock_t intx_lock;
-       spinlock_t intx_mask_lock;
-       char irq_name[32];
-       struct pci_saved_state *pci_saved_state;
-};
-
-static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
-                                                     int assigned_dev_id)
-{
-       struct kvm_assigned_dev_kernel *match;
-
-       list_for_each_entry(match, head, list) {
-               if (match->assigned_dev_id == assigned_dev_id)
-                       return match;
-       }
-       return NULL;
-}
-
-static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
-                                   *assigned_dev, int irq)
-{
-       int i, index;
-       struct msix_entry *host_msix_entries;
-
-       host_msix_entries = assigned_dev->host_msix_entries;
-
-       index = -1;
-       for (i = 0; i < assigned_dev->entries_nr; i++)
-               if (irq == host_msix_entries[i].vector) {
-                       index = i;
-                       break;
-               }
-       if (index < 0)
-               printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
-
-       return index;
-}
-
-static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-       int ret;
-
-       spin_lock(&assigned_dev->intx_lock);
-       if (pci_check_and_mask_intx(assigned_dev->dev)) {
-               assigned_dev->host_irq_disabled = true;
-               ret = IRQ_WAKE_THREAD;
-       } else
-               ret = IRQ_NONE;
-       spin_unlock(&assigned_dev->intx_lock);
-
-       return ret;
-}
-
-static void
-kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
-                                int vector)
-{
-       if (unlikely(assigned_dev->irq_requested_type &
-                    KVM_DEV_IRQ_GUEST_INTX)) {
-               spin_lock(&assigned_dev->intx_mask_lock);
-               if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
-                       kvm_set_irq(assigned_dev->kvm,
-                                   assigned_dev->irq_source_id, vector, 1,
-                                   false);
-               spin_unlock(&assigned_dev->intx_mask_lock);
-       } else
-               kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                           vector, 1, false);
-}
-
-static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-       if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-               spin_lock_irq(&assigned_dev->intx_lock);
-               disable_irq_nosync(irq);
-               assigned_dev->host_irq_disabled = true;
-               spin_unlock_irq(&assigned_dev->intx_lock);
-       }
-
-       kvm_assigned_dev_raise_guest_irq(assigned_dev,
-                                        assigned_dev->guest_irq);
-
-       return IRQ_HANDLED;
-}
-
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- *  Other values - No need to retry.
- */
-static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
-                               int level)
-{
-       struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-       struct kvm_kernel_irq_routing_entry *e;
-       int ret = -EINVAL;
-       int idx;
-
-       trace_kvm_set_irq(irq, level, irq_source_id);
-
-       /*
-        * Injection into either PIC or IOAPIC might need to scan all CPUs,
-        * which would need to be retried from thread context;  when same GSI
-        * is connected to both PIC and IOAPIC, we'd have to report a
-        * partial failure here.
-        * Since there's no easy way to do this, we only support injecting MSI
-        * which is limited to 1:1 GSI mapping.
-        */
-       idx = srcu_read_lock(&kvm->irq_srcu);
-       if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
-               e = &entries[0];
-               ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
-                                               irq, level);
-       }
-       srcu_read_unlock(&kvm->irq_srcu, idx);
-       return ret;
-}
-
-
-static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-       int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-                                      assigned_dev->irq_source_id,
-                                      assigned_dev->guest_irq, 1);
-       return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-
-       kvm_assigned_dev_raise_guest_irq(assigned_dev,
-                                        assigned_dev->guest_irq);
-
-       return IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-       int index = find_index_from_host_irq(assigned_dev, irq);
-       u32 vector;
-       int ret = 0;
-
-       if (index >= 0) {
-               vector = assigned_dev->guest_msix_entries[index].vector;
-               ret = kvm_set_irq_inatomic(assigned_dev->kvm,
-                                          assigned_dev->irq_source_id,
-                                          vector, 1);
-       }
-
-       return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
-}
-
-static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
-       int index = find_index_from_host_irq(assigned_dev, irq);
-       u32 vector;
-
-       if (index >= 0) {
-               vector = assigned_dev->guest_msix_entries[index].vector;
-               kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
-       }
-
-       return IRQ_HANDLED;
-}
-
-/* Ack the irq line for an assigned device */
-static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
-{
-       struct kvm_assigned_dev_kernel *dev =
-               container_of(kian, struct kvm_assigned_dev_kernel,
-                            ack_notifier);
-
-       kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
-
-       spin_lock(&dev->intx_mask_lock);
-
-       if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
-               bool reassert = false;
-
-               spin_lock_irq(&dev->intx_lock);
-               /*
-                * The guest IRQ may be shared so this ack can come from an
-                * IRQ for another guest device.
-                */
-               if (dev->host_irq_disabled) {
-                       if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
-                               enable_irq(dev->host_irq);
-                       else if (!pci_check_and_unmask_intx(dev->dev))
-                               reassert = true;
-                       dev->host_irq_disabled = reassert;
-               }
-               spin_unlock_irq(&dev->intx_lock);
-
-               if (reassert)
-                       kvm_set_irq(dev->kvm, dev->irq_source_id,
-                                   dev->guest_irq, 1, false);
-       }
-
-       spin_unlock(&dev->intx_mask_lock);
-}
-
-static void deassign_guest_irq(struct kvm *kvm,
-                              struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       if (assigned_dev->ack_notifier.gsi != -1)
-               kvm_unregister_irq_ack_notifier(kvm,
-                                               &assigned_dev->ack_notifier);
-
-       kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-                   assigned_dev->guest_irq, 0, false);
-
-       if (assigned_dev->irq_source_id != -1)
-               kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
-       assigned_dev->irq_source_id = -1;
-       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
-}
-
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void deassign_host_irq(struct kvm *kvm,
-                             struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       /*
-        * We disable irq here to prevent further events.
-        *
-        * Notice this maybe result in nested disable if the interrupt type is
-        * INTx, but it's OK for we are going to free it.
-        *
-        * If this function is a part of VM destroy, please ensure that till
-        * now, the kvm state is still legal for probably we also have to wait
-        * on a currently running IRQ handler.
-        */
-       if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
-               int i;
-               for (i = 0; i < assigned_dev->entries_nr; i++)
-                       disable_irq(assigned_dev->host_msix_entries[i].vector);
-
-               for (i = 0; i < assigned_dev->entries_nr; i++)
-                       free_irq(assigned_dev->host_msix_entries[i].vector,
-                                assigned_dev);
-
-               assigned_dev->entries_nr = 0;
-               kfree(assigned_dev->host_msix_entries);
-               kfree(assigned_dev->guest_msix_entries);
-               pci_disable_msix(assigned_dev->dev);
-       } else {
-               /* Deal with MSI and INTx */
-               if ((assigned_dev->irq_requested_type &
-                    KVM_DEV_IRQ_HOST_INTX) &&
-                   (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-                       spin_lock_irq(&assigned_dev->intx_lock);
-                       pci_intx(assigned_dev->dev, false);
-                       spin_unlock_irq(&assigned_dev->intx_lock);
-                       synchronize_irq(assigned_dev->host_irq);
-               } else
-                       disable_irq(assigned_dev->host_irq);
-
-               free_irq(assigned_dev->host_irq, assigned_dev);
-
-               if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
-                       pci_disable_msi(assigned_dev->dev);
-       }
-
-       assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
-}
-
-static int kvm_deassign_irq(struct kvm *kvm,
-                           struct kvm_assigned_dev_kernel *assigned_dev,
-                           unsigned long irq_requested_type)
-{
-       unsigned long guest_irq_type, host_irq_type;
-
-       if (!irqchip_in_kernel(kvm))
-               return -EINVAL;
-       /* no irq assignment to deassign */
-       if (!assigned_dev->irq_requested_type)
-               return -ENXIO;
-
-       host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
-       guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
-
-       if (host_irq_type)
-               deassign_host_irq(kvm, assigned_dev);
-       if (guest_irq_type)
-               deassign_guest_irq(kvm, assigned_dev);
-
-       return 0;
-}
-
-static void kvm_free_assigned_irq(struct kvm *kvm,
-                                 struct kvm_assigned_dev_kernel *assigned_dev)
-{
-       kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
-}
-
-static void kvm_free_assigned_device(struct kvm *kvm,
-                                    struct kvm_assigned_dev_kernel
-                                    *assigned_dev)
-{
-       kvm_free_assigned_irq(kvm, assigned_dev);
-
-       pci_reset_function(assigned_dev->dev);
-       if (pci_load_and_free_saved_state(assigned_dev->dev,
-                                         &assigned_dev->pci_saved_state))
-               printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-                      __func__, dev_name(&assigned_dev->dev->dev));
-       else
-               pci_restore_state(assigned_dev->dev);
-
-       pci_clear_dev_assigned(assigned_dev->dev);
-
-       pci_release_regions(assigned_dev->dev);
-       pci_disable_device(assigned_dev->dev);
-       pci_dev_put(assigned_dev->dev);
-
-       list_del(&assigned_dev->list);
-       kfree(assigned_dev);
-}
-
-void kvm_free_all_assigned_devices(struct kvm *kvm)
-{
-       struct kvm_assigned_dev_kernel *assigned_dev, *tmp;
-
-       list_for_each_entry_safe(assigned_dev, tmp,
-                                &kvm->arch.assigned_dev_head, list) {
-               kvm_free_assigned_device(kvm, assigned_dev);
-       }
-}
-
-static int assigned_device_enable_host_intx(struct kvm *kvm,
-                                           struct kvm_assigned_dev_kernel *dev)
-{
-       irq_handler_t irq_handler;
-       unsigned long flags;
-
-       dev->host_irq = dev->dev->irq;
-
-       /*
-        * We can only share the IRQ line with other host devices if we are
-        * able to disable the IRQ source at device-level - independently of
-        * the guest driver. Otherwise host devices may suffer from unbounded
-        * IRQ latencies when the guest keeps the line asserted.
-        */
-       if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-               irq_handler = kvm_assigned_dev_intx;
-               flags = IRQF_SHARED;
-       } else {
-               irq_handler = NULL;
-               flags = IRQF_ONESHOT;
-       }
-       if (request_threaded_irq(dev->host_irq, irq_handler,
-                                kvm_assigned_dev_thread_intx, flags,
-                                dev->irq_name, dev))
-               return -EIO;
-
-       if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
-               spin_lock_irq(&dev->intx_lock);
-               pci_intx(dev->dev, true);
-               spin_unlock_irq(&dev->intx_lock);
-       }
-       return 0;
-}
-
-static int assigned_device_enable_host_msi(struct kvm *kvm,
-                                          struct kvm_assigned_dev_kernel *dev)
-{
-       int r;
-
-       if (!dev->dev->msi_enabled) {
-               r = pci_enable_msi(dev->dev);
-               if (r)
-                       return r;
-       }
-
-       dev->host_irq = dev->dev->irq;
-       if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
-                                kvm_assigned_dev_thread_msi, 0,
-                                dev->irq_name, dev)) {
-               pci_disable_msi(dev->dev);
-               return -EIO;
-       }
-
-       return 0;
-}
-
-static int assigned_device_enable_host_msix(struct kvm *kvm,
-                                           struct kvm_assigned_dev_kernel *dev)
-{
-       int i, r = -EINVAL;
-
-       /* host_msix_entries and guest_msix_entries should have been
-        * initialized */
-       if (dev->entries_nr == 0)
-               return r;
-
-       r = pci_enable_msix_exact(dev->dev,
-                                 dev->host_msix_entries, dev->entries_nr);
-       if (r)
-               return r;
-
-       for (i = 0; i < dev->entries_nr; i++) {
-               r = request_threaded_irq(dev->host_msix_entries[i].vector,
-                                        kvm_assigned_dev_msix,
-                                        kvm_assigned_dev_thread_msix,
-                                        0, dev->irq_name, dev);
-               if (r)
-                       goto err;
-       }
-
-       return 0;
-err:
-       for (i -= 1; i >= 0; i--)
-               free_irq(dev->host_msix_entries[i].vector, dev);
-       pci_disable_msix(dev->dev);
-       return r;
-}
-
-static int assigned_device_enable_guest_intx(struct kvm *kvm,
-                               struct kvm_assigned_dev_kernel *dev,
-                               struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = irq->guest_irq;
-       return 0;
-}
-
-static int assigned_device_enable_guest_msi(struct kvm *kvm,
-                       struct kvm_assigned_dev_kernel *dev,
-                       struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = -1;
-       return 0;
-}
-
-static int assigned_device_enable_guest_msix(struct kvm *kvm,
-                       struct kvm_assigned_dev_kernel *dev,
-                       struct kvm_assigned_irq *irq)
-{
-       dev->guest_irq = irq->guest_irq;
-       dev->ack_notifier.gsi = -1;
-       return 0;
-}
-
-static int assign_host_irq(struct kvm *kvm,
-                          struct kvm_assigned_dev_kernel *dev,
-                          __u32 host_irq_type)
-{
-       int r = -EEXIST;
-
-       if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
-               return r;
-
-       snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
-                pci_name(dev->dev));
-
-       switch (host_irq_type) {
-       case KVM_DEV_IRQ_HOST_INTX:
-               r = assigned_device_enable_host_intx(kvm, dev);
-               break;
-       case KVM_DEV_IRQ_HOST_MSI:
-               r = assigned_device_enable_host_msi(kvm, dev);
-               break;
-       case KVM_DEV_IRQ_HOST_MSIX:
-               r = assigned_device_enable_host_msix(kvm, dev);
-               break;
-       default:
-               r = -EINVAL;
-       }
-       dev->host_irq_disabled = false;
-
-       if (!r)
-               dev->irq_requested_type |= host_irq_type;
-
-       return r;
-}
-
-static int assign_guest_irq(struct kvm *kvm,
-                           struct kvm_assigned_dev_kernel *dev,
-                           struct kvm_assigned_irq *irq,
-                           unsigned long guest_irq_type)
-{
-       int id;
-       int r = -EEXIST;
-
-       if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
-               return r;
-
-       id = kvm_request_irq_source_id(kvm);
-       if (id < 0)
-               return id;
-
-       dev->irq_source_id = id;
-
-       switch (guest_irq_type) {
-       case KVM_DEV_IRQ_GUEST_INTX:
-               r = assigned_device_enable_guest_intx(kvm, dev, irq);
-               break;
-       case KVM_DEV_IRQ_GUEST_MSI:
-               r = assigned_device_enable_guest_msi(kvm, dev, irq);
-               break;
-       case KVM_DEV_IRQ_GUEST_MSIX:
-               r = assigned_device_enable_guest_msix(kvm, dev, irq);
-               break;
-       default:
-               r = -EINVAL;
-       }
-
-       if (!r) {
-               dev->irq_requested_type |= guest_irq_type;
-               if (dev->ack_notifier.gsi != -1)
-                       kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
-       } else {
-               kvm_free_irq_source_id(kvm, dev->irq_source_id);
-               dev->irq_source_id = -1;
-       }
-
-       return r;
-}
-
-/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-                                  struct kvm_assigned_irq *assigned_irq)
-{
-       int r = -EINVAL;
-       struct kvm_assigned_dev_kernel *match;
-       unsigned long host_irq_type, guest_irq_type;
-
-       if (!irqchip_in_kernel(kvm))
-               return r;
-
-       mutex_lock(&kvm->lock);
-       r = -ENODEV;
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_irq->assigned_dev_id);
-       if (!match)
-               goto out;
-
-       host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
-       guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
-
-       r = -EINVAL;
-       /* can only assign one type at a time */
-       if (hweight_long(host_irq_type) > 1)
-               goto out;
-       if (hweight_long(guest_irq_type) > 1)
-               goto out;
-       if (host_irq_type == 0 && guest_irq_type == 0)
-               goto out;
-
-       r = 0;
-       if (host_irq_type)
-               r = assign_host_irq(kvm, match, host_irq_type);
-       if (r)
-               goto out;
-
-       if (guest_irq_type)
-               r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
-                                        struct kvm_assigned_irq
-                                        *assigned_irq)
-{
-       int r = -ENODEV;
-       struct kvm_assigned_dev_kernel *match;
-       unsigned long irq_type;
-
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_irq->assigned_dev_id);
-       if (!match)
-               goto out;
-
-       irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
-                                         KVM_DEV_IRQ_GUEST_MASK);
-       r = kvm_deassign_irq(kvm, match, irq_type);
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-/*
- * We want to test whether the caller has been granted permissions to
- * use this device.  To be able to configure and control the device,
- * the user needs access to PCI configuration space and BAR resources.
- * These are accessed through PCI sysfs.  PCI config space is often
- * passed to the process calling this ioctl via file descriptor, so we
- * can't rely on access to that file.  We can check for permissions
- * on each of the BAR resource files, which is a pretty clear
- * indicator that the user has been granted access to the device.
- */
-static int probe_sysfs_permissions(struct pci_dev *dev)
-{
-#ifdef CONFIG_SYSFS
-       int i;
-       bool bar_found = false;
-
-       for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
-               char *kpath, *syspath;
-               struct path path;
-               struct inode *inode;
-               int r;
-
-               if (!pci_resource_len(dev, i))
-                       continue;
-
-               kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
-               if (!kpath)
-                       return -ENOMEM;
-
-               /* Per sysfs-rules, sysfs is always at /sys */
-               syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
-               kfree(kpath);
-               if (!syspath)
-                       return -ENOMEM;
-
-               r = kern_path(syspath, LOOKUP_FOLLOW, &path);
-               kfree(syspath);
-               if (r)
-                       return r;
-
-               inode = d_backing_inode(path.dentry);
-
-               r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
-               path_put(&path);
-               if (r)
-                       return r;
-
-               bar_found = true;
-       }
-
-       /* If no resources, probably something special */
-       if (!bar_found)
-               return -EPERM;
-
-       return 0;
-#else
-       return -EINVAL; /* No way to control the device without sysfs */
-#endif
-}
-
-static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
-                                     struct kvm_assigned_pci_dev *assigned_dev)
-{
-       int r = 0, idx;
-       struct kvm_assigned_dev_kernel *match;
-       struct pci_dev *dev;
-
-       if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
-               return -EINVAL;
-
-       mutex_lock(&kvm->lock);
-       idx = srcu_read_lock(&kvm->srcu);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_dev->assigned_dev_id);
-       if (match) {
-               /* device already assigned */
-               r = -EEXIST;
-               goto out;
-       }
-
-       match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
-       if (match == NULL) {
-               printk(KERN_INFO "%s: Couldn't allocate memory\n",
-                      __func__);
-               r = -ENOMEM;
-               goto out;
-       }
-       dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
-                                  assigned_dev->busnr,
-                                  assigned_dev->devfn);
-       if (!dev) {
-               printk(KERN_INFO "%s: host device not found\n", __func__);
-               r = -EINVAL;
-               goto out_free;
-       }
-
-       /* Don't allow bridges to be assigned */
-       if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
-               r = -EPERM;
-               goto out_put;
-       }
-
-       r = probe_sysfs_permissions(dev);
-       if (r)
-               goto out_put;
-
-       if (pci_enable_device(dev)) {
-               printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
-               r = -EBUSY;
-               goto out_put;
-       }
-       r = pci_request_regions(dev, "kvm_assigned_device");
-       if (r) {
-               printk(KERN_INFO "%s: Could not get access to device regions\n",
-                      __func__);
-               goto out_disable;
-       }
-
-       pci_reset_function(dev);
-       pci_save_state(dev);
-       match->pci_saved_state = pci_store_saved_state(dev);
-       if (!match->pci_saved_state)
-               printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
-                      __func__, dev_name(&dev->dev));
-
-       if (!pci_intx_mask_supported(dev))
-               assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
-
-       match->assigned_dev_id = assigned_dev->assigned_dev_id;
-       match->host_segnr = assigned_dev->segnr;
-       match->host_busnr = assigned_dev->busnr;
-       match->host_devfn = assigned_dev->devfn;
-       match->flags = assigned_dev->flags;
-       match->dev = dev;
-       spin_lock_init(&match->intx_lock);
-       spin_lock_init(&match->intx_mask_lock);
-       match->irq_source_id = -1;
-       match->kvm = kvm;
-       match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-
-       list_add(&match->list, &kvm->arch.assigned_dev_head);
-
-       if (!kvm->arch.iommu_domain) {
-               r = kvm_iommu_map_guest(kvm);
-               if (r)
-                       goto out_list_del;
-       }
-       r = kvm_assign_device(kvm, match->dev);
-       if (r)
-               goto out_list_del;
-
-out:
-       srcu_read_unlock(&kvm->srcu, idx);
-       mutex_unlock(&kvm->lock);
-       return r;
-out_list_del:
-       if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
-               printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
-                      __func__, dev_name(&dev->dev));
-       list_del(&match->list);
-       pci_release_regions(dev);
-out_disable:
-       pci_disable_device(dev);
-out_put:
-       pci_dev_put(dev);
-out_free:
-       kfree(match);
-       srcu_read_unlock(&kvm->srcu, idx);
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
-               struct kvm_assigned_pci_dev *assigned_dev)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *match;
-
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_dev->assigned_dev_id);
-       if (!match) {
-               printk(KERN_INFO "%s: device hasn't been assigned before, "
-                 "so cannot be deassigned\n", __func__);
-               r = -EINVAL;
-               goto out;
-       }
-
-       kvm_deassign_device(kvm, match->dev);
-
-       kvm_free_assigned_device(kvm, match);
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-
-static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
-                                   struct kvm_assigned_msix_nr *entry_nr)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *adev;
-
-       mutex_lock(&kvm->lock);
-
-       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     entry_nr->assigned_dev_id);
-       if (!adev) {
-               r = -EINVAL;
-               goto msix_nr_out;
-       }
-
-       if (adev->entries_nr == 0) {
-               adev->entries_nr = entry_nr->entry_nr;
-               if (adev->entries_nr == 0 ||
-                   adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
-                       r = -EINVAL;
-                       goto msix_nr_out;
-               }
-
-               adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
-                                               entry_nr->entry_nr,
-                                               GFP_KERNEL);
-               if (!adev->host_msix_entries) {
-                       r = -ENOMEM;
-                       goto msix_nr_out;
-               }
-               adev->guest_msix_entries =
-                       kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
-                               GFP_KERNEL);
-               if (!adev->guest_msix_entries) {
-                       kfree(adev->host_msix_entries);
-                       r = -ENOMEM;
-                       goto msix_nr_out;
-               }
-       } else /* Not allowed set MSI-X number twice */
-               r = -EINVAL;
-msix_nr_out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
-                                      struct kvm_assigned_msix_entry *entry)
-{
-       int r = 0, i;
-       struct kvm_assigned_dev_kernel *adev;
-
-       mutex_lock(&kvm->lock);
-
-       adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     entry->assigned_dev_id);
-
-       if (!adev) {
-               r = -EINVAL;
-               goto msix_entry_out;
-       }
-
-       for (i = 0; i < adev->entries_nr; i++)
-               if (adev->guest_msix_entries[i].vector == 0 ||
-                   adev->guest_msix_entries[i].entry == entry->entry) {
-                       adev->guest_msix_entries[i].entry = entry->entry;
-                       adev->guest_msix_entries[i].vector = entry->gsi;
-                       adev->host_msix_entries[i].entry = entry->entry;
-                       break;
-               }
-       if (i == adev->entries_nr) {
-               r = -ENOSPC;
-               goto msix_entry_out;
-       }
-
-msix_entry_out:
-       mutex_unlock(&kvm->lock);
-
-       return r;
-}
-
-static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
-               struct kvm_assigned_pci_dev *assigned_dev)
-{
-       int r = 0;
-       struct kvm_assigned_dev_kernel *match;
-
-       mutex_lock(&kvm->lock);
-
-       match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
-                                     assigned_dev->assigned_dev_id);
-       if (!match) {
-               r = -ENODEV;
-               goto out;
-       }
-
-       spin_lock(&match->intx_mask_lock);
-
-       match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
-       match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
-
-       if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
-               if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
-                       kvm_set_irq(match->kvm, match->irq_source_id,
-                                   match->guest_irq, 0, false);
-                       /*
-                        * Masking at hardware-level is performed on demand,
-                        * i.e. when an IRQ actually arrives at the host.
-                        */
-               } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
-                       /*
-                        * Unmask the IRQ line if required. Unmasking at
-                        * device level will be performed by user space.
-                        */
-                       spin_lock_irq(&match->intx_lock);
-                       if (match->host_irq_disabled) {
-                               enable_irq(match->host_irq);
-                               match->host_irq_disabled = false;
-                       }
-                       spin_unlock_irq(&match->intx_lock);
-               }
-       }
-
-       spin_unlock(&match->intx_mask_lock);
-
-out:
-       mutex_unlock(&kvm->lock);
-       return r;
-}
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-                                 unsigned long arg)
-{
-       void __user *argp = (void __user *)arg;
-       int r;
-
-       switch (ioctl) {
-       case KVM_ASSIGN_PCI_DEVICE: {
-               struct kvm_assigned_pci_dev assigned_dev;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-                       goto out;
-               r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_IRQ: {
-               r = -EOPNOTSUPP;
-               break;
-       }
-       case KVM_ASSIGN_DEV_IRQ: {
-               struct kvm_assigned_irq assigned_irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-                       goto out;
-               r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_DEASSIGN_DEV_IRQ: {
-               struct kvm_assigned_irq assigned_irq;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
-                       goto out;
-               r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_DEASSIGN_PCI_DEVICE: {
-               struct kvm_assigned_pci_dev assigned_dev;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-                       goto out;
-               r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_SET_MSIX_NR: {
-               struct kvm_assigned_msix_nr entry_nr;
-               r = -EFAULT;
-               if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
-                       goto out;
-               r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_SET_MSIX_ENTRY: {
-               struct kvm_assigned_msix_entry entry;
-               r = -EFAULT;
-               if (copy_from_user(&entry, argp, sizeof entry))
-                       goto out;
-               r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
-               if (r)
-                       goto out;
-               break;
-       }
-       case KVM_ASSIGN_SET_INTX_MASK: {
-               struct kvm_assigned_pci_dev assigned_dev;
-
-               r = -EFAULT;
-               if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
-                       goto out;
-               r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
-               break;
-       }
-       default:
-               r = -ENOTTY;
-               break;
-       }
-out:
-       return r;
-}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
deleted file mode 100644 (file)
index a428c1a..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
-#define ARCH_X86_KVM_ASSIGNED_DEV_H
-
-#include <linux/kvm_host.h>
-
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
-
-int kvm_iommu_map_guest(struct kvm *kvm);
-int kvm_iommu_unmap_guest(struct kvm *kvm);
-
-long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-                                 unsigned long arg);
-
-void kvm_free_all_assigned_devices(struct kvm *kvm);
-#else
-static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-       return 0;
-}
-
-static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
-                                               unsigned long arg)
-{
-       return -ENOTTY;
-}
-
-static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
-#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
-
-#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
index 73ea24d4f119c8dce2a0d3e5bfc24ef3d7562d3a..047b17a26269610b9cc083899cafaa6ca236eb5b 100644 (file)
@@ -657,6 +657,9 @@ void kvm_pic_destroy(struct kvm *kvm)
 {
        struct kvm_pic *vpic = kvm->arch.vpic;
 
+       if (!vpic)
+               return;
+
        kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
        kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
        kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
index 6e219e5c07d27c5dc41786953b1114b1e475e346..289270a6aecbb478ea14cc786c72fcfdf5058350 100644 (file)
@@ -635,6 +635,9 @@ void kvm_ioapic_destroy(struct kvm *kvm)
 {
        struct kvm_ioapic *ioapic = kvm->arch.vioapic;
 
+       if (!ioapic)
+               return;
+
        cancel_delayed_work_sync(&ioapic->eoi_inject);
        kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
        kvm->arch.vioapic = NULL;
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
deleted file mode 100644 (file)
index b181426..0000000
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2006, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place - Suite 330, Boston, MA 02111-1307 USA.
- *
- * Copyright (C) 2006-2008 Intel Corporation
- * Copyright IBM Corporation, 2008
- * Copyright 2010 Red Hat, Inc. and/or its affiliates.
- *
- * Author: Allen M. Kay <allen.m.kay@intel.com>
- * Author: Weidong Han <weidong.han@intel.com>
- * Author: Ben-Ami Yassour <benami@il.ibm.com>
- */
-
-#include <linux/list.h>
-#include <linux/kvm_host.h>
-#include <linux/moduleparam.h>
-#include <linux/pci.h>
-#include <linux/stat.h>
-#include <linux/iommu.h>
-#include "assigned-dev.h"
-
-static bool allow_unsafe_assigned_interrupts;
-module_param_named(allow_unsafe_assigned_interrupts,
-                  allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
- "Enable device assignment on platforms without interrupt remapping support.");
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm);
-static void kvm_iommu_put_pages(struct kvm *kvm,
-                               gfn_t base_gfn, unsigned long npages);
-
-static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
-                          unsigned long npages)
-{
-       gfn_t end_gfn;
-       kvm_pfn_t pfn;
-
-       pfn     = gfn_to_pfn_memslot(slot, gfn);
-       end_gfn = gfn + npages;
-       gfn    += 1;
-
-       if (is_error_noslot_pfn(pfn))
-               return pfn;
-
-       while (gfn < end_gfn)
-               gfn_to_pfn_memslot(slot, gfn++);
-
-       return pfn;
-}
-
-static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn,
-               unsigned long npages)
-{
-       unsigned long i;
-
-       for (i = 0; i < npages; ++i)
-               kvm_release_pfn_clean(pfn + i);
-}
-
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-       gfn_t gfn, end_gfn;
-       kvm_pfn_t pfn;
-       int r = 0;
-       struct iommu_domain *domain = kvm->arch.iommu_domain;
-       int flags;
-
-       /* check if iommu exists and in use */
-       if (!domain)
-               return 0;
-
-       gfn     = slot->base_gfn;
-       end_gfn = gfn + slot->npages;
-
-       flags = IOMMU_READ;
-       if (!(slot->flags & KVM_MEM_READONLY))
-               flags |= IOMMU_WRITE;
-       if (!kvm->arch.iommu_noncoherent)
-               flags |= IOMMU_CACHE;
-
-
-       while (gfn < end_gfn) {
-               unsigned long page_size;
-
-               /* Check if already mapped */
-               if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
-                       gfn += 1;
-                       continue;
-               }
-
-               /* Get the page size we could use to map */
-               page_size = kvm_host_page_size(kvm, gfn);
-
-               /* Make sure the page_size does not exceed the memslot */
-               while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
-                       page_size >>= 1;
-
-               /* Make sure gfn is aligned to the page size we want to map */
-               while ((gfn << PAGE_SHIFT) & (page_size - 1))
-                       page_size >>= 1;
-
-               /* Make sure hva is aligned to the page size we want to map */
-               while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
-                       page_size >>= 1;
-
-               /*
-                * Pin all pages we are about to map in memory. This is
-                * important because we unmap and unpin in 4kb steps later.
-                */
-               pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
-               if (is_error_noslot_pfn(pfn)) {
-                       gfn += 1;
-                       continue;
-               }
-
-               /* Map into IO address space */
-               r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
-                             page_size, flags);
-               if (r) {
-                       printk(KERN_ERR "kvm_iommu_map_address:"
-                              "iommu failed to map pfn=%llx\n", pfn);
-                       kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
-                       goto unmap_pages;
-               }
-
-               gfn += page_size >> PAGE_SHIFT;
-
-               cond_resched();
-       }
-
-       return 0;
-
-unmap_pages:
-       kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
-       return r;
-}
-
-static int kvm_iommu_map_memslots(struct kvm *kvm)
-{
-       int idx, r = 0;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-
-       if (kvm->arch.iommu_noncoherent)
-               kvm_arch_register_noncoherent_dma(kvm);
-
-       idx = srcu_read_lock(&kvm->srcu);
-       slots = kvm_memslots(kvm);
-
-       kvm_for_each_memslot(memslot, slots) {
-               r = kvm_iommu_map_pages(kvm, memslot);
-               if (r)
-                       break;
-       }
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       return r;
-}
-
-int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-       struct iommu_domain *domain = kvm->arch.iommu_domain;
-       int r;
-       bool noncoherent;
-
-       /* check if iommu exists and in use */
-       if (!domain)
-               return 0;
-
-       if (pdev == NULL)
-               return -ENODEV;
-
-       r = iommu_attach_device(domain, &pdev->dev);
-       if (r) {
-               dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
-               return r;
-       }
-
-       noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
-
-       /* Check if need to update IOMMU page table for guest memory */
-       if (noncoherent != kvm->arch.iommu_noncoherent) {
-               kvm_iommu_unmap_memslots(kvm);
-               kvm->arch.iommu_noncoherent = noncoherent;
-               r = kvm_iommu_map_memslots(kvm);
-               if (r)
-                       goto out_unmap;
-       }
-
-       kvm_arch_start_assignment(kvm);
-       pci_set_dev_assigned(pdev);
-
-       dev_info(&pdev->dev, "kvm assign device\n");
-
-       return 0;
-out_unmap:
-       kvm_iommu_unmap_memslots(kvm);
-       return r;
-}
-
-int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
-{
-       struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-       /* check if iommu exists and in use */
-       if (!domain)
-               return 0;
-
-       if (pdev == NULL)
-               return -ENODEV;
-
-       iommu_detach_device(domain, &pdev->dev);
-
-       pci_clear_dev_assigned(pdev);
-       kvm_arch_end_assignment(kvm);
-
-       dev_info(&pdev->dev, "kvm deassign device\n");
-
-       return 0;
-}
-
-int kvm_iommu_map_guest(struct kvm *kvm)
-{
-       int r;
-
-       if (!iommu_present(&pci_bus_type)) {
-               printk(KERN_ERR "%s: iommu not found\n", __func__);
-               return -ENODEV;
-       }
-
-       mutex_lock(&kvm->slots_lock);
-
-       kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
-       if (!kvm->arch.iommu_domain) {
-               r = -ENOMEM;
-               goto out_unlock;
-       }
-
-       if (!allow_unsafe_assigned_interrupts &&
-           !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
-               printk(KERN_WARNING "%s: No interrupt remapping support,"
-                      " disallowing device assignment."
-                      " Re-enable with \"allow_unsafe_assigned_interrupts=1\""
-                      " module option.\n", __func__);
-               iommu_domain_free(kvm->arch.iommu_domain);
-               kvm->arch.iommu_domain = NULL;
-               r = -EPERM;
-               goto out_unlock;
-       }
-
-       r = kvm_iommu_map_memslots(kvm);
-       if (r)
-               kvm_iommu_unmap_memslots(kvm);
-
-out_unlock:
-       mutex_unlock(&kvm->slots_lock);
-       return r;
-}
-
-static void kvm_iommu_put_pages(struct kvm *kvm,
-                               gfn_t base_gfn, unsigned long npages)
-{
-       struct iommu_domain *domain;
-       gfn_t end_gfn, gfn;
-       kvm_pfn_t pfn;
-       u64 phys;
-
-       domain  = kvm->arch.iommu_domain;
-       end_gfn = base_gfn + npages;
-       gfn     = base_gfn;
-
-       /* check if iommu exists and in use */
-       if (!domain)
-               return;
-
-       while (gfn < end_gfn) {
-               unsigned long unmap_pages;
-               size_t size;
-
-               /* Get physical address */
-               phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
-
-               if (!phys) {
-                       gfn++;
-                       continue;
-               }
-
-               pfn  = phys >> PAGE_SHIFT;
-
-               /* Unmap address from IO address space */
-               size       = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
-               unmap_pages = 1ULL << get_order(size);
-
-               /* Unpin all pages we just unmapped to not leak any memory */
-               kvm_unpin_pages(kvm, pfn, unmap_pages);
-
-               gfn += unmap_pages;
-
-               cond_resched();
-       }
-}
-
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
-{
-       kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
-}
-
-static int kvm_iommu_unmap_memslots(struct kvm *kvm)
-{
-       int idx;
-       struct kvm_memslots *slots;
-       struct kvm_memory_slot *memslot;
-
-       idx = srcu_read_lock(&kvm->srcu);
-       slots = kvm_memslots(kvm);
-
-       kvm_for_each_memslot(memslot, slots)
-               kvm_iommu_unmap_pages(kvm, memslot);
-
-       srcu_read_unlock(&kvm->srcu, idx);
-
-       if (kvm->arch.iommu_noncoherent)
-               kvm_arch_unregister_noncoherent_dma(kvm);
-
-       return 0;
-}
-
-int kvm_iommu_unmap_guest(struct kvm *kvm)
-{
-       struct iommu_domain *domain = kvm->arch.iommu_domain;
-
-       /* check if iommu exists and in use */
-       if (!domain)
-               return 0;
-
-       mutex_lock(&kvm->slots_lock);
-       kvm_iommu_unmap_memslots(kvm);
-       kvm->arch.iommu_domain = NULL;
-       kvm->arch.iommu_noncoherent = false;
-       mutex_unlock(&kvm->slots_lock);
-
-       iommu_domain_free(domain);
-       return 0;
-}
index ac7810513d0e959a0855f1b2f56558edb861275c..558676538fca3c213d9e360f39e03ef15e2b1d47 100644 (file)
@@ -4340,7 +4340,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+                            bool accessed_dirty)
 {
        struct kvm_mmu *context = &vcpu->arch.mmu;
 
@@ -4349,6 +4350,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
        context->shadow_root_level = kvm_x86_ops->get_tdp_level();
 
        context->nx = true;
+       context->ept_ad = accessed_dirty;
        context->page_fault = ept_page_fault;
        context->gva_to_gpa = ept_gva_to_gpa;
        context->sync_page = ept_sync_page;
index ddc56e91f2e491ff26631ccc432b2cf15cec05a0..d8ccb32f7308ab3e2d1955b6dbd11203716f03a2 100644 (file)
@@ -74,7 +74,8 @@ enum {
 
 int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
-void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly);
+void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
+                            bool accessed_dirty);
 
 static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
 {
index 37942e419c32e599a4ba05d3b75a77680f0065d9..60168cdd05463e2e18c993e20dfdeed7986808ce 100644 (file)
@@ -160,6 +160,14 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn,
        return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]);
 }
 
+void kvm_page_track_cleanup(struct kvm *kvm)
+{
+       struct kvm_page_track_notifier_head *head;
+
+       head = &kvm->arch.track_notifier_head;
+       cleanup_srcu_struct(&head->track_srcu);
+}
+
 void kvm_page_track_init(struct kvm *kvm)
 {
        struct kvm_page_track_notifier_head *head;
index a01105485315ab56faffb79489f2eb55ba117c29..314d2071b3376e697163dc5a20ce2bbb11953ed0 100644 (file)
  * so the code in this file is compiled twice, once per pte size.
  */
 
-/*
- * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro
- * uses for EPT without A/D paging type.
- */
-extern u64 __pure __using_nonexistent_pte_bit(void)
-              __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT");
-
 #if PTTYPE == 64
        #define pt_element_t u64
        #define guest_walker guest_walker64
@@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
        #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
-       #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-       #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
        #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
        #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+       #define PT_HAVE_ACCESSED_DIRTY(mmu) true
        #ifdef CONFIG_X86_64
        #define PT_MAX_FULL_LEVELS 4
        #define CMPXCHG cmpxchg
@@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
        #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
        #define PT_LEVEL_BITS PT32_LEVEL_BITS
        #define PT_MAX_FULL_LEVELS 2
-       #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK
-       #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK
        #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
        #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
+       #define PT_HAVE_ACCESSED_DIRTY(mmu) true
        #define CMPXCHG cmpxchg
 #elif PTTYPE == PTTYPE_EPT
        #define pt_element_t u64
@@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void)
        #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
        #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
        #define PT_LEVEL_BITS PT64_LEVEL_BITS
-       #define PT_GUEST_ACCESSED_MASK 0
-       #define PT_GUEST_DIRTY_MASK 0
-       #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit()
-       #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit()
+       #define PT_GUEST_DIRTY_SHIFT 9
+       #define PT_GUEST_ACCESSED_SHIFT 8
+       #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
        #define CMPXCHG cmpxchg64
        #define PT_MAX_FULL_LEVELS 4
 #else
        #error Invalid PTTYPE value
 #endif
 
+#define PT_GUEST_DIRTY_MASK    (1 << PT_GUEST_DIRTY_SHIFT)
+#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
+
 #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
 #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
 
@@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
        return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
 }
 
-static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
+static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access,
+                                            unsigned gpte)
 {
        unsigned mask;
 
        /* dirty bit is not supported, so no need to track it */
-       if (!PT_GUEST_DIRTY_MASK)
+       if (!PT_HAVE_ACCESSED_DIRTY(mmu))
                return;
 
        BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
@@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
                goto no_present;
 
        /* if accessed bit is not supported prefetch non accessed gpte */
-       if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK))
+       if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK))
                goto no_present;
 
        return false;
@@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
        int ret;
 
        /* dirty/accessed bits are not supported, so no need to update them */
-       if (!PT_GUEST_DIRTY_MASK)
+       if (!PT_HAVE_ACCESSED_DIRTY(mmu))
                return 0;
 
        for (level = walker->max_level; level >= walker->level; --level) {
@@ -286,7 +280,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        pt_element_t __user *uninitialized_var(ptep_user);
        gfn_t table_gfn;
        unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey;
+       unsigned nested_access;
        gpa_t pte_gpa;
+       bool have_ad;
        int offset;
        const int write_fault = access & PFERR_WRITE_MASK;
        const int user_fault  = access & PFERR_USER_MASK;
@@ -299,6 +295,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 retry_walk:
        walker->level = mmu->root_level;
        pte           = mmu->get_cr3(vcpu);
+       have_ad       = PT_HAVE_ACCESSED_DIRTY(mmu);
 
 #if PTTYPE == 64
        if (walker->level == PT32E_ROOT_LEVEL) {
@@ -312,7 +309,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        walker->max_level = walker->level;
        ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
 
-       accessed_dirty = PT_GUEST_ACCESSED_MASK;
+       accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0;
+
+       /*
+        * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
+        * by the MOV to CR instruction are treated as reads and do not cause the
+        * processor to set the dirty flag in any EPT paging-structure entry.
+        */
+       nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
+
        pt_access = pte_access = ACC_ALL;
        ++walker->level;
 
@@ -332,7 +337,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
                walker->pte_gpa[walker->level - 1] = pte_gpa;
 
                real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
-                                             PFERR_USER_MASK|PFERR_WRITE_MASK,
+                                             nested_access,
                                              &walker->fault);
 
                /*
@@ -394,7 +399,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
        walker->gfn = real_gpa >> PAGE_SHIFT;
 
        if (!write_fault)
-               FNAME(protect_clean_gpte)(&pte_access, pte);
+               FNAME(protect_clean_gpte)(mmu, &pte_access, pte);
        else
                /*
                 * On a write fault, fold the dirty bit into accessed_dirty.
@@ -485,7 +490,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
        gfn = gpte_to_gfn(gpte);
        pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
-       FNAME(protect_clean_gpte)(&pte_access, gpte);
+       FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
        pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
                        no_dirty_log && (pte_access & ACC_WRITE_MASK));
        if (is_error_pfn(pfn))
@@ -979,7 +984,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
                gfn = gpte_to_gfn(gpte);
                pte_access = sp->role.access;
                pte_access &= FNAME(gpte_access)(vcpu, gpte);
-               FNAME(protect_clean_gpte)(&pte_access, gpte);
+               FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 
                if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
                      &nr_present))
@@ -1025,3 +1030,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 #undef PT_GUEST_DIRTY_MASK
 #undef PT_GUEST_DIRTY_SHIFT
 #undef PT_GUEST_ACCESSED_SHIFT
+#undef PT_HAVE_ACCESSED_DIRTY
index d1efe2c62b3f8d0db7392970cdfd8e018dd3ac06..1b203abf76e1e60a75819be391e44e3cb7e0a28e 100644 (file)
@@ -1379,6 +1379,9 @@ static void avic_vm_destroy(struct kvm *kvm)
        unsigned long flags;
        struct kvm_arch *vm_data = &kvm->arch;
 
+       if (!avic)
+               return;
+
        avic_free_vm_id(vm_data->avic_vm_id);
 
        if (vm_data->avic_logical_id_table_page)
@@ -5253,6 +5256,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
        avic_handle_ldr_update(vcpu);
 }
 
+static void svm_setup_mce(struct kvm_vcpu *vcpu)
+{
+       /* [63:9] are reserved. */
+       vcpu->arch.mcg_cap &= 0x1ff;
+}
+
 static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .cpu_has_kvm_support = has_svm,
        .disabled_by_bios = is_disabled,
@@ -5364,6 +5373,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
        .pmu_ops = &amd_pmu_ops,
        .deliver_posted_interrupt = svm_deliver_avic_intr,
        .update_pi_irte = svm_update_pi_irte,
+       .setup_mce = svm_setup_mce,
 };
 
 static int __init svm_init(void)
index 98e82ee1e6996671f912a152dec5367baea45b80..cfdb0d9389d1f611050c0cfffa6143d383ac4657 100644 (file)
@@ -615,10 +615,6 @@ struct vcpu_vmx {
        int vpid;
        bool emulation_required;
 
-       /* Support for vnmi-less CPUs */
-       int soft_vnmi_blocked;
-       ktime_t entry_time;
-       s64 vnmi_blocked_time;
        u32 exit_reason;
 
        /* Posted interrupt descriptor */
@@ -1239,6 +1235,11 @@ static inline bool cpu_has_vmx_invvpid_global(void)
        return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
 }
 
+static inline bool cpu_has_vmx_invvpid(void)
+{
+       return vmx_capability.vpid & VMX_VPID_INVVPID_BIT;
+}
+
 static inline bool cpu_has_vmx_ept(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1285,11 +1286,6 @@ static inline bool cpu_has_vmx_invpcid(void)
                SECONDARY_EXEC_ENABLE_INVPCID;
 }
 
-static inline bool cpu_has_virtual_nmis(void)
-{
-       return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
-}
-
 static inline bool cpu_has_vmx_wbinvd_exit(void)
 {
        return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -2749,11 +2745,11 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                vmx->nested.nested_vmx_secondary_ctls_high);
        vmx->nested.nested_vmx_secondary_ctls_low = 0;
        vmx->nested.nested_vmx_secondary_ctls_high &=
+               SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED |
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_RDTSCP |
                SECONDARY_EXEC_DESC |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
-               SECONDARY_EXEC_ENABLE_VPID |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
@@ -2764,14 +2760,16 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                vmx->nested.nested_vmx_secondary_ctls_high |=
                        SECONDARY_EXEC_ENABLE_EPT;
                vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
-                        VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
-                        VMX_EPT_INVEPT_BIT;
+                        VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
                if (cpu_has_vmx_ept_execute_only())
                        vmx->nested.nested_vmx_ept_caps |=
                                VMX_EPT_EXECUTE_ONLY_BIT;
                vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
                vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
-                       VMX_EPT_EXTENT_CONTEXT_BIT;
+                       VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
+                       VMX_EPT_1GB_PAGE_BIT;
+              if (enable_ept_ad_bits)
+                      vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
 
@@ -2781,10 +2779,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
         * though it is treated as global context.  The alternative is
         * not failing the single-context invvpid, and it is worse.
         */
-       if (enable_vpid)
+       if (enable_vpid) {
+               vmx->nested.nested_vmx_secondary_ctls_high |=
+                       SECONDARY_EXEC_ENABLE_VPID;
                vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
                        VMX_VPID_EXTENT_SUPPORTED_MASK;
-       else
+       } else
                vmx->nested.nested_vmx_vpid_caps = 0;
 
        if (enable_unrestricted_guest)
@@ -3617,9 +3617,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                                &_vmexit_control) < 0)
                return -EIO;
 
-       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
-       opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
-                PIN_BASED_VMX_PREEMPTION_TIMER;
+       min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
+               PIN_BASED_VIRTUAL_NMIS;
+       opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
        if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
                                &_pin_based_exec_control) < 0)
                return -EIO;
@@ -4011,11 +4011,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 {
-       vpid_sync_context(vpid);
        if (enable_ept) {
                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                        return;
                ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+       } else {
+               vpid_sync_context(vpid);
        }
 }
 
@@ -4024,6 +4025,12 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
        __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
 }
 
+static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu)
+{
+       if (enable_ept)
+               vmx_flush_tlb(vcpu);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5285,8 +5292,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        vmx->rmode.vm86_active = 0;
 
-       vmx->soft_vnmi_blocked = 0;
-
        vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
        kvm_set_cr8(vcpu, 0);
 
@@ -5406,8 +5411,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 
 static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
-       if (!cpu_has_virtual_nmis() ||
-           vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+       if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
                enable_irq_window(vcpu);
                return;
        }
@@ -5448,19 +5452,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        if (!is_guest_mode(vcpu)) {
-               if (!cpu_has_virtual_nmis()) {
-                       /*
-                        * Tracking the NMI-blocked state in software is built upon
-                        * finding the next open IRQ window. This, in turn, depends on
-                        * well-behaving guests: They have to keep IRQs disabled at
-                        * least as long as the NMI handler runs. Otherwise we may
-                        * cause NMI nesting, maybe breaking the guest. But as this is
-                        * highly unlikely, we can live with the residual risk.
-                        */
-                       vmx->soft_vnmi_blocked = 1;
-                       vmx->vnmi_blocked_time = 0;
-               }
-
                ++vcpu->stat.nmi_injections;
                vmx->nmi_known_unmasked = false;
        }
@@ -5477,8 +5468,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 
 static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
-       if (!cpu_has_virtual_nmis())
-               return to_vmx(vcpu)->soft_vnmi_blocked;
        if (to_vmx(vcpu)->nmi_known_unmasked)
                return false;
        return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@ -5488,20 +5477,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-       if (!cpu_has_virtual_nmis()) {
-               if (vmx->soft_vnmi_blocked != masked) {
-                       vmx->soft_vnmi_blocked = masked;
-                       vmx->vnmi_blocked_time = 0;
-               }
-       } else {
-               vmx->nmi_known_unmasked = !masked;
-               if (masked)
-                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-                                     GUEST_INTR_STATE_NMI);
-               else
-                       vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-                                       GUEST_INTR_STATE_NMI);
-       }
+       vmx->nmi_known_unmasked = !masked;
+       if (masked)
+               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                             GUEST_INTR_STATE_NMI);
+       else
+               vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+                               GUEST_INTR_STATE_NMI);
 }
 
 static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5509,9 +5491,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
        if (to_vmx(vcpu)->nested.nested_run_pending)
                return 0;
 
-       if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-               return 0;
-
        return  !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
                  (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
                   | GUEST_INTR_STATE_NMI));
@@ -6232,21 +6211,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
        unsigned long exit_qualification;
        gpa_t gpa;
        u32 error_code;
-       int gla_validity;
 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-       gla_validity = (exit_qualification >> 7) & 0x3;
-       if (gla_validity == 0x2) {
-               printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
-               printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
-                       (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
-                       vmcs_readl(GUEST_LINEAR_ADDRESS));
-               printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
-                       (long unsigned int)exit_qualification);
-               vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
-               vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
-               return 0;
+       if (is_guest_mode(vcpu)
+           && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) {
+               /*
+                * Fix up exit_qualification according to whether guest
+                * page table accesses are reads or writes.
+                */
+               u64 eptp = nested_ept_get_cr3(vcpu);
+               if (eptp & VMX_EPT_AD_ENABLE_BIT)
+                       exit_qualification &= ~EPT_VIOLATION_ACC_WRITE;
        }
 
        /*
@@ -6256,7 +6232,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
         * AAK134, BY25.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                       cpu_has_virtual_nmis() &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
 
@@ -6517,8 +6492,10 @@ static __init int hardware_setup(void)
        if (boot_cpu_has(X86_FEATURE_NX))
                kvm_enable_efer_bits(EFER_NX);
 
-       if (!cpu_has_vmx_vpid())
+       if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
+               !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
                enable_vpid = 0;
+
        if (!cpu_has_vmx_shadow_vmcs())
                enable_shadow_vmcs = 0;
        if (enable_shadow_vmcs)
@@ -7805,7 +7782,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
         * "blocked by NMI" bit has to be set before next VM entry.
         */
        if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-                       cpu_has_virtual_nmis() &&
                        (exit_qualification & INTR_INFO_UNBLOCK_NMI))
                vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
                                GUEST_INTR_STATE_NMI);
@@ -8107,6 +8083,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
        case EXIT_REASON_RDPMC:
                return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
+       case EXIT_REASON_RDRAND:
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+       case EXIT_REASON_RDSEED:
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
        case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
                return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
        case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -8477,31 +8457,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
                return 0;
        }
 
-       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
-           !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
-                                       get_vmcs12(vcpu))))) {
-               if (vmx_interrupt_allowed(vcpu)) {
-                       vmx->soft_vnmi_blocked = 0;
-               } else if (vmx->vnmi_blocked_time > 1000000000LL &&
-                          vcpu->arch.nmi_pending) {
-                       /*
-                        * This CPU don't support us in finding the end of an
-                        * NMI-blocked window if the guest runs with IRQs
-                        * disabled. So we pull the trigger after 1 s of
-                        * futile waiting, but inform the user about this.
-                        */
-                       printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
-                              "state on VCPU %d after 1 s timeout\n",
-                              __func__, vcpu->vcpu_id);
-                       vmx->soft_vnmi_blocked = 0;
-               }
-       }
-
        if (exit_reason < kvm_vmx_max_exit_handlers
            && kvm_vmx_exit_handlers[exit_reason])
                return kvm_vmx_exit_handlers[exit_reason](vcpu);
        else {
-               WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason);
+               vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+                               exit_reason);
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 1;
        }
@@ -8547,6 +8508,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
        } else {
                sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
                sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+               vmx_flush_tlb_ept_only(vcpu);
        }
        vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
 
@@ -8572,8 +8534,10 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
         */
        if (!is_guest_mode(vcpu) ||
            !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
-                            SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+                            SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
                vmcs_write64(APIC_ACCESS_ADDR, hpa);
+               vmx_flush_tlb_ept_only(vcpu);
+       }
 }
 
 static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
@@ -8768,37 +8732,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
 
        idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 
-       if (cpu_has_virtual_nmis()) {
-               if (vmx->nmi_known_unmasked)
-                       return;
-               /*
-                * Can't use vmx->exit_intr_info since we're not sure what
-                * the exit reason is.
-                */
-               exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-               unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
-               vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
-               /*
-                * SDM 3: 27.7.1.2 (September 2008)
-                * Re-set bit "block by NMI" before VM entry if vmexit caused by
-                * a guest IRET fault.
-                * SDM 3: 23.2.2 (September 2008)
-                * Bit 12 is undefined in any of the following cases:
-                *  If the VM exit sets the valid bit in the IDT-vectoring
-                *   information field.
-                *  If the VM exit is due to a double fault.
-                */
-               if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
-                   vector != DF_VECTOR && !idtv_info_valid)
-                       vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-                                     GUEST_INTR_STATE_NMI);
-               else
-                       vmx->nmi_known_unmasked =
-                               !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
-                                 & GUEST_INTR_STATE_NMI);
-       } else if (unlikely(vmx->soft_vnmi_blocked))
-               vmx->vnmi_blocked_time +=
-                       ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
+       if (vmx->nmi_known_unmasked)
+               return;
+       /*
+        * Can't use vmx->exit_intr_info since we're not sure what
+        * the exit reason is.
+        */
+       exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+       unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+       vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+       /*
+        * SDM 3: 27.7.1.2 (September 2008)
+        * Re-set bit "block by NMI" before VM entry if vmexit caused by
+        * a guest IRET fault.
+        * SDM 3: 23.2.2 (September 2008)
+        * Bit 12 is undefined in any of the following cases:
+        *  If the VM exit sets the valid bit in the IDT-vectoring
+        *   information field.
+        *  If the VM exit is due to a double fault.
+        */
+       if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+           vector != DF_VECTOR && !idtv_info_valid)
+               vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+                             GUEST_INTR_STATE_NMI);
+       else
+               vmx->nmi_known_unmasked =
+                       !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+                         & GUEST_INTR_STATE_NMI);
 }
 
 static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -8915,10 +8875,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        unsigned long debugctlmsr, cr4;
 
-       /* Record the guest's net vcpu time for enforced NMI injections. */
-       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
-               vmx->entry_time = ktime_get();
-
        /* Don't enter VMX if guest state is invalid, let the exit handler
           start emulation until we arrive back to a valid state */
        if (vmx->emulation_required)
@@ -9126,16 +9082,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
        vmx_complete_interrupts(vmx);
 }
 
-static void vmx_load_vmcs01(struct kvm_vcpu *vcpu)
+static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        int cpu;
 
-       if (vmx->loaded_vmcs == &vmx->vmcs01)
+       if (vmx->loaded_vmcs == vmcs)
                return;
 
        cpu = get_cpu();
-       vmx->loaded_vmcs = &vmx->vmcs01;
+       vmx->loaded_vmcs = vmcs;
        vmx_vcpu_put(vcpu);
        vmx_vcpu_load(vcpu, cpu);
        vcpu->cpu = cpu;
@@ -9153,7 +9109,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu)
 
        r = vcpu_load(vcpu);
        BUG_ON(r);
-       vmx_load_vmcs01(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
        free_nested(vmx);
        vcpu_put(vcpu);
 }
@@ -9478,17 +9434,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
        return get_vmcs12(vcpu)->ept_pointer;
 }
 
-static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
+static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 {
+       u64 eptp;
+
        WARN_ON(mmu_is_nested(vcpu));
+       eptp = nested_ept_get_cr3(vcpu);
+       if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits)
+               return 1;
+
+       kvm_mmu_unload(vcpu);
        kvm_init_shadow_ept_mmu(vcpu,
                        to_vmx(vcpu)->nested.nested_vmx_ept_caps &
-                       VMX_EPT_EXECUTE_ONLY_BIT);
+                       VMX_EPT_EXECUTE_ONLY_BIT,
+                       eptp & VMX_EPT_AD_ENABLE_BIT);
        vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
        vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
        vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
 
        vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+       return 0;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
@@ -9974,7 +9939,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        u32 exec_control;
-       bool nested_ept_enabled = false;
 
        vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
        vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
@@ -10121,8 +10085,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
                                vmcs12->guest_intr_status);
                }
 
-               nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
-
                /*
                 * Write an illegal value to APIC_ACCESS_ADDR. Later,
                 * nested_get_vmcs12_pages will either fix it up or
@@ -10253,8 +10215,13 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        }
 
        if (nested_cpu_has_ept(vmcs12)) {
-               kvm_mmu_unload(vcpu);
-               nested_ept_init_mmu_context(vcpu);
+               if (nested_ept_init_mmu_context(vcpu)) {
+                       *entry_failure_code = ENTRY_FAIL_DEFAULT;
+                       return 1;
+               }
+       } else if (nested_cpu_has2(vmcs12,
+                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+               vmx_flush_tlb_ept_only(vcpu);
        }
 
        /*
@@ -10282,12 +10249,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
        vmx_set_efer(vcpu, vcpu->arch.efer);
 
        /* Shadow page tables on either EPT or shadow page tables. */
-       if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled,
+       if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
                                entry_failure_code))
                return 1;
 
-       kvm_mmu_reset_context(vcpu);
-
        if (!enable_ept)
                vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
 
@@ -10407,7 +10372,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
        struct loaded_vmcs *vmcs02;
-       int cpu;
        u32 msr_entry_idx;
        u32 exit_qual;
 
@@ -10420,18 +10384,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
        if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
                vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
-       cpu = get_cpu();
-       vmx->loaded_vmcs = vmcs02;
-       vmx_vcpu_put(vcpu);
-       vmx_vcpu_load(vcpu, cpu);
-       vcpu->cpu = cpu;
-       put_cpu();
-
+       vmx_switch_vmcs(vcpu, vmcs02);
        vmx_segment_cache_clear(vmx);
 
        if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
                leave_guest_mode(vcpu);
-               vmx_load_vmcs01(vcpu);
+               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
                nested_vmx_entry_failure(vcpu, vmcs12,
                                         EXIT_REASON_INVALID_STATE, exit_qual);
                return 1;
@@ -10444,7 +10402,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
                                            vmcs12->vm_entry_msr_load_count);
        if (msr_entry_idx) {
                leave_guest_mode(vcpu);
-               vmx_load_vmcs01(vcpu);
+               vmx_switch_vmcs(vcpu, &vmx->vmcs01);
                nested_vmx_entry_failure(vcpu, vmcs12,
                                EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx);
                return 1;
@@ -11012,7 +10970,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
        if (unlikely(vmx->fail))
                vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR);
 
-       vmx_load_vmcs01(vcpu);
+       vmx_switch_vmcs(vcpu, &vmx->vmcs01);
 
        if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
            && nested_exit_intr_ack_set(vcpu)) {
@@ -11056,6 +11014,10 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
                vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
                vmx_set_virtual_x2apic_mode(vcpu,
                                vcpu->arch.apic_base & X2APIC_ENABLE);
+       } else if (!nested_cpu_has_ept(vmcs12) &&
+                  nested_cpu_has2(vmcs12,
+                                  SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
+               vmx_flush_tlb_ept_only(vcpu);
        }
 
        /* This is needed for same reason as it was needed in prepare_vmcs02 */
index 1faf620a6fdc206705a03357d3a8ec5814b2c790..6bc47e2712c87dc1a752a7fdca7dea9afe387daf 100644 (file)
@@ -27,7 +27,6 @@
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "cpuid.h"
-#include "assigned-dev.h"
 #include "pmu.h"
 #include "hyperv.h"
 
@@ -2675,10 +2674,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_SET_BOOT_CPU_ID:
        case KVM_CAP_SPLIT_IRQCHIP:
        case KVM_CAP_IMMEDIATE_EXIT:
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-       case KVM_CAP_ASSIGN_DEV_IRQ:
-       case KVM_CAP_PCI_2_3:
-#endif
                r = 1;
                break;
        case KVM_CAP_ADJUST_CLOCK:
@@ -2695,9 +2690,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
                 */
                r = kvm_x86_ops->cpu_has_high_real_mode_segbase();
                break;
-       case KVM_CAP_COALESCED_MMIO:
-               r = KVM_COALESCED_MMIO_PAGE_OFFSET;
-               break;
        case KVM_CAP_VAPIC:
                r = !kvm_x86_ops->cpu_has_accelerated_tpr();
                break;
@@ -2713,11 +2705,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
        case KVM_CAP_PV_MMU:    /* obsolete */
                r = 0;
                break;
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-       case KVM_CAP_IOMMU:
-               r = iommu_present(&pci_bus_type);
-               break;
-#endif
        case KVM_CAP_MCE:
                r = KVM_MAX_MCE_BANKS;
                break;
@@ -3124,7 +3111,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
                return -EINVAL;
 
        if (events->exception.injected &&
-           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
+           (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR ||
+            is_guest_mode(vcpu)))
+               return -EINVAL;
+
+       /* INITs are latched while in SMM */
+       if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
+           (events->smi.smm || events->smi.pending) &&
+           vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
                return -EINVAL;
 
        process_nmi(vcpu);
@@ -4230,7 +4224,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
                break;
        }
        default:
-               r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
+               r = -ENOTTY;
        }
 out:
        return r;
@@ -7355,6 +7349,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
            mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
                return -EINVAL;
 
+       /* INITs are latched while in SMM */
+       if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+           (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
+            mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
+               return -EINVAL;
+
        if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
                vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
                set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
@@ -8068,7 +8068,6 @@ void kvm_arch_sync_events(struct kvm *kvm)
 {
        cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
        cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
-       kvm_free_all_assigned_devices(kvm);
        kvm_free_pit(kvm);
 }
 
@@ -8152,12 +8151,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
        }
        if (kvm_x86_ops->vm_destroy)
                kvm_x86_ops->vm_destroy(kvm);
-       kvm_iommu_unmap_guest(kvm);
-       kfree(kvm->arch.vpic);
-       kfree(kvm->arch.vioapic);
+       kvm_pic_destroy(kvm);
+       kvm_ioapic_destroy(kvm);
        kvm_free_vcpus(kvm);
        kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
        kvm_mmu_uninit_vm(kvm);
+       kvm_page_track_cleanup(kvm);
 }
 
 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
@@ -8566,11 +8565,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
 {
        struct x86_exception fault;
 
-       trace_kvm_async_pf_ready(work->arch.token, work->gva);
        if (work->wakeup_all)
                work->arch.token = ~0; /* broadcast wakeup */
        else
                kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+       trace_kvm_async_pf_ready(work->arch.token, work->gva);
 
        if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
            !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
index 09b4df74291e2618e95e842dc4f5d7bdb178ec93..bb865695d7a62d20fa66800c8ed421dcfa8cd8c2 100644 (file)
@@ -193,10 +193,7 @@ static int __init ptp_kvm_init(void)
 
        kvm_ptp_clock.ptp_clock = ptp_clock_register(&kvm_ptp_clock.caps, NULL);
 
-       if (IS_ERR(kvm_ptp_clock.ptp_clock))
-               return PTR_ERR(kvm_ptp_clock.ptp_clock);
-
-       return 0;
+       return PTR_ERR_OR_ZERO(kvm_ptp_clock.ptp_clock);
 }
 
 module_init(ptp_kvm_init);
index 2c14ad9809da94bde727f3ebc744fabd47673f98..7e74ae4d99bbef5ccb2b0bacc3f80542427c3427 100644 (file)
@@ -162,8 +162,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
                    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                            int len, struct kvm_io_device *dev);
-int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                             struct kvm_io_device *dev);
+void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                              struct kvm_io_device *dev);
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
                                         gpa_t addr);
 
@@ -403,7 +403,7 @@ struct kvm {
        struct kvm_vm_stat stat;
        struct kvm_arch arch;
        refcount_t users_count;
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#ifdef CONFIG_KVM_MMIO
        struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
        spinlock_t ring_lock;
        struct list_head coalesced_zones;
@@ -877,22 +877,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
-#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
-int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
-void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot);
-#else
-static inline int kvm_iommu_map_pages(struct kvm *kvm,
-                                     struct kvm_memory_slot *slot)
-{
-       return 0;
-}
-
-static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
-                                        struct kvm_memory_slot *slot)
-{
-}
-#endif
-
 /*
  * search_memslots() and __gfn_to_memslot() are here because they are
  * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
index 33dd2a4e36dc3f124756f92ef1ce586e8456a3d3..6180ea50e9ef01c62d1817bcb8399a29da3973e5 100644 (file)
@@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt {
 #define KVM_VM_PPC_HV 1
 #define KVM_VM_PPC_PR 2
 
+/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */
+#define KVM_VM_MIPS_TE         0
+#define KVM_VM_MIPS_VZ         1
+
 #define KVM_S390_SIE_PAGE_OFFSET 1
 
 /*
@@ -883,8 +887,11 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_PPC_MMU_RADIX 134
 #define KVM_CAP_PPC_MMU_HASH_V3 135
 #define KVM_CAP_IMMEDIATE_EXIT 136
-#define KVM_CAP_S390_GS 137
-#define KVM_CAP_S390_AIS 138
+#define KVM_CAP_MIPS_VZ 137
+#define KVM_CAP_MIPS_TE 138
+#define KVM_CAP_MIPS_64BIT 139
+#define KVM_CAP_S390_GS 140
+#define KVM_CAP_S390_AIS 141
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
index 581278c5848877ea1a90cfdbf8045afd23298c0c..8f74ed8e72372994213d6cca31e4313edb6a2501 100755 (executable)
@@ -30,8 +30,8 @@ import fcntl
 import resource
 import struct
 import re
+import subprocess
 from collections import defaultdict
-from time import sleep
 
 VMX_EXIT_REASONS = {
     'EXCEPTION_NMI':        0,
@@ -225,6 +225,7 @@ IOCTL_NUMBERS = {
     'RESET':       0x00002403,
 }
 
+
 class Arch(object):
     """Encapsulates global architecture specific data.
 
@@ -255,12 +256,14 @@ class Arch(object):
                     return ArchX86(SVM_EXIT_REASONS)
                 return
 
+
 class ArchX86(Arch):
     def __init__(self, exit_reasons):
         self.sc_perf_evt_open = 298
         self.ioctl_numbers = IOCTL_NUMBERS
         self.exit_reasons = exit_reasons
 
+
 class ArchPPC(Arch):
     def __init__(self):
         self.sc_perf_evt_open = 319
@@ -275,12 +278,14 @@ class ArchPPC(Arch):
         self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16
         self.exit_reasons = {}
 
+
 class ArchA64(Arch):
     def __init__(self):
         self.sc_perf_evt_open = 241
         self.ioctl_numbers = IOCTL_NUMBERS
         self.exit_reasons = AARCH64_EXIT_REASONS
 
+
 class ArchS390(Arch):
     def __init__(self):
         self.sc_perf_evt_open = 331
@@ -316,6 +321,61 @@ def parse_int_list(list_string):
     return integers
 
 
+def get_pid_from_gname(gname):
+    """Fuzzy function to convert guest name to QEMU process pid.
+
+    Returns a list of potential pids, can be empty if no match found.
+    Throws an exception on processing errors.
+
+    """
+    pids = []
+    try:
+        child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'],
+                                 stdout=subprocess.PIPE)
+    except:
+        raise Exception
+    for line in child.stdout:
+        line = line.lstrip().split(' ', 1)
+        # perform a sanity check before calling the more expensive
+        # function to possibly extract the guest name
+        if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]):
+            pids.append(int(line[0]))
+    child.stdout.close()
+
+    return pids
+
+
+def get_gname_from_pid(pid):
+    """Returns the guest name for a QEMU process pid.
+
+    Extracts the guest name from the QEMU comma line by processing the '-name'
+    option. Will also handle names specified out of sequence.
+
+    """
+    name = ''
+    try:
+        line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0')
+        parms = line[line.index('-name') + 1].split(',')
+        while '' in parms:
+            # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results in
+            # ['foo', '', 'bar'], which we revert here
+            idx = parms.index('')
+            parms[idx - 1] += ',' + parms[idx + 1]
+            del parms[idx:idx+2]
+        # the '-name' switch allows for two ways to specify the guest name,
+        # where the plain name overrides the name specified via 'guest='
+        for arg in parms:
+            if '=' not in arg:
+                name = arg
+                break
+            if arg[:6] == 'guest=':
+                name = arg[6:]
+    except (ValueError, IOError, IndexError):
+        pass
+
+    return name
+
+
 def get_online_cpus():
     """Returns a list of cpu id integers."""
     with open('/sys/devices/system/cpu/online') as cpu_list:
@@ -342,6 +402,7 @@ def get_filters():
 libc = ctypes.CDLL('libc.so.6', use_errno=True)
 syscall = libc.syscall
 
+
 class perf_event_attr(ctypes.Structure):
     """Struct that holds the necessary data to set up a trace event.
 
@@ -370,6 +431,7 @@ class perf_event_attr(ctypes.Structure):
         self.size = ctypes.sizeof(self)
         self.read_format = PERF_FORMAT_GROUP
 
+
 def perf_event_open(attr, pid, cpu, group_fd, flags):
     """Wrapper for the sys_perf_evt_open() syscall.
 
@@ -395,6 +457,7 @@ PERF_FORMAT_GROUP = 1 << 3
 PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing'
 PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm'
 
+
 class Group(object):
     """Represents a perf event group."""
 
@@ -427,6 +490,7 @@ class Group(object):
                         struct.unpack(read_format,
                                       os.read(self.events[0].fd, length))))
 
+
 class Event(object):
     """Represents a performance event and manages its life cycle."""
     def __init__(self, name, group, trace_cpu, trace_pid, trace_point,
@@ -510,6 +574,7 @@ class Event(object):
         """Resets the count of the trace event in the kernel."""
         fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0)
 
+
 class TracepointProvider(object):
     """Data provider for the stats class.
 
@@ -551,6 +616,7 @@ class TracepointProvider(object):
     def setup_traces(self):
         """Creates all event and group objects needed to be able to retrieve
         data."""
+        fields = self.get_available_fields()
         if self._pid > 0:
             # Fetch list of all threads of the monitored pid, as qemu
             # starts a thread for each vcpu.
@@ -561,7 +627,7 @@ class TracepointProvider(object):
 
         # The constant is needed as a buffer for python libs, std
         # streams and other files that the script opens.
-        newlim = len(groupids) * len(self._fields) + 50
+        newlim = len(groupids) * len(fields) + 50
         try:
             softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE)
 
@@ -577,7 +643,7 @@ class TracepointProvider(object):
 
         for groupid in groupids:
             group = Group()
-            for name in self._fields:
+            for name in fields:
                 tracepoint = name
                 tracefilter = None
                 match = re.match(r'(.*)\((.*)\)', name)
@@ -650,13 +716,23 @@ class TracepointProvider(object):
                     ret[name] += val
         return ret
 
+    def reset(self):
+        """Reset all field counters"""
+        for group in self.group_leaders:
+            for event in group.events:
+                event.reset()
+
+
 class DebugfsProvider(object):
     """Provides data from the files that KVM creates in the kvm debugfs
     folder."""
     def __init__(self):
         self._fields = self.get_available_fields()
+        self._baseline = {}
         self._pid = 0
         self.do_read = True
+        self.paths = []
+        self.reset()
 
     def get_available_fields(self):
         """"Returns a list of available fields.
@@ -673,6 +749,7 @@ class DebugfsProvider(object):
     @fields.setter
     def fields(self, fields):
         self._fields = fields
+        self.reset()
 
     @property
     def pid(self):
@@ -690,10 +767,11 @@ class DebugfsProvider(object):
             self.paths = filter(lambda x: "{}-".format(pid) in x, vms)
 
         else:
-            self.paths = ['']
+            self.paths = []
             self.do_read = True
+        self.reset()
 
-    def read(self):
+    def read(self, reset=0):
         """Returns a dict with format:'file name / field -> current value'."""
         results = {}
 
@@ -701,10 +779,22 @@ class DebugfsProvider(object):
         if not self.do_read:
             return results
 
-        for path in self.paths:
+        paths = self.paths
+        if self._pid == 0:
+            paths = []
+            for entry in os.walk(PATH_DEBUGFS_KVM):
+                for dir in entry[1]:
+                    paths.append(dir)
+        for path in paths:
             for field in self._fields:
-                results[field] = results.get(field, 0) \
-                                 + self.read_field(field, path)
+                value = self.read_field(field, path)
+                key = path + field
+                if reset:
+                    self._baseline[key] = value
+                if self._baseline.get(key, -1) == -1:
+                    self._baseline[key] = value
+                results[field] = (results.get(field, 0) + value -
+                                  self._baseline.get(key, 0))
 
         return results
 
@@ -718,6 +808,12 @@ class DebugfsProvider(object):
         except IOError:
             return 0
 
+    def reset(self):
+        """Reset field counters"""
+        self._baseline = {}
+        self.read(1)
+
+
 class Stats(object):
     """Manages the data providers and the data they provide.
 
@@ -753,14 +849,20 @@ class Stats(object):
         for provider in self.providers:
             provider.pid = self._pid_filter
 
+    def reset(self):
+        self.values = {}
+        for provider in self.providers:
+            provider.reset()
+
     @property
     def fields_filter(self):
         return self._fields_filter
 
     @fields_filter.setter
     def fields_filter(self, fields_filter):
-        self._fields_filter = fields_filter
-        self.update_provider_filters()
+        if fields_filter != self._fields_filter:
+            self._fields_filter = fields_filter
+            self.update_provider_filters()
 
     @property
     def pid_filter(self):
@@ -768,9 +870,10 @@ class Stats(object):
 
     @pid_filter.setter
     def pid_filter(self, pid):
-        self._pid_filter = pid
-        self.values = {}
-        self.update_provider_pid()
+        if pid != self._pid_filter:
+            self._pid_filter = pid
+            self.values = {}
+            self.update_provider_pid()
 
     def get(self):
         """Returns a dict with field -> (value, delta to last value) of all
@@ -778,23 +881,26 @@ class Stats(object):
         for provider in self.providers:
             new = provider.read()
             for key in provider.fields:
-                oldval = self.values.get(key, (0, 0))
+                oldval = self.values.get(key, (0, 0))[0]
                 newval = new.get(key, 0)
-                newdelta = None
-                if oldval is not None:
-                    newdelta = newval - oldval[0]
+                newdelta = newval - oldval
                 self.values[key] = (newval, newdelta)
         return self.values
 
 LABEL_WIDTH = 40
 NUMBER_WIDTH = 10
+DELAY_INITIAL = 0.25
+DELAY_REGULAR = 3.0
+MAX_GUEST_NAME_LEN = 48
+MAX_REGEX_LEN = 44
+DEFAULT_REGEX = r'^[^\(]*$'
+
 
 class Tui(object):
     """Instruments curses to draw a nice text ui."""
     def __init__(self, stats):
         self.stats = stats
         self.screen = None
-        self.drilldown = False
         self.update_drilldown()
 
     def __enter__(self):
@@ -809,7 +915,14 @@ class Tui(object):
         # return from C start_color() is ignorable.
         try:
             curses.start_color()
-        except:
+        except curses.error:
+            pass
+
+        # Hide cursor in extra statement as some monochrome terminals
+        # might support hiding but not colors.
+        try:
+            curses.curs_set(0)
+        except curses.error:
             pass
 
         curses.use_default_colors()
@@ -827,36 +940,60 @@ class Tui(object):
     def update_drilldown(self):
         """Sets or removes a filter that only allows fields without braces."""
         if not self.stats.fields_filter:
-            self.stats.fields_filter = r'^[^\(]*$'
+            self.stats.fields_filter = DEFAULT_REGEX
 
-        elif self.stats.fields_filter == r'^[^\(]*$':
+        elif self.stats.fields_filter == DEFAULT_REGEX:
             self.stats.fields_filter = None
 
     def update_pid(self, pid):
         """Propagates pid selection to stats object."""
         self.stats.pid_filter = pid
 
-    def refresh(self, sleeptime):
-        """Refreshes on-screen data."""
+    def refresh_header(self, pid=None):
+        """Refreshes the header."""
+        if pid is None:
+            pid = self.stats.pid_filter
         self.screen.erase()
-        if self.stats.pid_filter > 0:
-            self.screen.addstr(0, 0, 'kvm statistics - pid {0}'
-                               .format(self.stats.pid_filter),
-                               curses.A_BOLD)
+        gname = get_gname_from_pid(pid)
+        if gname:
+            gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...'
+                                   if len(gname) > MAX_GUEST_NAME_LEN
+                                   else gname))
+        if pid > 0:
+            self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}'
+                               .format(pid, gname), curses.A_BOLD)
         else:
             self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD)
+        if self.stats.fields_filter and self.stats.fields_filter \
+           != DEFAULT_REGEX:
+            regex = self.stats.fields_filter
+            if len(regex) > MAX_REGEX_LEN:
+                regex = regex[:MAX_REGEX_LEN] + '...'
+            self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex))
         self.screen.addstr(2, 1, 'Event')
         self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH -
                            len('Total'), 'Total')
-        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 -
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 -
+                           len('%Total'), '%Total')
+        self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 -
                            len('Current'), 'Current')
+        self.screen.addstr(4, 1, 'Collecting data...')
+        self.screen.refresh()
+
+    def refresh_body(self, sleeptime):
         row = 3
+        self.screen.move(row, 0)
+        self.screen.clrtobot()
         stats = self.stats.get()
+
         def sortkey(x):
             if stats[x][1]:
                 return (-stats[x][1], -stats[x][0])
             else:
                 return (0, -stats[x][0])
+        total = 0.
+        for val in stats.values():
+            total += val[0]
         for key in sorted(stats.keys(), key=sortkey):
 
             if row >= self.screen.getmaxyx()[0]:
@@ -869,6 +1006,8 @@ class Tui(object):
             col += LABEL_WIDTH
             self.screen.addstr(row, col, '%10d' % (values[0],))
             col += NUMBER_WIDTH
+            self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,))
+            col += 7
             if values[1] is not None:
                 self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,))
             row += 1
@@ -893,20 +1032,24 @@ class Tui(object):
             regex = self.screen.getstr()
             curses.noecho()
             if len(regex) == 0:
+                self.stats.fields_filter = DEFAULT_REGEX
+                self.refresh_header()
                 return
             try:
                 re.compile(regex)
                 self.stats.fields_filter = regex
+                self.refresh_header()
                 return
             except re.error:
                 continue
 
-    def show_vm_selection(self):
+    def show_vm_selection_by_pid(self):
         """Draws PID selection mask.
 
         Asks for a pid until a valid pid or 0 has been entered.
 
         """
+        msg = ''
         while True:
             self.screen.erase()
             self.screen.addstr(0, 0,
@@ -915,6 +1058,7 @@ class Tui(object):
             self.screen.addstr(1, 0,
                                'This might limit the shown data to the trace '
                                'statistics.')
+            self.screen.addstr(5, 0, msg)
 
             curses.echo()
             self.screen.addstr(3, 0, "Pid [0 or pid]: ")
@@ -922,60 +1066,128 @@ class Tui(object):
             curses.noecho()
 
             try:
-                pid = int(pid)
-
-                if pid == 0:
-                    self.update_pid(pid)
-                    break
-                else:
-                    if not os.path.isdir(os.path.join('/proc/', str(pid))):
+                if len(pid) > 0:
+                    pid = int(pid)
+                    if pid != 0 and not os.path.isdir(os.path.join('/proc/',
+                                                                   str(pid))):
+                        msg = '"' + str(pid) + '": Not a running process'
                         continue
-                    else:
-                        self.update_pid(pid)
-                        break
+                else:
+                    pid = 0
+                self.refresh_header(pid)
+                self.update_pid(pid)
+                break
 
             except ValueError:
+                msg = '"' + str(pid) + '": Not a valid pid'
                 continue
 
+    def show_vm_selection_by_guest_name(self):
+        """Draws guest selection mask.
+
+        Asks for a guest name until a valid guest name or '' is entered.
+
+        """
+        msg = ''
+        while True:
+            self.screen.erase()
+            self.screen.addstr(0, 0,
+                               'Show statistics for specific guest.',
+                               curses.A_BOLD)
+            self.screen.addstr(1, 0,
+                               'This might limit the shown data to the trace '
+                               'statistics.')
+            self.screen.addstr(5, 0, msg)
+            curses.echo()
+            self.screen.addstr(3, 0, "Guest [ENTER or guest]: ")
+            gname = self.screen.getstr()
+            curses.noecho()
+
+            if not gname:
+                self.refresh_header(0)
+                self.update_pid(0)
+                break
+            else:
+                pids = []
+                try:
+                    pids = get_pid_from_gname(gname)
+                except:
+                    msg = '"' + gname + '": Internal error while searching, ' \
+                          'use pid filter instead'
+                    continue
+                if len(pids) == 0:
+                    msg = '"' + gname + '": Not an active guest'
+                    continue
+                if len(pids) > 1:
+                    msg = '"' + gname + '": Multiple matches found, use pid ' \
+                          'filter instead'
+                    continue
+                self.refresh_header(pids[0])
+                self.update_pid(pids[0])
+                break
+
     def show_stats(self):
         """Refreshes the screen and processes user input."""
-        sleeptime = 0.25
+        sleeptime = DELAY_INITIAL
+        self.refresh_header()
         while True:
-            self.refresh(sleeptime)
+            self.refresh_body(sleeptime)
             curses.halfdelay(int(sleeptime * 10))
-            sleeptime = 3
+            sleeptime = DELAY_REGULAR
             try:
                 char = self.screen.getkey()
                 if char == 'x':
-                    self.drilldown = not self.drilldown
+                    self.refresh_header()
                     self.update_drilldown()
+                    sleeptime = DELAY_INITIAL
                 if char == 'q':
                     break
+                if char == 'c':
+                    self.stats.fields_filter = DEFAULT_REGEX
+                    self.refresh_header(0)
+                    self.update_pid(0)
+                    sleeptime = DELAY_INITIAL
                 if char == 'f':
                     self.show_filter_selection()
+                    sleeptime = DELAY_INITIAL
+                if char == 'g':
+                    self.show_vm_selection_by_guest_name()
+                    sleeptime = DELAY_INITIAL
                 if char == 'p':
-                    self.show_vm_selection()
+                    self.show_vm_selection_by_pid()
+                    sleeptime = DELAY_INITIAL
+                if char == 'r':
+                    self.refresh_header()
+                    self.stats.reset()
+                    sleeptime = DELAY_INITIAL
             except KeyboardInterrupt:
                 break
             except curses.error:
                 continue
 
+
 def batch(stats):
     """Prints statistics in a key, value format."""
-    s = stats.get()
-    time.sleep(1)
-    s = stats.get()
-    for key in sorted(s.keys()):
-        values = s[key]
-        print '%-42s%10d%10d' % (key, values[0], values[1])
+    try:
+        s = stats.get()
+        time.sleep(1)
+        s = stats.get()
+        for key in sorted(s.keys()):
+            values = s[key]
+            print '%-42s%10d%10d' % (key, values[0], values[1])
+    except KeyboardInterrupt:
+        pass
+
 
 def log(stats):
     """Prints statistics as reiterating key block, multiple value blocks."""
     keys = sorted(stats.get().iterkeys())
+
     def banner():
         for k in keys:
             print '%s' % k,
         print
+
     def statline():
         s = stats.get()
         for k in keys:
@@ -984,11 +1196,15 @@ def log(stats):
     line = 0
     banner_repeat = 20
     while True:
-        time.sleep(1)
-        if line % banner_repeat == 0:
-            banner()
-        statline()
-        line += 1
+        try:
+            time.sleep(1)
+            if line % banner_repeat == 0:
+                banner()
+            statline()
+            line += 1
+        except KeyboardInterrupt:
+            break
+
 
 def get_options():
     """Returns processed program arguments."""
@@ -1009,6 +1225,16 @@ Requirements:
   CAP_SYS_ADMIN and perf events are used.
 - CAP_SYS_RESOURCE if the hard limit is not high enough to allow
   the large number of files that are possibly opened.
+
+Interactive Commands:
+   c     clear filter
+   f     filter by regular expression
+   g     filter by guest name
+   p     filter by PID
+   q     quit
+   x     toggle reporting of stats for individual child trace events
+   r     reset stats
+Press any other key to refresh statistics immediately.
 """
 
     class PlainHelpFormatter(optparse.IndentedHelpFormatter):
@@ -1018,6 +1244,22 @@ Requirements:
             else:
                 return ""
 
+    def cb_guest_to_pid(option, opt, val, parser):
+        try:
+            pids = get_pid_from_gname(val)
+        except:
+            raise optparse.OptionValueError('Error while searching for guest '
+                                            '"{}", use "-p" to specify a pid '
+                                            'instead'.format(val))
+        if len(pids) == 0:
+            raise optparse.OptionValueError('No guest by the name "{}" '
+                                            'found'.format(val))
+        if len(pids) > 1:
+            raise optparse.OptionValueError('Multiple processes found (pids: '
+                                            '{}) - use "-p" to specify a pid '
+                                            'instead'.format(" ".join(pids)))
+        parser.values.pid = pids[0]
+
     optparser = optparse.OptionParser(description=description_text,
                                       formatter=PlainHelpFormatter())
     optparser.add_option('-1', '--once', '--batch',
@@ -1051,15 +1293,24 @@ Requirements:
                          help='fields to display (regex)',
                          )
     optparser.add_option('-p', '--pid',
-                        action='store',
-                        default=0,
-                        type=int,
-                        dest='pid',
-                        help='restrict statistics to pid',
-                        )
+                         action='store',
+                         default=0,
+                         type='int',
+                         dest='pid',
+                         help='restrict statistics to pid',
+                         )
+    optparser.add_option('-g', '--guest',
+                         action='callback',
+                         type='string',
+                         dest='pid',
+                         metavar='GUEST',
+                         help='restrict statistics to guest by name',
+                         callback=cb_guest_to_pid,
+                         )
     (options, _) = optparser.parse_args(sys.argv)
     return options
 
+
 def get_providers(options):
     """Returns a list of data providers depending on the passed options."""
     providers = []
@@ -1073,6 +1324,7 @@ def get_providers(options):
 
     return providers
 
+
 def check_access(options):
     """Exits if the current user can't access all needed directories."""
     if not os.path.exists('/sys/kernel/debug'):
@@ -1086,8 +1338,8 @@ def check_access(options):
                          "Also ensure, that the kvm modules are loaded.\n")
         sys.exit(1)
 
-    if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints
-                                                     or not options.debugfs):
+    if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or
+                                                     not options.debugfs):
         sys.stderr.write("Please enable CONFIG_TRACING in your kernel "
                          "when using the option -t (default).\n"
                          "If it is enabled, make {0} readable by the "
@@ -1098,10 +1350,11 @@ def check_access(options):
 
         sys.stderr.write("Falling back to debugfs statistics!\n")
         options.debugfs = True
-        sleep(5)
+        time.sleep(5)
 
     return options
 
+
 def main():
     options = get_options()
     options = check_access(options)
index b92a153d7115c00bfbcf20b54858f2a1d79477d0..109431bdc63c991fa355d89d8ef7057a285ecca5 100644 (file)
@@ -18,11 +18,33 @@ state transitions such as guest mode entry and exit.
 This tool is useful for observing guest behavior from the host perspective.
 Often conclusions about performance or buggy behavior can be drawn from the
 output.
+While running in regular mode, use any of the keys listed in section
+'Interactive Commands' below.
+Use batch and logging modes for scripting purposes.
 
 The set of KVM kernel module trace events may be specific to the kernel version
 or architecture.  It is best to check the KVM kernel module source code for the
 meaning of events.
 
+INTERACTIVE COMMANDS
+--------------------
+[horizontal]
+*c*::  clear filter
+
+*f*::  filter by regular expression
+
+*g*::  filter by guest name
+
+*p*::  filter by PID
+
+*q*::  quit
+
+*r*::  reset stats
+
+*x*::  toggle reporting of stats for child trace events
+
+Press any other key to refresh statistics immediately.
+
 OPTIONS
 -------
 -1::
@@ -46,6 +68,10 @@ OPTIONS
 --pid=<pid>::
        limit statistics to one virtual machine (pid)
 
+-g<guest>::
+--guest=<guest_name>::
+       limit statistics to one virtual machine (guest name)
+
 -f<fields>::
 --fields=<fields>::
        fields to display (regex)
index a29786dd95221017b141a060b031c5c899dac2e5..4d28a9ddbee01077fea01beeeae5523917822da9 100644 (file)
@@ -870,7 +870,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
                        continue;
 
                kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
-               kvm->buses[bus_idx]->ioeventfd_count--;
+               if (kvm->buses[bus_idx])
+                       kvm->buses[bus_idx]->ioeventfd_count--;
                ioeventfd_release(p);
                ret = 0;
                break;
index a17d78759727f352991a97b4c2bed21266657760..f489167839c48b49aad6982bc3bee4dc858f56c2 100644 (file)
@@ -727,8 +727,11 @@ static void kvm_destroy_vm(struct kvm *kvm)
        list_del(&kvm->vm_list);
        spin_unlock(&kvm_lock);
        kvm_free_irq_routing(kvm);
-       for (i = 0; i < KVM_NR_BUSES; i++)
-               kvm_io_bus_destroy(kvm->buses[i]);
+       for (i = 0; i < KVM_NR_BUSES; i++) {
+               if (kvm->buses[i])
+                       kvm_io_bus_destroy(kvm->buses[i]);
+               kvm->buses[i] = NULL;
+       }
        kvm_coalesced_mmio_free(kvm);
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
        mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
@@ -1016,8 +1019,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
                old_memslots = install_new_memslots(kvm, as_id, slots);
 
-               /* slot was deleted or moved, clear iommu mapping */
-               kvm_iommu_unmap_pages(kvm, &old);
                /* From this point no new shadow pages pointing to a deleted,
                 * or moved, memslot will be created.
                 *
@@ -1052,21 +1053,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
        kvm_free_memslot(kvm, &old, &new);
        kvfree(old_memslots);
-
-       /*
-        * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
-        * un-mapped and re-mapped if their base changes.  Since base change
-        * unmapping is handled above with slot deletion, mapping alone is
-        * needed here.  Anything else the iommu might care about for existing
-        * slots (size changes, userspace addr changes and read-only flag
-        * changes) is disallowed above, so any other attribute changes getting
-        * here can be skipped.
-        */
-       if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
-               r = kvm_iommu_map_pages(kvm, &new);
-               return r;
-       }
-
        return 0;
 
 out_slots:
@@ -2363,7 +2349,7 @@ static int kvm_vcpu_fault(struct vm_fault *vmf)
        else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->arch.pio_data);
 #endif
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#ifdef CONFIG_KVM_MMIO
        else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
                page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
 #endif
@@ -2932,6 +2918,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
        case KVM_CAP_IOEVENTFD_ANY_LENGTH:
        case KVM_CAP_CHECK_EXTENSION_VM:
                return 1;
+#ifdef CONFIG_KVM_MMIO
+       case KVM_CAP_COALESCED_MMIO:
+               return KVM_COALESCED_MMIO_PAGE_OFFSET;
+#endif
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
@@ -2981,7 +2971,7 @@ static long kvm_vm_ioctl(struct file *filp,
                r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
                break;
        }
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#ifdef CONFIG_KVM_MMIO
        case KVM_REGISTER_COALESCED_MMIO: {
                struct kvm_coalesced_mmio_zone zone;
 
@@ -3173,7 +3163,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type)
        kvm = kvm_create_vm(type);
        if (IS_ERR(kvm))
                return PTR_ERR(kvm);
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#ifdef CONFIG_KVM_MMIO
        r = kvm_coalesced_mmio_init(kvm);
        if (r < 0) {
                kvm_put_kvm(kvm);
@@ -3226,7 +3216,7 @@ static long kvm_dev_ioctl(struct file *filp,
 #ifdef CONFIG_X86
                r += PAGE_SIZE;    /* pio data page */
 #endif
-#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
+#ifdef CONFIG_KVM_MMIO
                r += PAGE_SIZE;    /* coalesced mmio ring page */
 #endif
                break;
@@ -3474,6 +3464,8 @@ int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
        };
 
        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       if (!bus)
+               return -ENOMEM;
        r = __kvm_io_bus_write(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
 }
@@ -3491,6 +3483,8 @@ int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
        };
 
        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       if (!bus)
+               return -ENOMEM;
 
        /* First try the device referenced by cookie. */
        if ((cookie >= 0) && (cookie < bus->dev_count) &&
@@ -3541,6 +3535,8 @@ int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
        };
 
        bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+       if (!bus)
+               return -ENOMEM;
        r = __kvm_io_bus_read(vcpu, bus, &range, val);
        return r < 0 ? r : 0;
 }
@@ -3553,6 +3549,9 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
        struct kvm_io_bus *new_bus, *bus;
 
        bus = kvm->buses[bus_idx];
+       if (!bus)
+               return -ENOMEM;
+
        /* exclude ioeventfd which is limited by maximum fd */
        if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
                return -ENOSPC;
@@ -3572,37 +3571,41 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 }
 
 /* Caller must hold slots_lock. */
-int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
-                             struct kvm_io_device *dev)
+void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+                              struct kvm_io_device *dev)
 {
-       int i, r;
+       int i;
        struct kvm_io_bus *new_bus, *bus;
 
        bus = kvm->buses[bus_idx];
-       r = -ENOENT;
+       if (!bus)
+               return;
+
        for (i = 0; i < bus->dev_count; i++)
                if (bus->range[i].dev == dev) {
-                       r = 0;
                        break;
                }
 
-       if (r)
-               return r;
+       if (i == bus->dev_count)
+               return;
 
        new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
                          sizeof(struct kvm_io_range)), GFP_KERNEL);
-       if (!new_bus)
-               return -ENOMEM;
+       if (!new_bus)  {
+               pr_err("kvm: failed to shrink bus, removing it completely\n");
+               goto broken;
+       }
 
        memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
        new_bus->dev_count--;
        memcpy(new_bus->range + i, bus->range + i + 1,
               (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
 
+broken:
        rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
        synchronize_srcu_expedited(&kvm->srcu);
        kfree(bus);
-       return r;
+       return;
 }
 
 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -3615,6 +3618,8 @@ struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
        srcu_idx = srcu_read_lock(&kvm->srcu);
 
        bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+       if (!bus)
+               goto out_unlock;
 
        dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
        if (dev_idx < 0)