]> asedeno.scripts.mit.edu Git - linux.git/blobdiff - virt/kvm/kvm_main.c
Merge tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm
[linux.git] / virt / kvm / kvm_main.c
index fd68fbe0a75d2f24594c2ab0a89ba7923b12e3eb..00268290dcbd85a75ec2f17cc2caa5431c03208b 100644 (file)
@@ -50,6 +50,7 @@
 #include <linux/bsearch.h>
 #include <linux/io.h>
 #include <linux/lockdep.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg);
 #define KVM_COMPAT(c)  .compat_ioctl   = (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+       return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl,  \
+                       .open           = kvm_no_compat_open
 #endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
@@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
        return 0;
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+       /*
+        * The metadata used by is_zone_device_page() to determine whether or
+        * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+        * the device has been pinned, e.g. by get_user_pages().  WARN if the
+        * page_count() is zero to help detect bad usage of this helper.
+        */
+       if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+               return false;
+
+       return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+       /*
+        * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+        * perspective they are "normal" pages, albeit with slightly different
+        * usage rules.
+        */
        if (pfn_valid(pfn))
-               return PageReserved(pfn_to_page(pfn));
+               return PageReserved(pfn_to_page(pfn)) &&
+                      !kvm_is_zone_device_pfn(pfn);
 
        return true;
 }
@@ -625,10 +659,28 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
        return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
-       int r, i;
        struct kvm *kvm = kvm_arch_alloc_vm();
+       int r = -ENOMEM;
+       int i;
 
        if (!kvm)
                return ERR_PTR(-ENOMEM);
@@ -640,45 +692,50 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
-       refcount_set(&kvm->users_count, 1);
        INIT_LIST_HEAD(&kvm->devices);
 
-       r = kvm_arch_init_vm(kvm, type);
-       if (r)
-               goto out_err_no_disable;
-
-       r = hardware_enable_all();
-       if (r)
-               goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
-       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
-
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
-       r = -ENOMEM;
+       if (init_srcu_struct(&kvm->srcu))
+               goto out_err_no_srcu;
+       if (init_srcu_struct(&kvm->irq_srcu))
+               goto out_err_no_irq_srcu;
+
+       refcount_set(&kvm->users_count, 1);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_memslots *slots = kvm_alloc_memslots();
+
                if (!slots)
-                       goto out_err_no_srcu;
+                       goto out_err_no_arch_destroy_vm;
                /* Generations must be different for each address space. */
                slots->generation = i;
                rcu_assign_pointer(kvm->memslots[i], slots);
        }
 
-       if (init_srcu_struct(&kvm->srcu))
-               goto out_err_no_srcu;
-       if (init_srcu_struct(&kvm->irq_srcu))
-               goto out_err_no_irq_srcu;
        for (i = 0; i < KVM_NR_BUSES; i++) {
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_arch_destroy_vm;
        }
 
+       r = kvm_arch_init_vm(kvm, type);
+       if (r)
+               goto out_err_no_arch_destroy_vm;
+
+       r = hardware_enable_all();
+       if (r)
+               goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
        r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;
 
@@ -691,17 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type)
        return kvm;
 
 out_err:
-       cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-       cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
        hardware_disable_all();
 out_err_no_disable:
-       refcount_set(&kvm->users_count, 0);
+       kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+       WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm_get_bus(kvm, i));
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+       cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+       cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
        kvm_arch_free_vm(kvm);
        mmdrop(current->mm);
        return ERR_PTR(r);
@@ -733,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -772,6 +838,18 @@ void kvm_put_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);
 
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner.  In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+       WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
 
 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
@@ -1853,7 +1931,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn)) {
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                struct page *page = pfn_to_page(pfn);
 
                SetPageDirty(page);
@@ -1863,7 +1941,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn))
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -2360,20 +2438,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        kvm_arch_vcpu_unblocking(vcpu);
        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
-       if (!vcpu_valid_wakeup(vcpu))
-               shrink_halt_poll_ns(vcpu);
-       else if (halt_poll_ns) {
-               if (block_ns <= vcpu->halt_poll_ns)
-                       ;
-               /* we had a long block, shrink polling */
-               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+       if (!kvm_arch_no_poll(vcpu)) {
+               if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
-               /* we had a short halt and our poll time is too small */
-               else if (vcpu->halt_poll_ns < halt_poll_ns &&
-                       block_ns < halt_poll_ns)
-                       grow_halt_poll_ns(vcpu);
-       } else
-               vcpu->halt_poll_ns = 0;
+               } else if (halt_poll_ns) {
+                       if (block_ns <= vcpu->halt_poll_ns)
+                               ;
+                       /* we had a long block, shrink polling */
+                       else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                               shrink_halt_poll_ns(vcpu);
+                       /* we had a short halt and our poll time is too small */
+                       else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                               block_ns < halt_poll_ns)
+                               grow_halt_poll_ns(vcpu);
+               } else {
+                       vcpu->halt_poll_ns = 0;
+               }
+       }
 
        trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
        kvm_arch_vcpu_block_finish(vcpu);
@@ -2670,17 +2751,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                goto unlock_vcpu_destroy;
        }
 
-       BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+       vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+       BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
 
        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                goto unlock_vcpu_destroy;
        }
 
-       kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+       kvm->vcpus[vcpu->vcpu_idx] = vcpu;
 
        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
@@ -3046,14 +3128,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
        return filp->private_data;
 }
 
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
 #ifdef CONFIG_KVM_MPIC
        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
 #endif
 };
 
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
 {
        if (type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENOSPC;
@@ -3074,7 +3156,7 @@ void kvm_unregister_device_ops(u32 type)
 static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
 {
-       struct kvm_device_ops *ops = NULL;
+       const struct kvm_device_ops *ops = NULL;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int type;
@@ -3114,7 +3196,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        kvm_get_kvm(kvm);
        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
        if (ret < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del(&dev->vm_node);
                mutex_unlock(&kvm->lock);
@@ -4272,12 +4354,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
        r = kvm_arch_hardware_setup();
        if (r < 0)
-               goto out_free_0a;
+               goto out_free_1;
 
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu, check_processor_compat, &r, 1);
                if (r < 0)
-                       goto out_free_1;
+                       goto out_free_2;
        }
 
        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
@@ -4334,9 +4416,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
 out_free_2:
-out_free_1:
        kvm_arch_hardware_unsetup();
-out_free_0a:
+out_free_1:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        kvm_irqfd_exit();
@@ -4364,3 +4445,86 @@ void kvm_exit(void)
        kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+       struct kvm *kvm;
+       struct task_struct *parent;
+       struct completion init_done;
+       kvm_vm_thread_fn_t thread_fn;
+       uintptr_t data;
+       int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+       /*
+        * The init_context is allocated on the stack of the parent thread, so
+        * we have to locally copy anything that is needed beyond initialization
+        */
+       struct kvm_vm_worker_thread_context *init_context = context;
+       struct kvm *kvm = init_context->kvm;
+       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+       uintptr_t data = init_context->data;
+       int err;
+
+       err = kthread_park(current);
+       /* kthread_park(current) is never supposed to return an error */
+       WARN_ON(err != 0);
+       if (err)
+               goto init_complete;
+
+       err = cgroup_attach_task_all(init_context->parent, current);
+       if (err) {
+               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                       __func__, err);
+               goto init_complete;
+       }
+
+       set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+       init_context->err = err;
+       complete(&init_context->init_done);
+       init_context = NULL;
+
+       if (err)
+               return err;
+
+       /* Wait to be woken up by the spawner before proceeding. */
+       kthread_parkme();
+
+       if (!kthread_should_stop())
+               err = thread_fn(kvm, data);
+
+       return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                               uintptr_t data, const char *name,
+                               struct task_struct **thread_ptr)
+{
+       struct kvm_vm_worker_thread_context init_context = {};
+       struct task_struct *thread;
+
+       *thread_ptr = NULL;
+       init_context.kvm = kvm;
+       init_context.parent = current;
+       init_context.thread_fn = thread_fn;
+       init_context.data = data;
+       init_completion(&init_context.init_done);
+
+       thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                            "%s-%d", name, task_pid_nr(current));
+       if (IS_ERR(thread))
+               return PTR_ERR(thread);
+
+       /* kthread_run is never supposed to return NULL */
+       WARN_ON(thread == NULL);
+
+       wait_for_completion(&init_context.init_done);
+
+       if (!init_context.err)
+               *thread_ptr = thread;
+
+       return init_context.err;
+}