Merge tag 'for-linus' of git://git.armlinux.org.uk/~rmk/linux-arm

[linux.git] / virt / kvm / kvm_main.c
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c

index fd68fbe0a75d2f24594c2ab0a89ba7923b12e3eb..00268290dcbd85a75ec2f17cc2caa5431c03208b 100644 (file)
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
  #include <linux/bsearch.h>
  #include <linux/io.h>
  #include <linux/lockdep.h>
+#include <linux/kthread.h>
  
  #include <asm/processor.h>
  #include <asm/ioctl.h>
@@ -121,9 +122,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                   unsigned long arg);
  #define KVM_COMPAT(c)  .compat_ioctl   = (c)
  #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm.
+ * - If the open was done by a 64-bit task and the KVM fd was then
+ *   passed to a compat task, make the ioctls fail.
+ */
  static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                 unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+       return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl,  \
+                       .open           = kvm_no_compat_open
  #endif
  static int hardware_enable_all(void);
  static void hardware_disable_all(void);
@@ -149,10 +163,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
         return 0;
  }
  
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+       /*
+        * The metadata used by is_zone_device_page() to determine whether or
+        * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+        * the device has been pinned, e.g. by get_user_pages().  WARN if the
+        * page_count() is zero to help detect bad usage of this helper.
+        */
+       if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+               return false;
+
+       return is_zone_device_page(pfn_to_page(pfn));
+}
+
  bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
  {
+       /*
+        * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+        * perspective they are "normal" pages, albeit with slightly different
+        * usage rules.
+        */
         if (pfn_valid(pfn))
-               return PageReserved(pfn_to_page(pfn));
+               return PageReserved(pfn_to_page(pfn)) &&
+                      !kvm_is_zone_device_pfn(pfn);
  
         return true;
  }
@@ -625,10 +659,28 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
         return 0;
  }
  
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
  static struct kvm *kvm_create_vm(unsigned long type)
  {
-       int r, i;
         struct kvm *kvm = kvm_arch_alloc_vm();
+       int r = -ENOMEM;
+       int i;
  
         if (!kvm)
                 return ERR_PTR(-ENOMEM);
@@ -640,45 +692,50 @@ static struct kvm *kvm_create_vm(unsigned long type)
         mutex_init(&kvm->lock);
         mutex_init(&kvm->irq_lock);
         mutex_init(&kvm->slots_lock);
-       refcount_set(&kvm->users_count, 1);
         INIT_LIST_HEAD(&kvm->devices);
  
-       r = kvm_arch_init_vm(kvm, type);
-       if (r)
-               goto out_err_no_disable;
-
-       r = hardware_enable_all();
-       if (r)
-               goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
-       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
-
         BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
  
-       r = -ENOMEM;
+       if (init_srcu_struct(&kvm->srcu))
+               goto out_err_no_srcu;
+       if (init_srcu_struct(&kvm->irq_srcu))
+               goto out_err_no_irq_srcu;
+
+       refcount_set(&kvm->users_count, 1);
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                 struct kvm_memslots *slots = kvm_alloc_memslots();
+
                 if (!slots)
-                       goto out_err_no_srcu;
+                       goto out_err_no_arch_destroy_vm;
                 /* Generations must be different for each address space. */
                 slots->generation = i;
                 rcu_assign_pointer(kvm->memslots[i], slots);
         }
  
-       if (init_srcu_struct(&kvm->srcu))
-               goto out_err_no_srcu;
-       if (init_srcu_struct(&kvm->irq_srcu))
-               goto out_err_no_irq_srcu;
         for (i = 0; i < KVM_NR_BUSES; i++) {
                 rcu_assign_pointer(kvm->buses[i],
                         kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                 if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_arch_destroy_vm;
         }
  
+       r = kvm_arch_init_vm(kvm, type);
+       if (r)
+               goto out_err_no_arch_destroy_vm;
+
+       r = hardware_enable_all();
+       if (r)
+               goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
         r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
         if (r)
                 goto out_err;
  
@@ -691,17 +748,24 @@ static struct kvm *kvm_create_vm(unsigned long type)
         return kvm;
  
  out_err:
-       cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-       cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
         hardware_disable_all();
  out_err_no_disable:
-       refcount_set(&kvm->users_count, 0);
+       kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+       WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
         for (i = 0; i < KVM_NR_BUSES; i++)
                 kfree(kvm_get_bus(kvm, i));
         for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                 kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+       cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+       cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
         kvm_arch_free_vm(kvm);
         mmdrop(current->mm);
         return ERR_PTR(r);
@@ -733,6 +797,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
         mutex_lock(&kvm_lock);
         list_del(&kvm->vm_list);
         mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
         kvm_free_irq_routing(kvm);
         for (i = 0; i < KVM_NR_BUSES; i++) {
                 struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -772,6 +838,18 @@ void kvm_put_kvm(struct kvm *kvm)
  }
  EXPORT_SYMBOL_GPL(kvm_put_kvm);
  
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner.  In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+       WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
  
  static int kvm_vm_release(struct inode *inode, struct file *filp)
  {
@@ -1853,7 +1931,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
  
  void kvm_set_pfn_dirty(kvm_pfn_t pfn)
  {
-       if (!kvm_is_reserved_pfn(pfn)) {
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
                 struct page *page = pfn_to_page(pfn);
  
                 SetPageDirty(page);
@@ -1863,7 +1941,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
  
  void kvm_set_pfn_accessed(kvm_pfn_t pfn)
  {
-       if (!kvm_is_reserved_pfn(pfn))
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                 mark_page_accessed(pfn_to_page(pfn));
  }
  EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
@@ -2360,20 +2438,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
         kvm_arch_vcpu_unblocking(vcpu);
         block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
  
-       if (!vcpu_valid_wakeup(vcpu))
-               shrink_halt_poll_ns(vcpu);
-       else if (halt_poll_ns) {
-               if (block_ns <= vcpu->halt_poll_ns)
-                       ;
-               /* we had a long block, shrink polling */
-               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+       if (!kvm_arch_no_poll(vcpu)) {
+               if (!vcpu_valid_wakeup(vcpu)) {
                         shrink_halt_poll_ns(vcpu);
-               /* we had a short halt and our poll time is too small */
-               else if (vcpu->halt_poll_ns < halt_poll_ns &&
-                       block_ns < halt_poll_ns)
-                       grow_halt_poll_ns(vcpu);
-       } else
-               vcpu->halt_poll_ns = 0;
+               } else if (halt_poll_ns) {
+                       if (block_ns <= vcpu->halt_poll_ns)
+                               ;
+                       /* we had a long block, shrink polling */
+                       else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                               shrink_halt_poll_ns(vcpu);
+                       /* we had a short halt and our poll time is too small */
+                       else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                               block_ns < halt_poll_ns)
+                               grow_halt_poll_ns(vcpu);
+               } else {
+                       vcpu->halt_poll_ns = 0;
+               }
+       }
  
         trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
         kvm_arch_vcpu_block_finish(vcpu);
@@ -2670,17 +2751,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                 goto unlock_vcpu_destroy;
         }
  
-       BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+       vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+       BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
  
         /* Now it's all set up, let userspace reach it */
         kvm_get_kvm(kvm);
         r = create_vcpu_fd(vcpu);
         if (r < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                 goto unlock_vcpu_destroy;
         }
  
-       kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+       kvm->vcpus[vcpu->vcpu_idx] = vcpu;
  
         /*
          * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
@@ -3046,14 +3128,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
         return filp->private_data;
  }
  
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
  #ifdef CONFIG_KVM_MPIC
         [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
         [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
  #endif
  };
  
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
  {
         if (type >= ARRAY_SIZE(kvm_device_ops_table))
                 return -ENOSPC;
@@ -3074,7 +3156,7 @@ void kvm_unregister_device_ops(u32 type)
  static int kvm_ioctl_create_device(struct kvm *kvm,
                                    struct kvm_create_device *cd)
  {
-       struct kvm_device_ops *ops = NULL;
+       const struct kvm_device_ops *ops = NULL;
         struct kvm_device *dev;
         bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
         int type;
@@ -3114,7 +3196,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
         kvm_get_kvm(kvm);
         ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
         if (ret < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                 mutex_lock(&kvm->lock);
                 list_del(&dev->vm_node);
                 mutex_unlock(&kvm->lock);
@@ -4272,12 +4354,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
  
         r = kvm_arch_hardware_setup();
         if (r < 0)
-               goto out_free_0a;
+               goto out_free_1;
  
         for_each_online_cpu(cpu) {
                 smp_call_function_single(cpu, check_processor_compat, &r, 1);
                 if (r < 0)
-                       goto out_free_1;
+                       goto out_free_2;
         }
  
         r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
@@ -4334,9 +4416,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
         unregister_reboot_notifier(&kvm_reboot_notifier);
         cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
  out_free_2:
-out_free_1:
         kvm_arch_hardware_unsetup();
-out_free_0a:
+out_free_1:
         free_cpumask_var(cpus_hardware_enabled);
  out_free_0:
         kvm_irqfd_exit();
@@ -4364,3 +4445,86 @@ void kvm_exit(void)
         kvm_vfio_ops_exit();
  }
  EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+       struct kvm *kvm;
+       struct task_struct *parent;
+       struct completion init_done;
+       kvm_vm_thread_fn_t thread_fn;
+       uintptr_t data;
+       int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+       /*
+        * The init_context lives on the stack of the parent thread, which may
+        * return once init_done completes, so copy what is needed afterwards.
+        */
+       struct kvm_vm_worker_thread_context *init_context = context;
+       struct kvm *kvm = init_context->kvm;
+       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+       uintptr_t data = init_context->data;
+       int err;
+
+       err = kthread_park(current);
+       /* kthread_park(current) is never supposed to return an error */
+       WARN_ON(err != 0);
+       if (err)
+               goto init_complete;
+
+       err = cgroup_attach_task_all(init_context->parent, current);
+       if (err) {
+               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                       __func__, err);
+               goto init_complete;
+       }
+
+       set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+       init_context->err = err;
+       complete(&init_context->init_done);
+       init_context = NULL;
+
+       if (err)
+               return err;
+
+       /* Wait to be woken up by the spawner before proceeding. */
+       kthread_parkme();
+
+       if (!kthread_should_stop())
+               err = thread_fn(kvm, data);
+
+       return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                               uintptr_t data, const char *name,
+                               struct task_struct **thread_ptr)
+{
+       struct kvm_vm_worker_thread_context init_context = {};
+       struct task_struct *thread;
+
+       *thread_ptr = NULL;
+       init_context.kvm = kvm;
+       init_context.parent = current;
+       init_context.thread_fn = thread_fn;
+       init_context.data = data;
+       init_completion(&init_context.init_done);
+
+       thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                            "%s-%d", name, task_pid_nr(current));
+       if (IS_ERR(thread))
+               return PTR_ERR(thread);
+
+       /* kthread_run is never supposed to return NULL */
+       WARN_ON(thread == NULL);
+
+       wait_for_completion(&init_context.init_done);
+
+       if (!init_context.err)
+               *thread_ptr = thread;
+
+       return init_context.err;
+}