KVM: Move initialization of preempt notifier to kvm_vcpu_init()
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fd68fbe0a75d2f24594c2ab0a89ba7923b12e3eb..1ddb6d4cfbfd0bc663568015b63e2406c7e9818b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -50,6 +50,7 @@
 #include <linux/bsearch.h>
 #include <linux/io.h>
 #include <linux/lockdep.h>
+#include <linux/kthread.h>
 
 #include <asm/processor.h>
 #include <asm/ioctl.h>
@@ -103,8 +104,7 @@ static cpumask_var_t cpus_hardware_enabled;
 static int kvm_usage_count;
 static atomic_t hardware_enable_failed;
 
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static struct kmem_cache *kvm_vcpu_cache;
 
 static __read_mostly struct preempt_ops kvm_preempt_ops;
 
@@ -112,7 +112,7 @@ struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
 static int kvm_debugfs_num_entries;
-static const struct file_operations *stat_fops_per_vm[];
+static const struct file_operations stat_fops_per_vm;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
@@ -121,9 +121,22 @@ static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
                                  unsigned long arg);
 #define KVM_COMPAT(c)  .compat_ioctl   = (c)
 #else
+/*
+ * For architectures that don't implement a compat infrastructure,
+ * adopt a double line of defense:
+ * - Prevent a compat task from opening /dev/kvm
+ * - If the open has been done by a 64bit task, and the KVM fd
+ *   passed to a compat task, let the ioctls fail.
+ */
 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
                                unsigned long arg) { return -EINVAL; }
-#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl
+
+static int kvm_no_compat_open(struct inode *inode, struct file *file)
+{
+       return is_compat_task() ? -ENODEV : 0;
+}
+#define KVM_COMPAT(c)  .compat_ioctl   = kvm_no_compat_ioctl,  \
+                       .open           = kvm_no_compat_open
 #endif
 static int hardware_enable_all(void);
 static void hardware_disable_all(void);
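
For context, KVM_COMPAT() is expanded inside the file_operations initializers
used in this file (including the /dev/kvm chardev).  A minimal sketch of what
the !CONFIG_KVM_COMPAT expansion now produces; example_fops and example_ioctl
are hypothetical names, not part of this patch:

static long example_ioctl(struct file *filp, unsigned int ioctl,
			  unsigned long arg);	/* definition omitted */

/*
 * With CONFIG_KVM_COMPAT=n, KVM_COMPAT(example_ioctl) now supplies both a
 * .compat_ioctl that fails with -EINVAL and an .open that refuses compat
 * tasks with -ENODEV, i.e. the two lines of defense described above.
 */
static const struct file_operations example_fops = {
	.unlocked_ioctl	= example_ioctl,	/* native path, unaffected */
	.llseek		= noop_llseek,
	KVM_COMPAT(example_ioctl),
};
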
@@ -149,10 +162,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
        return 0;
 }
 
+bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
+{
+       /*
+        * The metadata used by is_zone_device_page() to determine whether or
+        * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
+        * the page has been pinned, e.g. by get_user_pages().  WARN if the
+        * page_count() is zero to help detect bad usage of this helper.
+        */
+       if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
+               return false;
+
+       return is_zone_device_page(pfn_to_page(pfn));
+}
+
 bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
 {
+       /*
+        * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
+        * perspective they are "normal" pages, albeit with slightly different
+        * usage rules.
+        */
        if (pfn_valid(pfn))
-               return PageReserved(pfn_to_page(pfn));
+               return PageReserved(pfn_to_page(pfn)) &&
+                      !kvm_is_zone_device_pfn(pfn);
 
        return true;
 }
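
Later hunks rely on combining these two predicates: a pfn is only treated as
refcounted, LRU-managed RAM when it is neither reserved nor ZONE_DEVICE.  A
condensed illustration (the helper name is hypothetical):

/*
 * Sketch only: ZONE_DEVICE pages are no longer "reserved" for refcounting
 * purposes, but they still must not be fed to SetPageDirty() or
 * mark_page_accessed(), which is exactly the check the kvm_set_pfn_*()
 * hunks below add.
 */
static bool example_pfn_is_lru_ram(kvm_pfn_t pfn)
{
	return !kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn);
}
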
@@ -288,7 +321,7 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+static int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
        struct page *page;
        int r;
@@ -315,6 +348,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
        vcpu->ready = false;
+       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 
        r = kvm_arch_vcpu_init(vcpu);
        if (r < 0)
@@ -326,9 +360,8 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 fail:
        return r;
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
        /*
         * no need for rcu_read_lock as VCPU_RUN is the only place that
@@ -339,7 +372,15 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
        kvm_arch_vcpu_uninit(vcpu);
        free_page((unsigned long)vcpu->run);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+       kvm_arch_vcpu_destroy(vcpu);
+
+       kvm_vcpu_uninit(vcpu);
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
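
With kvm_vcpu_init()/kvm_vcpu_uninit() now static, kvm_vcpu_destroy() becomes
the one exported teardown path that architecture code calls per vCPU.  A
hedged sketch of the expected caller, modeled on the per-architecture vCPU
free loops (the function name is illustrative, not part of this patch):

static void example_free_vcpus(struct kvm *kvm)
{
	struct kvm_vcpu *vcpu;
	int i;

	kvm_for_each_vcpu(i, vcpu, kvm)
		/* arch destroy, then common uninit, then kmem_cache_free() */
		kvm_vcpu_destroy(vcpu);

	mutex_lock(&kvm->lock);
	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
		kvm->vcpus[i] = NULL;
	atomic_set(&kvm->online_vcpus, 0);
	mutex_unlock(&kvm->lock);
}
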
@@ -616,19 +657,37 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                        return -ENOMEM;
 
                stat_data->kvm = kvm;
-               stat_data->offset = p->offset;
-               stat_data->mode = p->mode ? p->mode : 0644;
+               stat_data->dbgfs_item = p;
                kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-               debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
-                                   stat_data, stat_fops_per_vm[p->kind]);
+               debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                   kvm->debugfs_dentry, stat_data,
+                                   &stat_fops_per_vm);
        }
        return 0;
 }
 
+/*
+ * Called after the VM is otherwise initialized, but just before adding it to
+ * the vm_list.
+ */
+int __weak kvm_arch_post_init_vm(struct kvm *kvm)
+{
+       return 0;
+}
+
+/*
+ * Called just after removing the VM from the vm_list, but before doing any
+ * other destruction.
+ */
+void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
+{
+}
+
 static struct kvm *kvm_create_vm(unsigned long type)
 {
-       int r, i;
        struct kvm *kvm = kvm_arch_alloc_vm();
+       int r = -ENOMEM;
+       int i;
 
        if (!kvm)
                return ERR_PTR(-ENOMEM);
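
KVM_DBGFS_GET_MODE() used in the debugfs_create_file() call above is
introduced in include/linux/kvm_host.h by the same series; reproduced here
from memory as context, so treat it as an approximation: entries that leave
their mode field at 0 fall back to 0644.

#define KVM_DBGFS_GET_MODE(dbgfs_item)                                        \
	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
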
@@ -640,45 +699,50 @@ static struct kvm *kvm_create_vm(unsigned long type)
        mutex_init(&kvm->lock);
        mutex_init(&kvm->irq_lock);
        mutex_init(&kvm->slots_lock);
-       refcount_set(&kvm->users_count, 1);
        INIT_LIST_HEAD(&kvm->devices);
 
-       r = kvm_arch_init_vm(kvm, type);
-       if (r)
-               goto out_err_no_disable;
-
-       r = hardware_enable_all();
-       if (r)
-               goto out_err_no_disable;
-
-#ifdef CONFIG_HAVE_KVM_IRQFD
-       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
-#endif
-
        BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
 
-       r = -ENOMEM;
+       if (init_srcu_struct(&kvm->srcu))
+               goto out_err_no_srcu;
+       if (init_srcu_struct(&kvm->irq_srcu))
+               goto out_err_no_irq_srcu;
+
+       refcount_set(&kvm->users_count, 1);
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
                struct kvm_memslots *slots = kvm_alloc_memslots();
+
                if (!slots)
-                       goto out_err_no_srcu;
+                       goto out_err_no_arch_destroy_vm;
                /* Generations must be different for each address space. */
                slots->generation = i;
                rcu_assign_pointer(kvm->memslots[i], slots);
        }
 
-       if (init_srcu_struct(&kvm->srcu))
-               goto out_err_no_srcu;
-       if (init_srcu_struct(&kvm->irq_srcu))
-               goto out_err_no_irq_srcu;
        for (i = 0; i < KVM_NR_BUSES; i++) {
                rcu_assign_pointer(kvm->buses[i],
                        kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
                if (!kvm->buses[i])
-                       goto out_err;
+                       goto out_err_no_arch_destroy_vm;
        }
 
+       r = kvm_arch_init_vm(kvm, type);
+       if (r)
+               goto out_err_no_arch_destroy_vm;
+
+       r = hardware_enable_all();
+       if (r)
+               goto out_err_no_disable;
+
+#ifdef CONFIG_HAVE_KVM_IRQFD
+       INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
+#endif
+
        r = kvm_init_mmu_notifier(kvm);
+       if (r)
+               goto out_err_no_mmu_notifier;
+
+       r = kvm_arch_post_init_vm(kvm);
        if (r)
                goto out_err;
 
@@ -691,17 +755,24 @@ static struct kvm *kvm_create_vm(unsigned long type)
        return kvm;
 
 out_err:
-       cleanup_srcu_struct(&kvm->irq_srcu);
-out_err_no_irq_srcu:
-       cleanup_srcu_struct(&kvm->srcu);
-out_err_no_srcu:
+#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
+       if (kvm->mmu_notifier.ops)
+               mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
+#endif
+out_err_no_mmu_notifier:
        hardware_disable_all();
 out_err_no_disable:
-       refcount_set(&kvm->users_count, 0);
+       kvm_arch_destroy_vm(kvm);
+out_err_no_arch_destroy_vm:
+       WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
        for (i = 0; i < KVM_NR_BUSES; i++)
                kfree(kvm_get_bus(kvm, i));
        for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
                kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
+       cleanup_srcu_struct(&kvm->irq_srcu);
+out_err_no_irq_srcu:
+       cleanup_srcu_struct(&kvm->srcu);
+out_err_no_srcu:
        kvm_arch_free_vm(kvm);
        mmdrop(current->mm);
        return ERR_PTR(r);
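
The reshuffled error path above follows the usual kernel goto-unwind
convention: each out_err_no_X label sits just below the teardown of X, so
jumping to it skips undoing X while still undoing everything initialised
earlier.  A generic sketch with hypothetical init_*()/undo_*() helpers:

static int example_init(void)
{
	int r;

	r = init_a();
	if (r)
		goto out_err_no_a;	/* nothing to undo yet */

	r = init_b();
	if (r)
		goto out_err_no_b;	/* undo A only */

	r = init_c();
	if (r)
		goto out_err_no_c;	/* undo B, then A */

	return 0;

out_err_no_c:
	undo_b();
out_err_no_b:
	undo_a();
out_err_no_a:
	return r;
}
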
@@ -733,6 +804,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
        mutex_lock(&kvm_lock);
        list_del(&kvm->vm_list);
        mutex_unlock(&kvm_lock);
+       kvm_arch_pre_destroy_vm(kvm);
+
        kvm_free_irq_routing(kvm);
        for (i = 0; i < KVM_NR_BUSES; i++) {
                struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
@@ -772,6 +845,18 @@ void kvm_put_kvm(struct kvm *kvm)
 }
 EXPORT_SYMBOL_GPL(kvm_put_kvm);
 
+/*
+ * Used to put a reference that was taken on behalf of an object associated
+ * with a user-visible file descriptor, e.g. a vcpu or device, if installation
+ * of the new file descriptor fails and the reference cannot be transferred to
+ * its final owner.  In such cases, the caller is still actively using @kvm and
+ * will fail miserably if the refcount unexpectedly hits zero.
+ */
+void kvm_put_kvm_no_destroy(struct kvm *kvm)
+{
+       WARN_ON(refcount_dec_and_test(&kvm->users_count));
+}
+EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
 
 static int kvm_vm_release(struct inode *inode, struct file *filp)
 {
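
The new helper is aimed at the "install a new fd" pattern used later in this
patch for vCPUs and devices.  An illustrative caller; example_create_obj_fd
and example_obj_fops are hypothetical:

static int example_create_obj_fd(struct kvm *kvm, void *obj)
{
	int fd;

	kvm_get_kvm(kvm);	/* reference to be owned by the new fd */
	fd = anon_inode_getfd("kvm-example", &example_obj_fops, obj,
			      O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		/*
		 * The fd never materialised, so hand back its reference.
		 * The caller still holds and uses its own reference, so the
		 * count must not reach zero here, hence the _no_destroy
		 * variant, which WARNs if it would.
		 */
		kvm_put_kvm_no_destroy(kvm);
	}

	return fd;
}
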
@@ -886,7 +971,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 
        /*
         * Increment the new memslot generation a second time, dropping the
-        * update in-progress flag and incrementing then generation based on
+        * update in-progress flag and incrementing the generation based on
         * the number of address spaces.  This provides a unique and easily
         * identifiable generation number while the memslots are in flux.
         */
@@ -1039,7 +1124,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                 *
                 * validation of sp->gfn happens in:
                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                *      - kvm_is_visible_gfn (mmu_check_roots)
+                *      - kvm_is_visible_gfn (mmu_check_root)
                 */
                kvm_arch_flush_shadow_memslot(kvm, slot);
 
@@ -1441,7 +1526,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned.  It's also the
- * only part that runs if we can are in atomic context.
+ * only part that runs if we can in atomic context.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
                            bool *writable, kvm_pfn_t *pfn)
@@ -1853,17 +1938,14 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn)) {
-               struct page *page = pfn_to_page(pfn);
-
-               SetPageDirty(page);
-       }
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
+               SetPageDirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
 void kvm_set_pfn_accessed(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn))
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
                mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
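
With reserved and ZONE_DEVICE pfns filtered inside the helpers, callers can
keep the usual touch-and-release pattern unchanged.  An illustrative
(hypothetical) caller, using the existing kvm_release_pfn_clean() helper:

static void example_finish_guest_write(kvm_pfn_t pfn)
{
	/* Both calls quietly skip reserved and ZONE_DEVICE pfns. */
	kvm_set_pfn_accessed(pfn);
	kvm_set_pfn_dirty(pfn);
	/* Drop the reference taken when the pfn was looked up. */
	kvm_release_pfn_clean(pfn);
}
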
@@ -2360,20 +2442,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
        kvm_arch_vcpu_unblocking(vcpu);
        block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
-       if (!vcpu_valid_wakeup(vcpu))
-               shrink_halt_poll_ns(vcpu);
-       else if (halt_poll_ns) {
-               if (block_ns <= vcpu->halt_poll_ns)
-                       ;
-               /* we had a long block, shrink polling */
-               else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+       if (!kvm_arch_no_poll(vcpu)) {
+               if (!vcpu_valid_wakeup(vcpu)) {
                        shrink_halt_poll_ns(vcpu);
-               /* we had a short halt and our poll time is too small */
-               else if (vcpu->halt_poll_ns < halt_poll_ns &&
-                       block_ns < halt_poll_ns)
-                       grow_halt_poll_ns(vcpu);
-       } else
-               vcpu->halt_poll_ns = 0;
+               } else if (halt_poll_ns) {
+                       if (block_ns <= vcpu->halt_poll_ns)
+                               ;
+                       /* we had a long block, shrink polling */
+                       else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+                               shrink_halt_poll_ns(vcpu);
+                       /* we had a short halt and our poll time is too small */
+                       else if (vcpu->halt_poll_ns < halt_poll_ns &&
+                               block_ns < halt_poll_ns)
+                               grow_halt_poll_ns(vcpu);
+               } else {
+                       vcpu->halt_poll_ns = 0;
+               }
+       }
 
        trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
        kvm_arch_vcpu_block_finish(vcpu);
@@ -2650,13 +2735,23 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);
 
-       vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu)) {
-               r = PTR_ERR(vcpu);
+       r = kvm_arch_vcpu_precreate(kvm, id);
+       if (r)
+               goto vcpu_decrement;
+
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       if (!vcpu) {
+               r = -ENOMEM;
                goto vcpu_decrement;
        }
 
-       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+       r = kvm_vcpu_init(vcpu, kvm, id);
+       if (r)
+               goto vcpu_free;
+
+       r = kvm_arch_vcpu_create(vcpu);
+       if (r)
+               goto vcpu_uninit;
 
        r = kvm_arch_vcpu_setup(vcpu);
        if (r)
@@ -2670,17 +2765,18 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
                goto unlock_vcpu_destroy;
        }
 
-       BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);
+       vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus);
+       BUG_ON(kvm->vcpus[vcpu->vcpu_idx]);
 
        /* Now it's all set up, let userspace reach it */
        kvm_get_kvm(kvm);
        r = create_vcpu_fd(vcpu);
        if (r < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                goto unlock_vcpu_destroy;
        }
 
-       kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
+       kvm->vcpus[vcpu->vcpu_idx] = vcpu;
 
        /*
         * Pairs with smp_rmb() in kvm_get_vcpu.  Write kvm->vcpus
@@ -2698,6 +2794,10 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        debugfs_remove_recursive(vcpu->debugfs_dentry);
 vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+vcpu_uninit:
+       kvm_vcpu_uninit(vcpu);
+vcpu_free:
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 vcpu_decrement:
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
@@ -3046,14 +3146,14 @@ struct kvm_device *kvm_device_from_filp(struct file *filp)
        return filp->private_data;
 }
 
-static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
+static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
 #ifdef CONFIG_KVM_MPIC
        [KVM_DEV_TYPE_FSL_MPIC_20]      = &kvm_mpic_ops,
        [KVM_DEV_TYPE_FSL_MPIC_42]      = &kvm_mpic_ops,
 #endif
 };
 
-int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
+int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
 {
        if (type >= ARRAY_SIZE(kvm_device_ops_table))
                return -ENOSPC;
@@ -3074,7 +3174,7 @@ void kvm_unregister_device_ops(u32 type)
 static int kvm_ioctl_create_device(struct kvm *kvm,
                                   struct kvm_create_device *cd)
 {
-       struct kvm_device_ops *ops = NULL;
+       const struct kvm_device_ops *ops = NULL;
        struct kvm_device *dev;
        bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
        int type;
@@ -3114,7 +3214,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
        kvm_get_kvm(kvm);
        ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC);
        if (ret < 0) {
-               kvm_put_kvm(kvm);
+               kvm_put_kvm_no_destroy(kvm);
                mutex_lock(&kvm->lock);
                list_del(&dev->vm_node);
                mutex_unlock(&kvm->lock);
@@ -3931,8 +4031,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
                return -ENOENT;
 
        if (simple_attr_open(inode, file, get,
-                            stat_data->mode & S_IWUGO ? set : NULL,
-                            fmt)) {
+                   KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
+                   ? set : NULL,
+                   fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }
@@ -3951,105 +4052,111 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static int vm_stat_get_per_vm(void *data, u64 *val)
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
 {
-       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       *val = *(ulong *)((void *)kvm + offset);
 
-       *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
+       return 0;
+}
+
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
+{
+       *(ulong *)((void *)kvm + offset) = 0;
 
        return 0;
 }
 
-static int vm_stat_clear_per_vm(void *data, u64 val)
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
 {
-       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       int i;
+       struct kvm_vcpu *vcpu;
 
-       if (val)
-               return -EINVAL;
+       *val = 0;
 
-       *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               *val += *(u64 *)((void *)vcpu + offset);
 
        return 0;
 }
 
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
 {
-       __simple_attr_check_format("%llu\n", 0ull);
-       return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
-                               vm_stat_clear_per_vm, "%llu\n");
-}
+       int i;
+       struct kvm_vcpu *vcpu;
 
-static const struct file_operations vm_stat_get_per_vm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = vm_stat_get_per_vm_open,
-       .release = kvm_debugfs_release,
-       .read    = simple_attr_read,
-       .write   = simple_attr_write,
-       .llseek  = no_llseek,
-};
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               *(u64 *)((void *)vcpu + offset) = 0;
+
+       return 0;
+}
 
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
+static int kvm_stat_data_get(void *data, u64 *val)
 {
-       int i;
+       int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-       struct kvm_vcpu *vcpu;
-
-       *val = 0;
 
-       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *val += *(u64 *)((void *)vcpu + stat_data->offset);
+       switch (stat_data->dbgfs_item->kind) {
+       case KVM_STAT_VM:
+               r = kvm_get_stat_per_vm(stat_data->kvm,
+                                       stat_data->dbgfs_item->offset, val);
+               break;
+       case KVM_STAT_VCPU:
+               r = kvm_get_stat_per_vcpu(stat_data->kvm,
+                                         stat_data->dbgfs_item->offset, val);
+               break;
+       }
 
-       return 0;
+       return r;
 }
 
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
+static int kvm_stat_data_clear(void *data, u64 val)
 {
-       int i;
+       int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-       struct kvm_vcpu *vcpu;
 
        if (val)
                return -EINVAL;
 
-       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *(u64 *)((void *)vcpu + stat_data->offset) = 0;
+       switch (stat_data->dbgfs_item->kind) {
+       case KVM_STAT_VM:
+               r = kvm_clear_stat_per_vm(stat_data->kvm,
+                                         stat_data->dbgfs_item->offset);
+               break;
+       case KVM_STAT_VCPU:
+               r = kvm_clear_stat_per_vcpu(stat_data->kvm,
+                                           stat_data->dbgfs_item->offset);
+               break;
+       }
 
-       return 0;
+       return r;
 }
 
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
 {
        __simple_attr_check_format("%llu\n", 0ull);
-       return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
-                                vcpu_stat_clear_per_vm, "%llu\n");
+       return kvm_debugfs_open(inode, file, kvm_stat_data_get,
+                               kvm_stat_data_clear, "%llu\n");
 }
 
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = vcpu_stat_get_per_vm_open,
+static const struct file_operations stat_fops_per_vm = {
+       .owner = THIS_MODULE,
+       .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
-       .read    = simple_attr_read,
-       .write   = simple_attr_write,
-       .llseek  = no_llseek,
-};
-
-static const struct file_operations *stat_fops_per_vm[] = {
-       [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
-       [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+       .read = simple_attr_read,
+       .write = simple_attr_write,
+       .llseek = no_llseek,
 };
 
 static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
        u64 tmp_val;
 
        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               kvm_get_stat_per_vm(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
@@ -4060,15 +4167,13 @@ static int vm_stat_clear(void *_offset, u64 val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
 
        if (val)
                return -EINVAL;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vm_stat_clear_per_vm((void *)&stat_tmp, 0);
+               kvm_clear_stat_per_vm(kvm, offset);
        }
        mutex_unlock(&kvm_lock);
 
@@ -4081,14 +4186,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
        u64 tmp_val;
 
        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
@@ -4099,15 +4202,13 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
 
        if (val)
                return -EINVAL;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
+               kvm_clear_stat_per_vcpu(kvm, offset);
        }
        mutex_unlock(&kvm_lock);
 
@@ -4180,9 +4281,8 @@ static void kvm_init_debug(void)
 
        kvm_debugfs_num_entries = 0;
        for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-               int mode = p->mode ? p->mode : 0644;
-               debugfs_create_file(p->name, mode, kvm_debugfs_dir,
-                                   (void *)(long)p->offset,
+               debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                   kvm_debugfs_dir, (void *)(long)p->offset,
                                    stat_fops[p->kind]);
        }
 }
@@ -4272,12 +4372,12 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
        r = kvm_arch_hardware_setup();
        if (r < 0)
-               goto out_free_0a;
+               goto out_free_1;
 
        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu, check_processor_compat, &r, 1);
                if (r < 0)
-                       goto out_free_1;
+                       goto out_free_2;
        }
 
        r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
@@ -4334,9 +4434,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
        unregister_reboot_notifier(&kvm_reboot_notifier);
        cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
 out_free_2:
-out_free_1:
        kvm_arch_hardware_unsetup();
-out_free_0a:
+out_free_1:
        free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
        kvm_irqfd_exit();
@@ -4364,3 +4463,86 @@ void kvm_exit(void)
        kvm_vfio_ops_exit();
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
+
+struct kvm_vm_worker_thread_context {
+       struct kvm *kvm;
+       struct task_struct *parent;
+       struct completion init_done;
+       kvm_vm_thread_fn_t thread_fn;
+       uintptr_t data;
+       int err;
+};
+
+static int kvm_vm_worker_thread(void *context)
+{
+       /*
+        * The init_context is allocated on the stack of the parent thread, so
+        * we have to locally copy anything that is needed beyond initialization
+        */
+       struct kvm_vm_worker_thread_context *init_context = context;
+       struct kvm *kvm = init_context->kvm;
+       kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
+       uintptr_t data = init_context->data;
+       int err;
+
+       err = kthread_park(current);
+       /* kthread_park(current) is never supposed to return an error */
+       WARN_ON(err != 0);
+       if (err)
+               goto init_complete;
+
+       err = cgroup_attach_task_all(init_context->parent, current);
+       if (err) {
+               kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
+                       __func__, err);
+               goto init_complete;
+       }
+
+       set_user_nice(current, task_nice(init_context->parent));
+
+init_complete:
+       init_context->err = err;
+       complete(&init_context->init_done);
+       init_context = NULL;
+
+       if (err)
+               return err;
+
+       /* Wait to be woken up by the spawner before proceeding. */
+       kthread_parkme();
+
+       if (!kthread_should_stop())
+               err = thread_fn(kvm, data);
+
+       return err;
+}
+
+int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
+                               uintptr_t data, const char *name,
+                               struct task_struct **thread_ptr)
+{
+       struct kvm_vm_worker_thread_context init_context = {};
+       struct task_struct *thread;
+
+       *thread_ptr = NULL;
+       init_context.kvm = kvm;
+       init_context.parent = current;
+       init_context.thread_fn = thread_fn;
+       init_context.data = data;
+       init_completion(&init_context.init_done);
+
+       thread = kthread_run(kvm_vm_worker_thread, &init_context,
+                            "%s-%d", name, task_pid_nr(current));
+       if (IS_ERR(thread))
+               return PTR_ERR(thread);
+
+       /* kthread_run is never supposed to return NULL */
+       WARN_ON(thread == NULL);
+
+       wait_for_completion(&init_context.init_done);
+
+       if (!init_context.err)
+               *thread_ptr = thread;
+
+       return init_context.err;
+}
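
A hedged sketch of how an architecture might consume this helper from the
kvm_arch_post_init_vm() hook added earlier in this patch.  example_worker_fn
and the "kvm-example-worker" name are illustrative; a real user would also
stash the task pointer (e.g. in its kvm->arch) so kvm_arch_pre_destroy_vm()
can kthread_stop() the worker:

static int example_worker_fn(struct kvm *kvm, uintptr_t data)
{
	/* Runs only after the spawner unparks us. */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* periodic VM work here */

	return 0;
}

int kvm_arch_post_init_vm(struct kvm *kvm)
{
	struct task_struct *thread;
	int err;

	err = kvm_vm_create_worker_thread(kvm, example_worker_fn, 0,
					  "kvm-example-worker", &thread);
	if (err)
		return err;

	/* The worker parked itself after init; let it run now. */
	kthread_unpark(thread);
	return 0;
}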