diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 00268290dcbd85a75ec2f17cc2caa5431c03208b..7e63a3236364087def614afc0bee92c5cbc36779 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -104,16 +104,16 @@ static cpumask_var_t cpus_hardware_enabled;
 static int kvm_usage_count;
 static atomic_t hardware_enable_failed;
 
-struct kmem_cache *kvm_vcpu_cache;
-EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+static struct kmem_cache *kvm_vcpu_cache;
 
 static __read_mostly struct preempt_ops kvm_preempt_ops;
+static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
 
 struct dentry *kvm_debugfs_dir;
 EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
 
 static int kvm_debugfs_num_entries;
-static const struct file_operations *stat_fops_per_vm[];
+static const struct file_operations stat_fops_per_vm;
 
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
                           unsigned long arg);
@@ -191,12 +191,24 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
        return true;
 }
 
+bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
+{
+       struct page *page = pfn_to_page(pfn);
+
+       if (!PageTransCompoundMap(page))
+               return false;
+
+       return is_transparent_hugepage(compound_head(page));
+}
+
 /*
  * Switches to specified vcpu, until a matching vcpu_put()
  */
 void vcpu_load(struct kvm_vcpu *vcpu)
 {
        int cpu = get_cpu();
+
+       __this_cpu_write(kvm_running_vcpu, vcpu);
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);
        put_cpu();
@@ -208,6 +220,7 @@ void vcpu_put(struct kvm_vcpu *vcpu)
        preempt_disable();
        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
+       __this_cpu_write(kvm_running_vcpu, NULL);
        preempt_enable();
 }
 EXPORT_SYMBOL_GPL(vcpu_put);
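
The per-CPU kvm_running_vcpu pointer added above is published in vcpu_load() and cleared in vcpu_put(), so any code running between the two can discover the current vcpu. A minimal caller sketch (the function name is hypothetical; only vcpu_load()/vcpu_put() are from this file):

static long example_vcpu_op(struct kvm_vcpu *vcpu)
{
	long r;

	vcpu_load(vcpu);	/* registers the preempt notifier, loads arch state,
				 * and publishes vcpu in this CPU's kvm_running_vcpu */
	r = 0;			/* ... operate on vcpu state here ... */
	vcpu_put(vcpu);		/* unloads arch state and clears kvm_running_vcpu */
	return r;
}
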
@@ -322,11 +335,8 @@ void kvm_reload_remote_mmus(struct kvm *kvm)
        kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
-int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
+static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
-       struct page *page;
-       int r;
-
        mutex_init(&vcpu->mutex);
        vcpu->cpu = -1;
        vcpu->kvm = kvm;
@@ -338,42 +348,28 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
        vcpu->pre_pcpu = -1;
        INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
 
-       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
-       if (!page) {
-               r = -ENOMEM;
-               goto fail;
-       }
-       vcpu->run = page_address(page);
-
        kvm_vcpu_set_in_spin_loop(vcpu, false);
        kvm_vcpu_set_dy_eligible(vcpu, false);
        vcpu->preempted = false;
        vcpu->ready = false;
-
-       r = kvm_arch_vcpu_init(vcpu);
-       if (r < 0)
-               goto fail_free_run;
-       return 0;
-
-fail_free_run:
-       free_page((unsigned long)vcpu->run);
-fail:
-       return r;
+       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_init);
 
-void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
+void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
+       kvm_arch_vcpu_destroy(vcpu);
+
        /*
-        * no need for rcu_read_lock as VCPU_RUN is the only place that
-        * will change the vcpu->pid pointer and on uninit all file
-        * descriptors are already gone.
+        * No need for rcu_read_lock as VCPU_RUN is the only place that changes
+        * the vcpu->pid pointer, and at destruction time all file descriptors
+        * are already gone.
         */
        put_pid(rcu_dereference_protected(vcpu->pid, 1));
-       kvm_arch_vcpu_uninit(vcpu);
+
        free_page((unsigned long)vcpu->run);
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
-EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
+EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);
 
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
@@ -650,11 +646,11 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
                        return -ENOMEM;
 
                stat_data->kvm = kvm;
-               stat_data->offset = p->offset;
-               stat_data->mode = p->mode ? p->mode : 0644;
+               stat_data->dbgfs_item = p;
                kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
-               debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry,
-                                   stat_data, stat_fops_per_vm[p->kind]);
+               debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                   kvm->debugfs_dentry, stat_data,
+                                   &stat_fops_per_vm);
        }
        return 0;
 }
@@ -964,7 +960,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 
        /*
         * Increment the new memslot generation a second time, dropping the
-        * update in-progress flag and incrementing then generation based on
+        * update in-progress flag and incrementing the generation based on
         * the number of address spaces.  This provides a unique and easily
         * identifiable generation number while the memslots are in flux.
         */
@@ -1117,7 +1113,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
                 *
                 * validation of sp->gfn happens in:
                 *      - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
-                *      - kvm_is_visible_gfn (mmu_check_roots)
+                *      - kvm_is_visible_gfn (mmu_check_root)
                 */
                kvm_arch_flush_shadow_memslot(kvm, slot);
 
@@ -1406,14 +1402,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
 
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
        struct vm_area_struct *vma;
        unsigned long addr, size;
 
        size = PAGE_SIZE;
 
-       addr = gfn_to_hva(kvm, gfn);
+       addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return PAGE_SIZE;
 
@@ -1519,7 +1515,7 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 /*
  * The fast path to get the writable pfn which will be stored in @pfn,
  * true indicates success, otherwise false is returned.  It's also the
- * only part that runs if we can are in atomic context.
+ * only part that runs if we can in atomic context.
  */
 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
                            bool *writable, kvm_pfn_t *pfn)
@@ -1821,26 +1817,72 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(gfn_to_page);
 
-static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
-                        struct kvm_host_map *map)
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
+{
+       if (pfn == 0)
+               return;
+
+       if (cache)
+               cache->pfn = cache->gfn = 0;
+
+       if (dirty)
+               kvm_release_pfn_dirty(pfn);
+       else
+               kvm_release_pfn_clean(pfn);
+}
+
+static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
+                                struct gfn_to_pfn_cache *cache, u64 gen)
+{
+       kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+       cache->pfn = gfn_to_pfn_memslot(slot, gfn);
+       cache->gfn = gfn;
+       cache->dirty = false;
+       cache->generation = gen;
+}
+
+static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+                        struct kvm_host_map *map,
+                        struct gfn_to_pfn_cache *cache,
+                        bool atomic)
 {
        kvm_pfn_t pfn;
        void *hva = NULL;
        struct page *page = KVM_UNMAPPED_PAGE;
+       struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
+       u64 gen = slots->generation;
 
        if (!map)
                return -EINVAL;
 
-       pfn = gfn_to_pfn_memslot(slot, gfn);
+       if (cache) {
+               if (!cache->pfn || cache->gfn != gfn ||
+                       cache->generation != gen) {
+                       if (atomic)
+                               return -EAGAIN;
+                       kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
+               }
+               pfn = cache->pfn;
+       } else {
+               if (atomic)
+                       return -EAGAIN;
+               pfn = gfn_to_pfn_memslot(slot, gfn);
+       }
        if (is_error_noslot_pfn(pfn))
                return -EINVAL;
 
        if (pfn_valid(pfn)) {
                page = pfn_to_page(pfn);
-               hva = kmap(page);
+               if (atomic)
+                       hva = kmap_atomic(page);
+               else
+                       hva = kmap(page);
 #ifdef CONFIG_HAS_IOMEM
-       } else {
+       } else if (!atomic) {
                hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+       } else {
+               return -EINVAL;
 #endif
        }
 
@@ -1855,14 +1897,25 @@ static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
        return 0;
 }
 
+int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+               struct gfn_to_pfn_cache *cache, bool atomic)
+{
+       return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
+                       cache, atomic);
+}
+EXPORT_SYMBOL_GPL(kvm_map_gfn);
+
 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
 {
-       return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+       return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
+               NULL, false);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_map);
 
-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
-                   bool dirty)
+static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
+                       struct kvm_host_map *map,
+                       struct gfn_to_pfn_cache *cache,
+                       bool dirty, bool atomic)
 {
        if (!map)
                return;
@@ -1870,23 +1923,45 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
        if (!map->hva)
                return;
 
-       if (map->page != KVM_UNMAPPED_PAGE)
-               kunmap(map->page);
+       if (map->page != KVM_UNMAPPED_PAGE) {
+               if (atomic)
+                       kunmap_atomic(map->hva);
+               else
+                       kunmap(map->page);
+       }
 #ifdef CONFIG_HAS_IOMEM
-       else
+       else if (!atomic)
                memunmap(map->hva);
+       else
+               WARN_ONCE(1, "Unexpected unmapping in atomic context");
 #endif
 
-       if (dirty) {
-               kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
-               kvm_release_pfn_dirty(map->pfn);
-       } else {
-               kvm_release_pfn_clean(map->pfn);
-       }
+       if (dirty)
+               mark_page_dirty_in_slot(memslot, map->gfn);
+
+       if (cache)
+               cache->dirty |= dirty;
+       else
+               kvm_release_pfn(map->pfn, dirty, NULL);
 
        map->hva = NULL;
        map->page = NULL;
 }
+
+int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+                 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
+{
+       __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
+                       cache, dirty, atomic);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
+
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+{
+       __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
+                       dirty, false);
+}
 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
 
 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
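
The hunks above add an optional gfn_to_pfn_cache to the host-map API: when a cache is passed, the pfn lookup is reused across calls as long as the gfn and memslot generation still match, and the pfn is released only when the cache entry is replaced or explicitly dropped. A hedged usage sketch (helper name and calling context are hypothetical; the kvm_map_gfn()/kvm_unmap_gfn() signatures are the ones introduced here):

static int example_poke_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				   struct gfn_to_pfn_cache *cache, u8 val)
{
	struct kvm_host_map map;

	/* Non-atomic map; with a cache, the pfn is looked up only when the
	 * cached gfn or memslot generation no longer matches. */
	if (kvm_map_gfn(vcpu, gfn, &map, cache, false))
		return -EFAULT;

	*(u8 *)map.hva = val;

	/* dirty=true marks the page dirty in its memslot; the pfn release is
	 * deferred to the cache rather than done on every unmap. */
	return kvm_unmap_gfn(vcpu, &map, cache, true, false);
}
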
@@ -1931,11 +2006,8 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
 
 void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 {
-       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) {
-               struct page *page = pfn_to_page(pfn);
-
-               SetPageDirty(page);
-       }
+       if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
+               SetPageDirty(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
 
@@ -2051,17 +2123,6 @@ static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
        return 0;
 }
 
-int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
-                         unsigned long len)
-{
-       gfn_t gfn = gpa >> PAGE_SHIFT;
-       struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
-       int offset = offset_in_page(gpa);
-
-       return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
-}
-EXPORT_SYMBOL_GPL(kvm_read_guest_atomic);
-
 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
                               void *data, unsigned long len)
 {
@@ -2158,33 +2219,36 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
        gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT;
        gfn_t nr_pages_needed = end_gfn - start_gfn + 1;
        gfn_t nr_pages_avail;
-       int r = start_gfn <= end_gfn ? 0 : -EINVAL;
 
-       ghc->gpa = gpa;
+       /* Update ghc->generation before performing any error checks. */
        ghc->generation = slots->generation;
-       ghc->len = len;
-       ghc->hva = KVM_HVA_ERR_BAD;
+
+       if (start_gfn > end_gfn) {
+               ghc->hva = KVM_HVA_ERR_BAD;
+               return -EINVAL;
+       }
 
        /*
         * If the requested region crosses two memslots, we still
         * verify that the entire region is valid here.
         */
-       while (!r && start_gfn <= end_gfn) {
+       for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
                ghc->memslot = __gfn_to_memslot(slots, start_gfn);
                ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn,
                                           &nr_pages_avail);
                if (kvm_is_error_hva(ghc->hva))
-                       r = -EFAULT;
-               start_gfn += nr_pages_avail;
+                       return -EFAULT;
        }
 
        /* Use the slow path for cross page reads and writes. */
-       if (!r && nr_pages_needed == 1)
+       if (nr_pages_needed == 1)
                ghc->hva += offset;
        else
                ghc->memslot = NULL;
 
-       return r;
+       ghc->gpa = gpa;
+       ghc->len = len;
+       return 0;
 }
 
 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
@@ -2205,15 +2269,17 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
        BUG_ON(len + offset > ghc->len);
 
-       if (slots->generation != ghc->generation)
-               __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
-
-       if (unlikely(!ghc->memslot))
-               return kvm_write_guest(kvm, gpa, data, len);
+       if (slots->generation != ghc->generation) {
+               if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
+                       return -EFAULT;
+       }
 
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
 
+       if (unlikely(!ghc->memslot))
+               return kvm_write_guest(kvm, gpa, data, len);
+
        r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
        if (r)
                return -EFAULT;
@@ -2238,15 +2304,17 @@ int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
 
        BUG_ON(len > ghc->len);
 
-       if (slots->generation != ghc->generation)
-               __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
-
-       if (unlikely(!ghc->memslot))
-               return kvm_read_guest(kvm, ghc->gpa, data, len);
+       if (slots->generation != ghc->generation) {
+               if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
+                       return -EFAULT;
+       }
 
        if (kvm_is_error_hva(ghc->hva))
                return -EFAULT;
 
+       if (unlikely(!ghc->memslot))
+               return kvm_read_guest(kvm, ghc->gpa, data, len);
+
        r = __copy_from_user(data, (void __user *)ghc->hva, len);
        if (r)
                return -EFAULT;
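
With the reordering above, a stale gfn_to_hva_cache is re-initialized before any use, and an invalid hva now fails the access instead of silently continuing. Typical use of the cache looks roughly like this (sketch; the function name is hypothetical, and kvm_write_guest_cached() is the existing offset-0 convenience wrapper around kvm_write_guest_offset_cached()):

static int example_publish_u64(struct kvm *kvm, gpa_t gpa, u64 value)
{
	struct gfn_to_hva_cache ghc;

	/* One-time init; re-validation after memslot changes happens inside
	 * the cached accessors via ghc->generation. */
	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(value)))
		return -EFAULT;

	return kvm_write_guest_cached(kvm, &ghc, &value, sizeof(value));
}
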
@@ -2718,6 +2786,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 {
        int r;
        struct kvm_vcpu *vcpu;
+       struct page *page;
 
        if (id >= KVM_MAX_VCPU_ID)
                return -EINVAL;
@@ -2731,17 +2800,29 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
        kvm->created_vcpus++;
        mutex_unlock(&kvm->lock);
 
-       vcpu = kvm_arch_vcpu_create(kvm, id);
-       if (IS_ERR(vcpu)) {
-               r = PTR_ERR(vcpu);
+       r = kvm_arch_vcpu_precreate(kvm, id);
+       if (r)
+               goto vcpu_decrement;
+
+       vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+       if (!vcpu) {
+               r = -ENOMEM;
                goto vcpu_decrement;
        }
 
-       preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
+       BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
+       page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!page) {
+               r = -ENOMEM;
+               goto vcpu_free;
+       }
+       vcpu->run = page_address(page);
+
+       kvm_vcpu_init(vcpu, kvm, id);
 
-       r = kvm_arch_vcpu_setup(vcpu);
+       r = kvm_arch_vcpu_create(vcpu);
        if (r)
-               goto vcpu_destroy;
+               goto vcpu_free_run_page;
 
        kvm_create_vcpu_debugfs(vcpu);
 
@@ -2778,8 +2859,11 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
 unlock_vcpu_destroy:
        mutex_unlock(&kvm->lock);
        debugfs_remove_recursive(vcpu->debugfs_dentry);
-vcpu_destroy:
        kvm_arch_vcpu_destroy(vcpu);
+vcpu_free_run_page:
+       free_page((unsigned long)vcpu->run);
+vcpu_free:
+       kmem_cache_free(kvm_vcpu_cache, vcpu);
 vcpu_decrement:
        mutex_lock(&kvm->lock);
        kvm->created_vcpus--;
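
Allocation of the vcpu structure and its kvm_run page now lives in common code: kvm_arch_vcpu_precreate() runs before allocation, and kvm_arch_vcpu_create() is handed an already-allocated, partially initialized vcpu. Roughly what an architecture hook looks like under the new flow (a sketch, not any particular arch's implementation):

int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
	/* vcpu and vcpu->run are already allocated and kvm_vcpu_init() has
	 * run; only arch-specific state is set up here. */
	/* ... */
	return 0;	/* or a negative errno, unwinding to vcpu_free_run_page */
}
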
@@ -4013,8 +4097,9 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file,
                return -ENOENT;
 
        if (simple_attr_open(inode, file, get,
-                            stat_data->mode & S_IWUGO ? set : NULL,
-                            fmt)) {
+                   KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
+                   ? set : NULL,
+                   fmt)) {
                kvm_put_kvm(stat_data->kvm);
                return -ENOMEM;
        }
@@ -4033,105 +4118,111 @@ static int kvm_debugfs_release(struct inode *inode, struct file *file)
        return 0;
 }
 
-static int vm_stat_get_per_vm(void *data, u64 *val)
+static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
 {
-       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       *val = *(ulong *)((void *)kvm + offset);
+
+       return 0;
+}
 
-       *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset);
+static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
+{
+       *(ulong *)((void *)kvm + offset) = 0;
 
        return 0;
 }
 
-static int vm_stat_clear_per_vm(void *data, u64 val)
+static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
 {
-       struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
+       int i;
+       struct kvm_vcpu *vcpu;
 
-       if (val)
-               return -EINVAL;
+       *val = 0;
 
-       *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0;
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               *val += *(u64 *)((void *)vcpu + offset);
 
        return 0;
 }
 
-static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
 {
-       __simple_attr_check_format("%llu\n", 0ull);
-       return kvm_debugfs_open(inode, file, vm_stat_get_per_vm,
-                               vm_stat_clear_per_vm, "%llu\n");
-}
+       int i;
+       struct kvm_vcpu *vcpu;
 
-static const struct file_operations vm_stat_get_per_vm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = vm_stat_get_per_vm_open,
-       .release = kvm_debugfs_release,
-       .read    = simple_attr_read,
-       .write   = simple_attr_write,
-       .llseek  = no_llseek,
-};
+       kvm_for_each_vcpu(i, vcpu, kvm)
+               *(u64 *)((void *)vcpu + offset) = 0;
+
+       return 0;
+}
 
-static int vcpu_stat_get_per_vm(void *data, u64 *val)
+static int kvm_stat_data_get(void *data, u64 *val)
 {
-       int i;
+       int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-       struct kvm_vcpu *vcpu;
-
-       *val = 0;
 
-       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *val += *(u64 *)((void *)vcpu + stat_data->offset);
+       switch (stat_data->dbgfs_item->kind) {
+       case KVM_STAT_VM:
+               r = kvm_get_stat_per_vm(stat_data->kvm,
+                                       stat_data->dbgfs_item->offset, val);
+               break;
+       case KVM_STAT_VCPU:
+               r = kvm_get_stat_per_vcpu(stat_data->kvm,
+                                         stat_data->dbgfs_item->offset, val);
+               break;
+       }
 
-       return 0;
+       return r;
 }
 
-static int vcpu_stat_clear_per_vm(void *data, u64 val)
+static int kvm_stat_data_clear(void *data, u64 val)
 {
-       int i;
+       int r = -EFAULT;
        struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;
-       struct kvm_vcpu *vcpu;
 
        if (val)
                return -EINVAL;
 
-       kvm_for_each_vcpu(i, vcpu, stat_data->kvm)
-               *(u64 *)((void *)vcpu + stat_data->offset) = 0;
+       switch (stat_data->dbgfs_item->kind) {
+       case KVM_STAT_VM:
+               r = kvm_clear_stat_per_vm(stat_data->kvm,
+                                         stat_data->dbgfs_item->offset);
+               break;
+       case KVM_STAT_VCPU:
+               r = kvm_clear_stat_per_vcpu(stat_data->kvm,
+                                           stat_data->dbgfs_item->offset);
+               break;
+       }
 
-       return 0;
+       return r;
 }
 
-static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file)
+static int kvm_stat_data_open(struct inode *inode, struct file *file)
 {
        __simple_attr_check_format("%llu\n", 0ull);
-       return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm,
-                                vcpu_stat_clear_per_vm, "%llu\n");
+       return kvm_debugfs_open(inode, file, kvm_stat_data_get,
+                               kvm_stat_data_clear, "%llu\n");
 }
 
-static const struct file_operations vcpu_stat_get_per_vm_fops = {
-       .owner   = THIS_MODULE,
-       .open    = vcpu_stat_get_per_vm_open,
+static const struct file_operations stat_fops_per_vm = {
+       .owner = THIS_MODULE,
+       .open = kvm_stat_data_open,
        .release = kvm_debugfs_release,
-       .read    = simple_attr_read,
-       .write   = simple_attr_write,
-       .llseek  = no_llseek,
-};
-
-static const struct file_operations *stat_fops_per_vm[] = {
-       [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops,
-       [KVM_STAT_VM]   = &vm_stat_get_per_vm_fops,
+       .read = simple_attr_read,
+       .write = simple_attr_write,
+       .llseek = no_llseek,
 };
 
 static int vm_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
        u64 tmp_val;
 
        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               kvm_get_stat_per_vm(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
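
The per-VM stat file_operations collapse into a single stat_fops_per_vm whose get/clear callbacks dispatch on stat_data->dbgfs_item->kind, and the file mode computation is centralized in KVM_DBGFS_GET_MODE(). Judging from the open-coded logic it replaces (p->mode ? p->mode : 0644), the macro is presumably defined in the shared kvm header along these lines (sketch, not taken from this diff):

#define KVM_DBGFS_GET_MODE(dbgfs_item) \
	((dbgfs_item)->mode ? (dbgfs_item)->mode : 0644)
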
@@ -4142,15 +4233,13 @@ static int vm_stat_clear(void *_offset, u64 val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
 
        if (val)
                return -EINVAL;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vm_stat_clear_per_vm((void *)&stat_tmp, 0);
+               kvm_clear_stat_per_vm(kvm, offset);
        }
        mutex_unlock(&kvm_lock);
 
@@ -4163,14 +4252,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
        u64 tmp_val;
 
        *val = 0;
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val);
+               kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
                *val += tmp_val;
        }
        mutex_unlock(&kvm_lock);
@@ -4181,15 +4268,13 @@ static int vcpu_stat_clear(void *_offset, u64 val)
 {
        unsigned offset = (long)_offset;
        struct kvm *kvm;
-       struct kvm_stat_data stat_tmp = {.offset = offset};
 
        if (val)
                return -EINVAL;
 
        mutex_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list) {
-               stat_tmp.kvm = kvm;
-               vcpu_stat_clear_per_vm((void *)&stat_tmp, 0);
+               kvm_clear_stat_per_vcpu(kvm, offset);
        }
        mutex_unlock(&kvm_lock);
 
@@ -4262,9 +4347,8 @@ static void kvm_init_debug(void)
 
        kvm_debugfs_num_entries = 0;
        for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
-               int mode = p->mode ? p->mode : 0644;
-               debugfs_create_file(p->name, mode, kvm_debugfs_dir,
-                                   (void *)(long)p->offset,
+               debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
+                                   kvm_debugfs_dir, (void *)(long)p->offset,
                                    stat_fops[p->kind]);
        }
 }
@@ -4304,8 +4388,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
        WRITE_ONCE(vcpu->preempted, false);
        WRITE_ONCE(vcpu->ready, false);
 
+       __this_cpu_write(kvm_running_vcpu, vcpu);
        kvm_arch_sched_in(vcpu, cpu);
-
        kvm_arch_vcpu_load(vcpu, cpu);
 }
 
@@ -4319,6 +4403,25 @@ static void kvm_sched_out(struct preempt_notifier *pn,
                WRITE_ONCE(vcpu->ready, true);
        }
        kvm_arch_vcpu_put(vcpu);
+       __this_cpu_write(kvm_running_vcpu, NULL);
+}
+
+/**
+ * kvm_get_running_vcpu - get the vcpu running on the current CPU.
+ * Thanks to preempt notifiers, this can also be called from
+ * preemptible context.
+ */
+struct kvm_vcpu *kvm_get_running_vcpu(void)
+{
+        return __this_cpu_read(kvm_running_vcpu);
+}
+
+/**
+ * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
+ */
+struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
+{
+        return &kvm_running_vcpu;
 }
 
 static void check_processor_compat(void *rtn)
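
kvm_get_running_vcpu() gives arch code a way to find the vcpu whose state is currently loaded on this CPU, e.g. from an interrupt-injection fast path. A trivial hedged example (the caller is hypothetical):

static bool example_is_current_vcpu(struct kvm_vcpu *vcpu)
{
	/* Valid between vcpu_load()/vcpu_put(); kept up to date across
	 * preemption by the sched_in/sched_out notifiers above. */
	return kvm_get_running_vcpu() == vcpu;
}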