From fc25376e9cd72c71a30c4343b02af122938d8532 Mon Sep 17 00:00:00 2001
From: Adttil <2429917001@qq.com>
Date: Sat, 23 Aug 2025 17:23:52 +0800
Subject: [PATCH] KVM: Only enable shadow mode for ioeventfds and align kvm_shadow lifetime with the VM

Allocate the kvm_shadow structure in kvm_create_vm() and free it in
kvm_destroy_vm(), instead of creating and destroying it around each
ioeventfd batch. Track an open batch with a new in_shadow flag and
route only ioeventfd registrations through the shadow buses while the
flag is set; all other devices keep using the real buses. Since the
shadow now lives for the whole lifetime of the VM, walk the global
kvm_shadow_list under RCU and free the shadow only after a grace
period.
---
 include/linux/kvm_host.h | 27 ++++++++-----
 virt/kvm/eventfd.c       | 56 ++++++++++++---------
 virt/kvm/kvm_main.c      | 82 +++++++++++++++++++++++++++++++---------
 3 files changed, 107 insertions(+), 58 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index a5cebf76aaa5..4e9645af5f0c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -746,6 +746,7 @@ struct kvm_shadow {
 	struct list_head ioeventfds_shadow;
 	/* Used for temporarily storing modifications to the bus */
 	struct kvm_io_bus *buses_shadow[KVM_NR_BUSES];
+	bool in_shadow;
 };
 
 /* Global linked list, used to associate kvm with kvm_shadow */
@@ -967,18 +968,17 @@ static struct kvm_shadow *kvm_find_shadow(struct kvm *kvm)
 {
 	struct kvm_shadow *ks;
 
-	list_for_each_entry(ks, &kvm_shadow_list, list) {
-		if (ks->kvm == kvm)
+	rcu_read_lock();
+	list_for_each_entry_rcu(ks, &kvm_shadow_list, list) {
+		if (ks->kvm == kvm) {
+			rcu_read_unlock();
 			return ks;
+		}
 	}
+	rcu_read_unlock();
 	return NULL;
 }
 
-static inline bool is_kvm_in_shadow(struct kvm *kvm)
-{
-	return kvm_find_shadow(kvm) != NULL;
-}
-
 static inline struct kvm_io_bus *kvm_get_real_bus(struct kvm *kvm, enum kvm_bus idx)
 {
 	return srcu_dereference_check(kvm->buses[idx], &kvm->srcu,
@@ -986,10 +985,17 @@ static inline struct kvm_io_bus *kvm_get_real_bus(struct kvm *kvm, enum kvm_bus
 				      !refcount_read(&kvm->users_count));
 }
 
-static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx)
+/* ioeventfd devices may need to be looked up through the shadow buses */
+static inline struct kvm_io_bus *kvm_get_bus(struct kvm *kvm, enum kvm_bus idx, bool is_eventfd)
 {
-	return is_kvm_in_shadow(kvm) ?
-		kvm_find_shadow(kvm)->buses_shadow[idx] : kvm_get_real_bus(kvm, idx);
+	struct kvm_shadow *ks;
+
+	ks = kvm_find_shadow(kvm);
+	if (is_eventfd && ks && ks->in_shadow)
+		return ks->buses_shadow[idx];
+
+	return kvm_get_real_bus(kvm, idx);
 }
 
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
@@ -2162,7 +2168,8 @@ int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
 
 #ifdef CONFIG_HAVE_KVM_EVENTFD
 
-void kvm_release_ioeventfds_shadow(struct kvm *kvm);
+bool kvm_io_device_is_ioeventfd(struct kvm_io_device *dev);
+void kvm_release_ioeventfds_shadow(struct kvm_shadow *ks);
 void kvm_eventfd_init(struct kvm *kvm);
 int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args);
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 24f685ce8f92..c1c1061b25c1 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -163,6 +163,10 @@ irqfd_shutdown(struct work_struct *work)
 	kfree(irqfd);
 }
 
+bool kvm_io_device_is_ioeventfd(struct kvm_io_device *dev)
+{
+	return dev->ops == &ioeventfd_ops;
+}
+
 /* assumes kvm->irqfds.lock is held */
 static bool
@@ -902,7 +906,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 	 * record the eventfd and operation, and process it later
 	 */
 	ks = kvm_find_shadow(kvm);
-	if (ks) {
+	if (ks && ks->in_shadow) {
 		es = kzalloc(sizeof(*es), GFP_KERNEL_ACCOUNT);
 		if (!es) {
 			ret = -ENOMEM;
@@ -925,7 +929,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 		if (ret < 0)
 			goto unlock_fail;
 
-		kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
+		kvm_get_bus(kvm, bus_idx, true)->ioeventfd_count++;
 		list_add_tail(&p->list, &kvm->ioeventfds);
 	}
 
@@ -979,7 +983,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 		 * record the eventfd and operation, and process it later
 		 */
 		ks = kvm_find_shadow(kvm);
-		if (ks) {
+		if (ks && ks->in_shadow) {
 			es = kzalloc(sizeof(*es), GFP_KERNEL_ACCOUNT);
 			if (!es) {
 				ret = -ENOMEM;
@@ -993,7 +997,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 			ret = 0;
 		} else {
 			kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
-			bus = kvm_get_bus(kvm, bus_idx);
+			bus = kvm_get_bus(kvm, bus_idx, true);
 			if (bus)
 				bus->ioeventfd_count--;
 			ret = 0;
@@ -1078,21 +1082,13 @@ static int kvm_ioeventfd_batch_begin(struct kvm *kvm)
 
 	mutex_lock(&kvm->slots_lock);
 
-	if (is_kvm_in_shadow(kvm)) {
+	ks = kvm_find_shadow(kvm);
+	if (!ks || ks->in_shadow) {
 		ret = -EBUSY;
 		goto out;
 	}
 
-	ks = kzalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
-	if (!ks) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	INIT_LIST_HEAD(&ks->list);
-	INIT_LIST_HEAD(&ks->ioeventfds_shadow);
-	ks->kvm = kvm;
-	list_add_tail(&ks->list, &kvm_shadow_list);
-
+	ks->in_shadow = true;
 out:
 	mutex_unlock(&kvm->slots_lock);
 	return ret;
@@ -1105,16 +1101,16 @@ static int kvm_ioeventfd_batch_begin(struct kvm *kvm)
  * the reference count of fd,
  * and the maintenance of each list.
  */
-void kvm_release_ioeventfds_shadow(struct kvm *kvm)
+void kvm_release_ioeventfds_shadow(struct kvm_shadow *ks)
 {
-	struct kvm_shadow *ks;
 	struct eventfd_shadow *es, *tmp;
 	struct _ioeventfd *p;
+	struct kvm *kvm;
 
-	ks = kvm_find_shadow(kvm);
-	if (!ks)
+	if (!ks || !ks->kvm)
 		return;
 
+	kvm = ks->kvm;
 	list_for_each_entry_safe(es, tmp, &ks->ioeventfds_shadow, node) {
 		p = es->eventfd;
 		list_del_init(&es->node);
@@ -1132,19 +1128,19 @@ void kvm_release_ioeventfds_shadow(struct kvm *kvm)
  * to the buses according to the operation, then discard the failed eventfd.
 * Continue processing the rest and ultimately return a failure.
 */
-static int kvm_handle_ioeventfds_shadow(struct kvm *kvm)
+static int kvm_handle_ioeventfds_shadow(struct kvm_shadow *ks)
 {
-	struct kvm_shadow *ks;
 	struct eventfd_shadow *es, *tmp;
 	struct _ioeventfd *p;
 	struct kvm_io_bus *bus;
+	struct kvm *kvm;
 	int result = 0;
 	int ret = 0;
 
-	ks = kvm_find_shadow(kvm);
-	if (!ks)
+	if (!ks || !ks->in_shadow || !ks->kvm)
 		return 0;
 
+	kvm = ks->kvm;
 	list_for_each_entry_safe(es, tmp, &ks->ioeventfds_shadow, node) {
 		list_del_init(&es->node);
 		p = es->eventfd;
@@ -1156,12 +1152,12 @@ static int kvm_handle_ioeventfds_shadow(struct kvm *kvm)
 				ioeventfd_release(p);
 				result = ret;
 			} else {
-				kvm_get_bus(kvm, p->bus_idx)->ioeventfd_count++;
+				kvm_get_bus(kvm, p->bus_idx, true)->ioeventfd_count++;
 				list_add_tail(&p->list, &kvm->ioeventfds);
 			}
 		} else {
 			kvm_io_bus_unregister_dev(kvm, p->bus_idx, &p->dev);
-			bus = kvm_get_bus(kvm, p->bus_idx);
+			bus = kvm_get_bus(kvm, p->bus_idx, true);
 			if (bus)
 				bus->ioeventfd_count--;
 			ioeventfd_release(p);
@@ -1191,7 +1187,7 @@ static int kvm_ioeventfd_batch_end(struct kvm *kvm)
 	mutex_lock(&kvm->slots_lock);
 
 	ks = kvm_find_shadow(kvm);
-	if (!ks) {
+	if (!ks || !ks->in_shadow) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -1209,7 +1205,7 @@ static int kvm_ioeventfd_batch_end(struct kvm *kvm)
 		ks->buses_shadow[i] = new_bus;
 	}
 
-	ret = kvm_handle_ioeventfds_shadow(kvm);
+	ret = kvm_handle_ioeventfds_shadow(ks);
 
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		old[i] = kvm_get_real_bus(kvm, i);
@@ -1226,16 +1222,14 @@ static int kvm_ioeventfd_batch_end(struct kvm *kvm)
 	goto out;
 
 fail:
-	kvm_release_ioeventfds_shadow(kvm);
+	kvm_release_ioeventfds_shadow(ks);
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		kfree(ks->buses_shadow[i]);
 		ks->buses_shadow[i] = NULL;
 	}
 out:
 	if (ks)
-		list_del_init(&ks->list);
-
-	kfree(ks);
+		ks->in_shadow = false;
 	mutex_unlock(&kvm->slots_lock);
 	return ret;
 }
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9e351bce483e..11d7c346a1f2 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1165,10 +1165,28 @@ int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
 	return 0;
 }
 
+static struct kvm_shadow *kvm_create_shadow(struct kvm *kvm)
+{
+	struct kvm_shadow *ks;
+
+	ks = kzalloc(sizeof(*ks), GFP_KERNEL_ACCOUNT);
+	if (!ks)
+		return NULL;
+
+	INIT_LIST_HEAD(&ks->list);
+	INIT_LIST_HEAD(&ks->ioeventfds_shadow);
+	ks->kvm = kvm;
+	ks->in_shadow = false;
+
+	return ks;
+}
+
 static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 {
 	struct kvm *kvm = kvm_arch_alloc_vm();
 	struct kvm_memslots *slots;
+	struct kvm_shadow *ks;
 	int r = -ENOMEM;
 	int i, j;
@@ -1261,12 +1279,19 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	if (r)
 		goto out_err_no_debugfs;
 
+	ks = kvm_create_shadow(kvm);
+	if (!ks) {
+		r = -ENOMEM;
+		goto out_shadow;
+	}
+
 	r = kvm_arch_post_init_vm(kvm);
 	if (r)
 		goto out_err;
 
 	mutex_lock(&kvm_lock);
 	list_add(&kvm->vm_list, &vm_list);
+	list_add_tail_rcu(&ks->list, &kvm_shadow_list);
 	mutex_unlock(&kvm_lock);
 
 	preempt_notifier_inc();
@@ -1275,6 +1300,8 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 	return kvm;
 
 out_err:
+	kfree(ks);
+out_shadow:
 	kvm_destroy_vm_debugfs(kvm);
 out_err_no_debugfs:
 	kvm_coalesced_mmio_free(kvm);
@@ -1290,7 +1317,7 @@ static struct kvm *kvm_create_vm(unsigned long type, const char *fdname)
 out_err_no_arch_destroy_vm:
 	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
 	for (i = 0; i < KVM_NR_BUSES; i++)
-		kfree(kvm_get_bus(kvm, i));
+		kfree(kvm_get_bus(kvm, i, false));
 	cleanup_srcu_struct(&kvm->irq_srcu);
 out_err_no_irq_srcu:
 	cleanup_srcu_struct(&kvm->srcu);
@@ -1329,26 +1356,35 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	mm->kvm = NULL;
 #endif
 	kvm_destroy_vm_debugfs(kvm);
+
+	/*
+	 * Release the temporarily stored eventfd information if the VM is
+	 * still in an eventfd batch. kvm_arch_sync_events() needs to access
+	 * the buses, so exit shadow mode before calling it.
+	 */
+	mutex_lock(&kvm->slots_lock);
+	ks = kvm_find_shadow(kvm);
+	if (ks && ks->in_shadow) {
+		kvm_release_ioeventfds_shadow(ks);
+		ks->in_shadow = false;
+	}
+	mutex_unlock(&kvm->slots_lock);
+
 	kvm_arch_sync_events(kvm);
 	mutex_lock(&kvm_lock);
+	list_del_rcu(&ks->list);
 	list_del(&kvm->vm_list);
 	mutex_unlock(&kvm_lock);
 	kvm_arch_pre_destroy_vm(kvm);
+	/* Wait for concurrent RCU walkers of kvm_shadow_list before freeing */
+	synchronize_rcu();
+	kfree(ks);
+
 	kvm_free_irq_routing(kvm);
-	/**
-	 * Release the temporarily stored eventfd information,
-	 * if it is currently in the eventfd batch process
-	 */
-	ks = kvm_find_shadow(kvm);
-	if (ks) {
-		kvm_release_ioeventfds_shadow(kvm);
-		list_del_init(&ks->list);
-		kfree(ks);
-	}
 	for (i = 0; i < KVM_NR_BUSES; i++) {
-		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);
+		struct kvm_io_bus *bus = kvm_get_bus(kvm, i, false);
 
 		if (bus)
 			kvm_io_bus_destroy(bus);
@@ -5432,7 +5468,7 @@ static void hardware_disable_all(void)
 
 static void kvm_iodevice_destructor(struct kvm_io_device *dev)
 {
-	if (dev->ops->destructor)
+	if (dev && dev->ops && dev->ops->destructor)
 		dev->ops->destructor(dev);
 }
 
@@ -5621,8 +5657,14 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	struct kvm_io_bus *new_bus, *bus;
 	struct kvm_io_range range;
 	struct kvm_shadow *ks;
+	bool is_ioeventfd;
+
+	if (!dev)
+		return -EINVAL;
+
+	is_ioeventfd = kvm_io_device_is_ioeventfd(dev);
 
-	bus = kvm_get_bus(kvm, bus_idx);
+	bus = kvm_get_bus(kvm, bus_idx, is_ioeventfd);
 	if (!bus)
 		return -ENOMEM;
@@ -5651,7 +5693,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	memcpy(new_bus->range + i + 1, bus->range + i,
 		(bus->dev_count - i) * sizeof(struct kvm_io_range));
 	ks = kvm_find_shadow(kvm);
-	if (ks) {
+	if (is_ioeventfd && ks && ks->in_shadow) {
 		ks->buses_shadow[bus_idx] = new_bus;
 	} else {
 		rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
@@ -5668,10 +5710,16 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	int i;
 	struct kvm_io_bus *new_bus, *bus;
 	struct kvm_shadow *ks;
+	bool is_ioeventfd;
+
+	if (!dev)
+		return -EINVAL;
+
+	is_ioeventfd = kvm_io_device_is_ioeventfd(dev);
 
 	lockdep_assert_held(&kvm->slots_lock);
 
-	bus = kvm_get_bus(kvm, bus_idx);
+	bus = kvm_get_bus(kvm, bus_idx, is_ioeventfd);
 	if (!bus)
 		return 0;
@@ -5694,7 +5742,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	}
 
 	ks = kvm_find_shadow(kvm);
-	if (ks) {
+	if (is_ioeventfd && ks && ks->in_shadow) {
 		ks->buses_shadow[bus_idx] = new_bus;
 		kfree(bus);
 		return 0;
--
Gitee
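
For reference, below is a minimal standalone sketch of the bus-selection rule that the new kvm_get_bus(kvm, idx, is_eventfd) helper implements: only ioeventfd lookups are redirected to the shadow buses, and only while a batch is open. It is plain userspace C with mock types; mock_vm, mock_bus and mock_shadow are placeholders invented for this illustration, not the kernel structures.

#include <stdbool.h>
#include <stdio.h>

#define NR_BUSES 4

struct mock_bus {
	const char *name;
};

struct mock_shadow {
	bool in_shadow;                        /* set between batch begin/end */
	struct mock_bus *buses_shadow[NR_BUSES];
};

struct mock_vm {
	struct mock_bus *buses[NR_BUSES];      /* the "real" buses */
	struct mock_shadow *shadow;            /* lives for the VM's lifetime */
};

/*
 * Mirror of the selection rule: only ioeventfd lookups see the shadow
 * bus, and only while a batch is open (in_shadow == true).
 */
static struct mock_bus *mock_get_bus(struct mock_vm *vm, int idx, bool is_eventfd)
{
	struct mock_shadow *ks = vm->shadow;

	if (is_eventfd && ks && ks->in_shadow)
		return ks->buses_shadow[idx];
	return vm->buses[idx];
}

int main(void)
{
	struct mock_bus real = { "real" }, shadowed = { "shadow" };
	struct mock_shadow ks = { .in_shadow = false, .buses_shadow = { &shadowed } };
	struct mock_vm vm = { .buses = { &real }, .shadow = &ks };

	printf("ioeventfd, no batch open:  %s\n", mock_get_bus(&vm, 0, true)->name);
	ks.in_shadow = true;   /* batch begin */
	printf("ioeventfd, batch open:     %s\n", mock_get_bus(&vm, 0, true)->name);
	printf("other device, batch open:  %s\n", mock_get_bus(&vm, 0, false)->name);
	ks.in_shadow = false;  /* batch end */
	return 0;
}

Built with any C99 compiler, this prints the real bus for ioeventfds outside a batch and for non-ioeventfd devices, and the shadow bus only while in_shadow is set, which is the behaviour the patch gives the kernel-side helper.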